Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/fcb5106c6e16599eeabadf7d82a465d52229698f

BEGIN_PUBLIC

Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/fcb5106c6e16599eeabadf7d82a465d52229698f

END_PUBLIC

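This pulls in the upstream Eigen change that marks trivial size and stride accessors
(rows(), cols(), innerStride(), outerStride(), startRow(), startCol(), and similar)
EIGEN_CONSTEXPR and EIGEN_NOEXCEPT, adds NaNPropagation-aware overloads of
minCoeff()/maxCoeff() taking index pointers in DenseBase, and strips trailing
whitespace and related cleanups in the touched headers.

A minimal sketch of the accessor pattern being applied, assuming EIGEN_CONSTEXPR
expands to constexpr (C++14 and later) and EIGEN_NOEXCEPT to noexcept (C++11 and
later); SizeLike is a hypothetical stand-in for an Eigen expression, not an Eigen type:

    // Sketch only, not part of the patch below.
    #include <cstddef>
    struct SizeLike {
      std::ptrdiff_t m_rows, m_cols;
      // Accessors become usable in constant expressions and promise not to throw.
      constexpr std::ptrdiff_t rows() const noexcept { return m_rows; }
      constexpr std::ptrdiff_t cols() const noexcept { return m_cols; }
    };
    static_assert(SizeLike{2, 3}.cols() == 3, "evaluated at compile time");
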
PiperOrigin-RevId: 368292210
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 30f1f52..1013ca0 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -246,8 +246,8 @@
       */
     const LDLT& adjoint() const { return *this; };
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 5876966..8c9b2b3 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -199,10 +199,10 @@
       * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
       * \code x = decomposition.adjoint().solve(b) \endcode
       */
-    const LLT& adjoint() const { return *this; };
+    const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; };
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     template<typename VectorType>
     LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 9a61665..20c789b 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -117,7 +117,7 @@
     {
       return Base::_set(other);
     }
-    
+
     /** Default constructor.
       *
       * For fixed-size matrices, does nothing.
@@ -177,24 +177,24 @@
       : Base(a0, a1, a2, a3, args...) {}
 
     /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
-      * 
+      *
       * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
-      * 
+      *
       * Example: \include Array_initializer_list_23_cxx11.cpp
       * Output: \verbinclude Array_initializer_list_23_cxx11.out
-      * 
+      *
       * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
-      * 
+      *
       * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
       * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
       * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
-      * 
+      *
       * Example: \include Array_initializer_list_vector_cxx11.cpp
       * Output: \verbinclude Array_initializer_list_vector_cxx11.out
-      * 
+      *
       * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
       * and implicit transposition is allowed for compile-time 1D arrays only.
-      * 
+      *
       * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
       */
     EIGEN_DEVICE_FUNC
@@ -241,7 +241,7 @@
     /** constructs an initialized 2D vector with given coefficients
       * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
     Array(const Scalar& val0, const Scalar& val1);
-    #endif  // end EIGEN_PARSED_BY_DOXYGEN 
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN
 
     /** constructs an initialized 3D vector with given coefficients
       * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
@@ -288,8 +288,10 @@
       : Base(other.derived())
     { }
 
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT{ return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
 
     #ifdef EIGEN_ARRAY_PLUGIN
     #include EIGEN_ARRAY_PLUGIN
@@ -322,7 +324,7 @@
   * template parameter, i.e.:
   *   - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
   *   - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
-  * 
+  *
   * \sa class Array
   */
 
@@ -367,7 +369,7 @@
 /** \ingroup arraytypedefs */                                     \
 /** \brief \cpp11 */                                              \
 template <typename Type>                                          \
-using Array##SizeSuffix = Array<Type, Size, 1>; 
+using Array##SizeSuffix = Array<Type, Size, 1>;
 
 #define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size)                     \
 /** \ingroup arraytypedefs */                                     \
@@ -391,7 +393,7 @@
 #undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
 
 #endif // EIGEN_HAS_CXX11
-  
+
 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
 using Eigen::Matrix##SizeSuffix##TypeSuffix; \
 using Eigen::Vector##SizeSuffix##TypeSuffix; \
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 757b318..2e9555b 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_ARRAYWRAPPER_H
 #define EIGEN_ARRAYWRAPPER_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class ArrayWrapper
   * \ingroup Core_Module
@@ -60,14 +60,14 @@
     EIGEN_DEVICE_FUNC
     explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
     EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
@@ -91,8 +91,8 @@
     inline void evalTo(Dest& dst) const { dst = m_expression; }
 
     EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<NestedExpressionType>::type& 
-    nestedExpression() const 
+    const typename internal::remove_all<NestedExpressionType>::type&
+    nestedExpression() const
     {
       return m_expression;
     }
@@ -158,14 +158,14 @@
     EIGEN_DEVICE_FUNC
     explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
     EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
@@ -185,8 +185,8 @@
     }
 
     EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<NestedExpressionType>::type& 
-    nestedExpression() const 
+    const typename internal::remove_all<NestedExpressionType>::type&
+    nestedExpression() const
     {
       return m_expression;
     }
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 508f17d..ab2ebf3 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -638,15 +638,15 @@
     #endif
   }
 
-  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
-  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
-  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); }
 
-  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
-  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
 
   /// Assign src(row,col) to dst(row,col) through the assignment functor.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h
index 4978c91..480e044 100644
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_BANDMATRIX_H
 #define EIGEN_BANDMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -45,7 +45,7 @@
     };
 
   public:
-    
+
     using Base::derived;
     using Base::rows;
     using Base::cols;
@@ -55,10 +55,10 @@
 
     /** \returns the number of sub diagonals */
     inline Index subs() const { return derived().subs(); }
-    
+
     /** \returns an expression of the underlying coefficient matrix */
     inline const CoefficientsType& coeffs() const { return derived().coeffs(); }
-    
+
     /** \returns an expression of the underlying coefficient matrix */
     inline CoefficientsType& coeffs() { return derived().coeffs(); }
 
@@ -130,7 +130,7 @@
       eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
       return Block<const CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
     }
-    
+
     template<typename Dest> inline void evalTo(Dest& dst) const
     {
       dst.resize(rows(),cols());
@@ -211,16 +211,16 @@
     }
 
     /** \returns the number of columns */
-    inline Index rows() const { return m_rows.value(); }
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
 
     /** \returns the number of rows */
-    inline Index cols() const { return m_coeffs.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
 
     /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
 
     /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
 
     inline const CoefficientsType& coeffs() const { return m_coeffs; }
     inline CoefficientsType& coeffs() { return m_coeffs; }
@@ -275,16 +275,16 @@
     }
 
     /** \returns the number of columns */
-    inline Index rows() const { return m_rows.value(); }
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
 
     /** \returns the number of rows */
-    inline Index cols() const { return m_coeffs.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
 
     /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
 
     /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
 
     inline const CoefficientsType& coeffs() const { return m_coeffs; }
 
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 6e938ea..3206d66 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_BLOCK_H
 #define EIGEN_BLOCK_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
@@ -52,7 +52,7 @@
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
     Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
     // FIXME DirectAccessBit should not be handled by expressions
-    // 
+    //
     // Alignment is needed by MapBase's assertions
     // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
     Alignment = 0
@@ -61,7 +61,7 @@
 
 template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false,
          bool HasDirectAccess = internal::has_direct_access<XprType>::ret> class BlockImpl_dense;
-         
+
 } // end namespace internal
 
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
@@ -109,9 +109,9 @@
     typedef Impl Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
-    
+
     typedef typename internal::remove_all<XprType>::type NestedExpression;
-  
+
     /** Column or Row constructor
       */
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -147,7 +147,7 @@
           && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols);
     }
 };
-         
+
 // The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense
 // that must be specialized for direct and non-direct access...
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
@@ -296,23 +296,23 @@
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
-    { 
-      return m_xpr; 
+    {
+      return m_xpr;
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     XprType& nestedExpression() { return m_xpr; }
-      
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    StorageIndex startRow() const
-    { 
-      return m_startRow.value(); 
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startRow() const EIGEN_NOEXCEPT
+    {
+      return m_startRow.value();
     }
-      
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    StorageIndex startCol() const
-    { 
-      return m_startCol.value(); 
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startCol() const EIGEN_NOEXCEPT
+    {
+      return m_startCol.value();
     }
 
   protected:
@@ -344,7 +344,7 @@
       */
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     BlockImpl_dense(XprType& xpr, Index i)
-      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) 
+      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
                                 || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
              BlockRows==1 ? 1 : xpr.rows(),
              BlockCols==1 ? 1 : xpr.cols()),
@@ -378,17 +378,17 @@
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
-    { 
-      return m_xpr; 
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const EIGEN_NOEXCEPT
+    {
+      return m_xpr;
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     XprType& nestedExpression() { return m_xpr; }
-      
+
     /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index innerStride() const EIGEN_NOEXCEPT
     {
       return internal::traits<BlockType>::HasSameStorageOrderAsXprType
              ? m_xpr.innerStride()
@@ -396,23 +396,19 @@
     }
 
     /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index outerStride() const EIGEN_NOEXCEPT
     {
-      return m_outerStride;
+      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
+                    ? m_xpr.outerStride()
+                    : m_xpr.innerStride();
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    StorageIndex startRow() const
-    {
-      return m_startRow.value();
-    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startRow() const EIGEN_NOEXCEPT { return m_startRow.value(); }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    StorageIndex startCol() const
-    {
-      return m_startCol.value();
-    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startCol() const EIGEN_NOEXCEPT { return m_startCol.value(); }
 
   #ifndef __SUNPRO_CC
   // FIXME sunstudio is not friendly with the above friend...
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index a77c0fa..90c552f 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -14,7 +14,7 @@
 #define EIGEN_COREEVALUATORS_H
 
 namespace Eigen {
-  
+
 namespace internal {
 
 // This class returns the evaluator kind from the expression storage kind.
@@ -63,8 +63,8 @@
 template< typename T,
           typename Kind   = typename evaluator_traits<typename T::NestedExpression>::Kind,
           typename Scalar = typename T::Scalar> struct unary_evaluator;
-          
-// evaluator_traits<T> contains traits for evaluator<T> 
+
+// evaluator_traits<T> contains traits for evaluator<T>
 
 template<typename T>
 struct evaluator_traits_base
@@ -111,7 +111,7 @@
 {
   // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
   typedef traits<ExpressionType> ExpressionTraits;
-  
+
   enum {
     Alignment = 0
   };
@@ -143,8 +143,8 @@
 #endif
     eigen_internal_assert(outerStride==OuterStride);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Index outerStride() const { return OuterStride; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; }
   const Scalar *data;
 };
 
@@ -172,7 +172,7 @@
     IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime,
     RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
     ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
-    
+
     CoeffReadCost = NumTraits<Scalar>::ReadCost,
     Flags = traits<Derived>::EvaluatorFlags,
     Alignment = traits<Derived>::Alignment
@@ -274,13 +274,13 @@
   : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
 {
   typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   evaluator() {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit evaluator(const XprType& m)
-    : evaluator<PlainObjectBase<XprType> >(m) 
+    : evaluator<PlainObjectBase<XprType> >(m)
   { }
 };
 
@@ -292,10 +292,10 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   evaluator() {}
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit evaluator(const XprType& m)
-    : evaluator<PlainObjectBase<XprType> >(m) 
+    : evaluator<PlainObjectBase<XprType> >(m)
   { }
 };
 
@@ -306,9 +306,9 @@
   : evaluator_base<Transpose<ArgType> >
 {
   typedef Transpose<ArgType> XprType;
-  
+
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,    
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
     Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
     Alignment = evaluator<ArgType>::Alignment
   };
@@ -499,10 +499,10 @@
 {
   typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
   typedef typename internal::remove_all<PlainObjectType>::type PlainObjectTypeCleaned;
-  
+
   enum {
     CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
-    
+
     Flags = (evaluator<PlainObjectTypeCleaned>::Flags
           &  (  HereditaryBits
               | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
@@ -559,10 +559,10 @@
   : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> >
 {
   typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
-  
+
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
-    
+
     Flags = evaluator<ArgType>::Flags
           & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
     Alignment = evaluator<ArgType>::Alignment
@@ -628,7 +628,7 @@
 {
   typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
   typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > Base;
-  
+
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
@@ -637,10 +637,10 @@
   : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
 {
   typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
-  
+
   enum {
     CoeffReadCost = evaluator<Arg1>::CoeffReadCost + evaluator<Arg2>::CoeffReadCost + evaluator<Arg3>::CoeffReadCost + functor_traits<TernaryOp>::Cost,
-    
+
     Arg1Flags = evaluator<Arg1>::Flags,
     Arg2Flags = evaluator<Arg2>::Flags,
     Arg3Flags = evaluator<Arg3>::Flags,
@@ -723,7 +723,7 @@
 {
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
   typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
@@ -733,10 +733,10 @@
   : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
-  
+
   enum {
     CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    
+
     LhsFlags = evaluator<Lhs>::Flags,
     RhsFlags = evaluator<Rhs>::Flags,
     SameType = is_same<typename Lhs::Scalar,typename Rhs::Scalar>::value,
@@ -813,12 +813,12 @@
   : evaluator_base<CwiseUnaryView<UnaryOp, ArgType> >
 {
   typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
-  
+
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
-    
+
     Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
-    
+
     Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
   };
 
@@ -884,7 +884,7 @@
   typedef typename XprType::PointerType PointerType;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  
+
   enum {
     IsRowMajor = XprType::RowsAtCompileTime,
     ColsAtCompileTime = XprType::ColsAtCompileTime,
@@ -956,17 +956,21 @@
     internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
 protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index rowStride() const EIGEN_NOEXCEPT {
+    return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index colStride() const EIGEN_NOEXCEPT {
+     return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
+  }
 
   PointerType m_data;
   const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
   const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 
-template<typename PlainObjectType, int MapOptions, typename StrideType> 
+template<typename PlainObjectType, int MapOptions, typename StrideType>
 struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
   : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType>
 {
@@ -974,7 +978,7 @@
   typedef typename XprType::Scalar Scalar;
   // TODO: should check for smaller packet types once we can handle multi-sized packet types
   typedef typename packet_traits<Scalar>::type PacketScalar;
-  
+
   enum {
     InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
                              ? int(PlainObjectType::InnerStrideAtCompileTime)
@@ -986,27 +990,27 @@
     HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
     HasNoStride = HasNoInnerStride && HasNoOuterStride,
     IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
-    
+
     PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit),
     LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit),
     Flags = int( evaluator<PlainObjectType>::Flags) & (LinearAccessMask&PacketAccessMask),
-    
+
     Alignment = int(MapOptions)&int(AlignedMask)
   };
 
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
-    : mapbase_evaluator<XprType, PlainObjectType>(map) 
+    : mapbase_evaluator<XprType, PlainObjectType>(map)
   { }
 };
 
 // -------------------- Ref --------------------
 
-template<typename PlainObjectType, int RefOptions, typename StrideType> 
+template<typename PlainObjectType, int RefOptions, typename StrideType>
 struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
   : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType>
 {
   typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
-  
+
   enum {
     Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
     Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
@@ -1014,7 +1018,7 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit evaluator(const XprType& ref)
-    : mapbase_evaluator<XprType, PlainObjectType>(ref) 
+    : mapbase_evaluator<XprType, PlainObjectType>(ref)
   { }
 };
 
@@ -1022,8 +1026,8 @@
 
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
          bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> struct block_evaluator;
-         
-template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
   : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
 {
@@ -1031,15 +1035,15 @@
   typedef typename XprType::Scalar Scalar;
   // TODO: should check for smaller packet types once we can handle multi-sized packet types
   typedef typename packet_traits<Scalar>::type PacketScalar;
-  
+
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    
+
     RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
     ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
     MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
-    
+
     ArgTypeIsRowMajor = (int(evaluator<ArgType>::Flags)&RowMajorBit) != 0,
     IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1
                : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
@@ -1053,14 +1057,14 @@
                              ? int(outer_stride_at_compile_time<ArgType>::ret)
                              : int(inner_stride_at_compile_time<ArgType>::ret),
     MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
-    
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
+
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
     FlagsRowMajorBit = XprType::Flags&RowMajorBit,
     Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
                                            DirectAccessBit |
                                            MaskPacketAccessBit),
     Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
-    
+
     PacketAlignment = unpacket_traits<PacketScalar>::alignment,
     Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
                              && (OuterStrideAtCompileTime!=0)
@@ -1084,7 +1088,7 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit block_evaluator(const XprType& block)
-    : unary_evaluator<XprType>(block) 
+    : unary_evaluator<XprType>(block)
   {}
 };
 
@@ -1096,12 +1100,12 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit unary_evaluator(const XprType& block)
-    : m_argImpl(block.nestedExpression()), 
-      m_startRow(block.startRow()), 
+    : m_argImpl(block.nestedExpression()),
+      m_startRow(block.startRow()),
       m_startCol(block.startCol()),
       m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
   { }
- 
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
@@ -1109,13 +1113,13 @@
     RowsAtCompileTime = XprType::RowsAtCompileTime,
     ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
- 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
-  { 
-    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); 
+  {
+    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col);
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
@@ -1124,44 +1128,44 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
-  { 
-    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); 
+  {
+    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col);
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
     return linear_coeffRef_impl(index, bool_constant<ForwardLinearAccess>());
   }
- 
+
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index row, Index col) const 
-  { 
-    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col); 
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col);
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index index) const 
-  { 
+  PacketType packet(Index index) const
+  {
     if (ForwardLinearAccess)
       return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
     else
       return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
                                          RowsAtCompileTime == 1 ? index : 0);
   }
-  
+
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  void writePacket(Index row, Index col, const PacketType& x) 
+  void writePacket(Index row, Index col, const PacketType& x)
   {
-    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x); 
+    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
   }
-  
+
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketType& x) 
+  void writePacket(Index index, const PacketType& x)
   {
     if (ForwardLinearAccess)
       return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
@@ -1170,12 +1174,12 @@
                                               RowsAtCompileTime == 1 ? index : 0,
                                               x);
   }
- 
+
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const
   {
-    return m_argImpl.coeff(m_linear_offset.value() + index); 
+    return m_argImpl.coeff(m_linear_offset.value() + index);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const
@@ -1186,7 +1190,7 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */)
   {
-    return m_argImpl.coeffRef(m_linear_offset.value() + index); 
+    return m_argImpl.coeffRef(m_linear_offset.value() + index);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */)
@@ -1200,10 +1204,10 @@
   const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
 };
 
-// TODO: This evaluator does not actually use the child evaluator; 
+// TODO: This evaluator does not actually use the child evaluator;
 // all action is via the data() as returned by the Block expression.
 
-template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
   : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
                       typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
@@ -1213,7 +1217,7 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit block_evaluator(const XprType& block)
-    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block)
   {
     // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
     eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
@@ -1236,7 +1240,7 @@
                                          evaluator<ElseMatrixType>::CoeffReadCost),
 
     Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
-    
+
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
   };
 
@@ -1248,7 +1252,7 @@
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
- 
+
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1268,7 +1272,7 @@
     else
       return m_elseImpl.coeff(index);
   }
- 
+
 protected:
   evaluator<ConditionMatrixType> m_conditionImpl;
   evaluator<ThenMatrixType> m_thenImpl;
@@ -1278,7 +1282,7 @@
 
 // -------------------- Replicate --------------------
 
-template<typename ArgType, int RowFactor, int ColFactor> 
+template<typename ArgType, int RowFactor, int ColFactor>
 struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
   : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
 {
@@ -1289,12 +1293,12 @@
   };
   typedef typename internal::nested_eval<ArgType,Factor>::type ArgTypeNested;
   typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
-  
+
   enum {
     CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
     LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
     Flags = (evaluator<ArgTypeNestedCleaned>::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit),
-    
+
     Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
   };
 
@@ -1305,7 +1309,7 @@
       m_rows(replicate.nestedExpression().rows()),
       m_cols(replicate.nestedExpression().cols())
   {}
- 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
@@ -1316,10 +1320,10 @@
     const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
                            : ColFactor==1 ? col
                            : col % m_cols.value();
-    
+
     return m_argImpl.coeff(actual_row, actual_col);
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
@@ -1327,7 +1331,7 @@
     const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
                                   ? (ColFactor==1 ?  index : index%m_cols.value())
                                   : (RowFactor==1 ?  index : index%m_rows.value());
-    
+
     return m_argImpl.coeff(actual_index);
   }
 
@@ -1344,7 +1348,7 @@
 
     return m_argImpl.template packet<LoadMode,PacketType>(actual_row, actual_col);
   }
-  
+
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
@@ -1355,7 +1359,7 @@
 
     return m_argImpl.template packet<LoadMode,PacketType>(actual_index);
   }
- 
+
 protected:
   const ArgTypeNested m_arg;
   evaluator<ArgTypeNestedCleaned> m_argImpl;
@@ -1487,9 +1491,9 @@
     ReversePacket = (Direction == BothDirections)
                     || ((Direction == Vertical)   && IsColMajor)
                     || ((Direction == Horizontal) && IsRowMajor),
-                    
+
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    
+
     // let's enable LinearAccess only with vectorization because of the product overhead
     // FIXME enable DirectAccess with negative strides?
     Flags0 = evaluator<ArgType>::Flags,
@@ -1498,7 +1502,7 @@
                  ? LinearAccessBit : 0,
 
     Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
-    
+
     Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
   };
 
@@ -1508,7 +1512,7 @@
       m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
       m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
   { }
- 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
@@ -1583,7 +1587,7 @@
     m_argImpl.template writePacket<LoadMode>
       (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
   }
- 
+
 protected:
   evaluator<ArgType> m_argImpl;
 
@@ -1601,12 +1605,12 @@
   : evaluator_base<Diagonal<ArgType, DiagIndex> >
 {
   typedef Diagonal<ArgType, DiagIndex> XprType;
-  
+
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    
+
     Flags = (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
-    
+
     Alignment = 0
   };
 
@@ -1615,7 +1619,7 @@
     : m_argImpl(diagonal.nestedExpression()),
       m_index(diagonal.index())
   { }
- 
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
@@ -1648,8 +1652,10 @@
   const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
 
 private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
 };
 
 
@@ -1673,25 +1679,25 @@
   : public dense_xpr_base<EvalToTemp<ArgType> >::type
 {
  public:
- 
+
   typedef typename dense_xpr_base<EvalToTemp>::type Base;
   EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
- 
+
   explicit EvalToTemp(const ArgType& arg)
     : m_arg(arg)
   { }
- 
+
   const ArgType& arg() const
   {
     return m_arg;
   }
 
-  Index rows() const 
+  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT
   {
     return m_arg.rows();
   }
 
-  Index cols() const 
+  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT
   {
     return m_arg.cols();
   }
@@ -1699,7 +1705,7 @@
  private:
   const ArgType& m_arg;
 };
- 
+
 template<typename ArgType>
 struct evaluator<EvalToTemp<ArgType> >
   : public evaluator<typename ArgType::PlainObject>
@@ -1707,7 +1713,7 @@
   typedef EvalToTemp<ArgType>                   XprType;
   typedef typename ArgType::PlainObject         PlainObject;
   typedef evaluator<PlainObject> Base;
-  
+
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
     : m_result(xpr.arg())
   {
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index 8b8de83..59974a5 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -74,7 +74,7 @@
   * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
   */
 template<typename BinaryOp, typename LhsType, typename RhsType>
-class CwiseBinaryOp : 
+class CwiseBinaryOp :
   public CwiseBinaryOpImpl<
           BinaryOp, LhsType, RhsType,
           typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
@@ -83,7 +83,7 @@
   internal::no_assignment_operator
 {
   public:
-    
+
     typedef typename internal::remove_all<BinaryOp>::type Functor;
     typedef typename internal::remove_all<LhsType>::type Lhs;
     typedef typename internal::remove_all<RhsType>::type Rhs;
@@ -116,21 +116,15 @@
       eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index rows() const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT {
       // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
-        return m_rhs.rows();
-      else
-        return m_lhs.rows();
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows();
     }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index cols() const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT {
       // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
-        return m_rhs.cols();
-      else
-        return m_lhs.cols();
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic ? m_rhs.cols() : m_lhs.cols();
     }
 
     /** \returns the left hand side nested expression */
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 8f3496f..289ec51 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -74,10 +74,10 @@
             && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const { return m_cols.value(); }
 
     /** \returns the functor representing the nullary operation */
     EIGEN_DEVICE_FUNC
@@ -131,7 +131,7 @@
   *
   * Here is an example with C++11 random generators: \include random_cpp11.cpp
   * Output: \verbinclude random_cpp11.out
-  * 
+  *
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 1d2dd19..e68c4f7 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_CWISE_UNARY_OP_H
 #define EIGEN_CWISE_UNARY_OP_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename UnaryOp, typename XprType>
@@ -24,7 +24,7 @@
   typedef typename XprType::Nested XprTypeNested;
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   enum {
-    Flags = _XprTypeNested::Flags & RowMajorBit 
+    Flags = _XprTypeNested::Flags & RowMajorBit
   };
 };
 }
@@ -65,10 +65,10 @@
     explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index rows() const { return m_xpr.rows(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index cols() const { return m_xpr.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
 
     /** \returns the functor representing the unary operation */
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 02b034d..a06d762 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -69,8 +69,10 @@
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     /** \returns the functor representing unary operation */
     EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
@@ -108,16 +110,16 @@
 
     EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
-    
+
     EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
     EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
 
-    EIGEN_DEVICE_FUNC inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
     {
       return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
 
-    EIGEN_DEVICE_FUNC inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
     {
       return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 20cc482..9b16db6 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -14,7 +14,7 @@
 namespace Eigen {
 
 namespace internal {
-  
+
 // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type.
 // This dummy function simply aims at checking that at compile time.
 static inline void check_DenseIndex_is_signed() {
@@ -22,7 +22,7 @@
 }
 
 } // end namespace internal
-  
+
 /** \class DenseBase
   * \ingroup Core_Module
   *
@@ -64,12 +64,12 @@
 
     /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc. */
     typedef typename internal::traits<Derived>::Scalar Scalar;
-    
+
     /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
       *
       * It is an alias for the Scalar type */
     typedef Scalar value_type;
-    
+
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
 
@@ -158,7 +158,7 @@
           * a row-vector (if there is only one row). */
 
       NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
-        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, 
+        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
          * and 2 for matrices.
          */
 
@@ -175,11 +175,11 @@
       InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
       OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
     };
-    
+
     typedef typename internal::find_best_packet<Scalar,SizeAtCompileTime>::type PacketScalar;
 
     enum { IsPlainObjectBase = 0 };
-    
+
     /** The plain matrix type corresponding to this expression.
       * \sa PlainObject */
     typedef Matrix<typename internal::traits<Derived>::Scalar,
@@ -189,7 +189,7 @@
                 internal::traits<Derived>::MaxRowsAtCompileTime,
                 internal::traits<Derived>::MaxColsAtCompileTime
           > PlainMatrix;
-    
+
     /** The plain array type corresponding to this expression.
       * \sa PlainObject */
     typedef Array<typename internal::traits<Derived>::Scalar,
@@ -211,7 +211,7 @@
 
     /** \returns the number of nonzero coefficients which is in practice the number
       * of stored coefficients. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index nonZeros() const { return size(); }
 
     /** \returns the outer size.
@@ -219,7 +219,7 @@
       * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
       * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
       * column-major matrix, and the number of rows for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     Index outerSize() const
     {
       return IsVectorAtCompileTime ? 1
@@ -229,9 +229,9 @@
     /** \returns the inner size.
       *
       * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
-      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
+      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
       * column-major matrix, and the number of columns for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     Index innerSize() const
     {
       return IsVectorAtCompileTime ? this->size()
@@ -411,7 +411,7 @@
       // size types on MSVC.
       return typename internal::eval<Derived>::type(derived());
     }
-    
+
     /** swaps *this with the expression \a other.
       *
       */
@@ -459,22 +459,48 @@
     // used.
     // TODO(rmlarsen): Replace with default template argument when we move to
     // c++11 or beyond.
-    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const {
+    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff() const {
       return minCoeff<PropagateFast>();
     }
-    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const {
+    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff() const {
       return maxCoeff<PropagateFast>();
     }
 
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
 
+    // TODO(rmlarsen): Replace these methods with a default template argument.
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const {
+      return minCoeff<PropagateFast>(row, col);
+    }
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const {
+      return maxCoeff<PropagateFast>(row, col);
+    }
+    template<typename IndexType>
+     EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const {
+      return minCoeff<PropagateFast>(index);
+    }
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const {
+      return maxCoeff<PropagateFast>(index);
+    }
+  
     template<typename BinaryOp>
     EIGEN_DEVICE_FUNC
     Scalar redux(const BinaryOp& func) const;
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 463b471..37fcdb5 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -27,7 +27,7 @@
   *
   * This class defines the \c operator() \c const function and friends, which can be used to read specific
   * entries of a matrix or array.
-  * 
+  *
   * \sa DenseCoeffsBase<Derived, WriteAccessors>, DenseCoeffsBase<Derived, DirectAccessors>,
   *     \ref TopicClassHierarchy
   */
@@ -295,7 +295,7 @@
   * This class defines the non-const \c operator() function and friends, which can be used to write specific
   * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
   * defines the const variant for reading specific entries.
-  * 
+  *
   * \sa DenseCoeffsBase<Derived, DirectAccessors>, \ref TopicClassHierarchy
   */
 template<typename Derived>
@@ -495,7 +495,7 @@
       *
       * \sa outerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index innerStride() const
     {
       return derived().innerStride();
@@ -506,14 +506,14 @@
       *
       * \sa innerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
       return derived().outerStride();
     }
 
     // FIXME shall we remove it ?
-    inline Index stride() const
+    EIGEN_CONSTEXPR inline Index stride() const
     {
       return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
     }
@@ -522,7 +522,7 @@
       *
       * \sa innerStride(), outerStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index rowStride() const
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -532,7 +532,7 @@
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index colStride() const
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -570,8 +570,8 @@
       *
       * \sa outerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT
     {
       return derived().innerStride();
     }
@@ -581,14 +581,14 @@
       *
       * \sa innerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT
     {
       return derived().outerStride();
     }
 
     // FIXME shall we remove it ?
-    inline Index stride() const
+    EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT
     {
       return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
     }
@@ -597,8 +597,8 @@
       *
       * \sa innerStride(), outerStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index rowStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rowStride() const EIGEN_NOEXCEPT
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
     }
@@ -607,8 +607,8 @@
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index colStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index colStride() const EIGEN_NOEXCEPT
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
     }
@@ -619,7 +619,7 @@
 template<int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
-  static inline Index run(const Derived&)
+  static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT
   { return 0; }
 };
 
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 6966513..f6e1d0a 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -47,20 +47,20 @@
 
   EIGEN_DEVICE_FUNC
   plain_array()
-  { 
+  {
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
   plain_array(constructor_without_unaligned_array_assert)
-  { 
+  {
     check_static_allocation_size<T,Size>();
   }
 };
 
 #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
-#elif EIGEN_GNUC_AT_LEAST(4,7) 
+#elif EIGEN_GNUC_AT_LEAST(4,7)
   // GCC 4.7 is too aggressive in its optimizations and removes the alignment test based on the fact that the array is declared to be aligned.
   // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
   // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
@@ -85,15 +85,15 @@
   EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
+  plain_array()
   {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -104,15 +104,15 @@
   EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
-  { 
+  plain_array()
+  {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -123,15 +123,15 @@
   EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
+  plain_array()
   {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -142,15 +142,15 @@
   EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
-  { 
+  plain_array()
+  {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -190,15 +190,15 @@
     EIGEN_DEVICE_FUNC
     explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()) {}
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
     }
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     DenseStorage& operator=(const DenseStorage& other)
-    { 
+    {
       if (this != &other) m_data = other.m_data;
-      return *this; 
+      return *this;
     }
 #if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
@@ -222,8 +222,8 @@
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
       numext::swap(m_data, other.m_data);
     }
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
@@ -240,8 +240,8 @@
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
     EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC const T *data() const { return 0; }
@@ -269,15 +269,15 @@
     EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
     EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
-    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
-    { 
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
       if (this != &other)
       {
         m_data = other.m_data;
         m_rows = other.m_rows;
         m_cols = other.m_cols;
       }
-      return *this; 
+      return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
@@ -304,14 +304,14 @@
     EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
     EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
-    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
         m_data = other.m_data;
         m_rows = other.m_rows;
       }
-      return *this; 
+      return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
@@ -319,8 +319,8 @@
       numext::swap(m_data,other.m_data);
       numext::swap(m_rows,other.m_rows);
     }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
     EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
@@ -351,8 +351,8 @@
       numext::swap(m_data,other.m_data);
       numext::swap(m_cols,other.m_cols);
     }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
     EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
@@ -419,8 +419,8 @@
       numext::swap(m_rows,other.m_rows);
       numext::swap(m_cols,other.m_cols);
     }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
     void conservativeResize(Index size, Index rows, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
@@ -474,7 +474,7 @@
         this->swap(tmp);
       }
       return *this;
-    }    
+    }
 #if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
     DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
@@ -497,8 +497,8 @@
       numext::swap(m_data,other.m_data);
       numext::swap(m_cols,other.m_cols);
     }
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
@@ -550,7 +550,7 @@
         this->swap(tmp);
       }
       return *this;
-    }    
+    }
 #if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
     DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
@@ -573,8 +573,8 @@
       numext::swap(m_data,other.m_data);
       numext::swap(m_rows,other.m_rows);
     }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return _Cols;}
     void conservativeResize(Index size, Index rows, Index)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 563135f..3112d2c 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_DIAGONAL_H
 #define EIGEN_DIAGONAL_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Diagonal
   * \ingroup Core_Module
@@ -84,20 +84,16 @@
                                : numext::mini<Index>(m_matrix.rows(),m_matrix.cols()-m_index.value());
     }
 
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return 1; }
 
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
-    {
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT {
       return m_matrix.outerStride() + 1;
     }
 
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
-    {
-      return 0;
-    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return 0; }
 
     typedef typename internal::conditional<
                        internal::is_lvalue<MatrixType>::value,
@@ -149,8 +145,8 @@
     }
 
     EIGEN_DEVICE_FUNC
-    inline const typename internal::remove_all<typename MatrixType::Nested>::type& 
-    nestedExpression() const 
+    inline const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const
     {
       return m_matrix;
     }
@@ -167,12 +163,12 @@
 
   private:
     // some compilers may fail to optimize std::max etc in case of compile-time constants...
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index absDiagIndex() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rowOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index colOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : 0; }
     // trigger a compile-time error if someone try to call packet
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index 0c34fb6..6b3c7d3 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -15,7 +15,7 @@
 
 /** \class EigenBase
   * \ingroup Core_Module
-  * 
+  *
   * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
   *
   * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
@@ -29,7 +29,7 @@
 template<typename Derived> struct EigenBase
 {
 //   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
-  
+
   /** \brief The interface type of indices
     * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
     * \sa StorageIndex, \ref TopicPreprocessorDirectives.
@@ -56,15 +56,15 @@
   { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-  EIGEN_DEVICE_FUNC
-  inline Index rows() const { return derived().rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
   /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-  EIGEN_DEVICE_FUNC
-  inline Index cols() const { return derived().cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
   /** \returns the number of coefficients, which is rows()*cols().
     * \sa rows(), cols(), SizeAtCompileTime. */
-  EIGEN_DEVICE_FUNC
-  inline Index size() const { return rows() * cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); }
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
   template<typename Dest>
diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h
index 7b08b45..817a43a 100644
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -41,10 +41,14 @@
 
     EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
     EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index bc0fe39..53800a0 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -556,7 +556,7 @@
 peven_mask(const Packet& /*a*/) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   const size_t n = unpacket_traits<Packet>::size;
-  Scalar elements[n];
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
   for(size_t i = 0; i < n; ++i) {
     memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
   }
@@ -731,7 +731,7 @@
 predux_helper(const Packet& a, Op op) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   const size_t n = unpacket_traits<Packet>::size;
-  Scalar elements[n];
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
   pstoreu<Scalar>(elements, a);
   for(size_t k = n / 2; k > 0; k /= 2)  {
     for(size_t i = 0; i < k; ++i) {
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index 7352d80..c514438 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_INVERSE_H
 #define EIGEN_INVERSE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 template<typename XprType,typename StorageKind> class InverseImpl;
 
@@ -49,13 +49,13 @@
   typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef typename internal::remove_all<XprType>::type NestedExpression;
-  
+
   explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
     : m_xpr(xpr)
   {}
 
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
 
   EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
 
@@ -81,7 +81,7 @@
 
 /** \internal
   * \brief Default evaluator for Inverse expression.
-  * 
+  *
   * This default evaluator for Inverse expression simply evaluate the inverse into a temporary
   * by a call to internal::call_assignment_no_alias.
   * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
@@ -96,7 +96,7 @@
   typedef Inverse<ArgType> InverseType;
   typedef typename InverseType::PlainObject PlainObject;
   typedef evaluator<PlainObject> Base;
-  
+
   enum { Flags = Base::Flags | EvalBeforeNestingBit };
 
   unary_evaluator(const InverseType& inv_xpr)
@@ -105,11 +105,11 @@
     ::new (static_cast<Base*>(this)) Base(m_result);
     internal::call_assignment_no_alias(m_result, inv_xpr);
   }
-  
+
 protected:
   PlainObject m_result;
 };
-  
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index c437f1a..93d2ae9 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_MAP_H
 #define EIGEN_MAP_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename PlainObjectType, int MapOptions, typename StrideType>
@@ -104,13 +104,13 @@
     EIGEN_DEVICE_FUNC
     inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index innerStride() const
     {
       return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
     }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
       return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 92c3b28..d856447 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -15,7 +15,7 @@
       EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
                           YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Core_Module
   *
@@ -87,9 +87,11 @@
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
     /** \copydoc DenseBase::rows() */
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); }
     /** \copydoc DenseBase::cols() */
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); }
 
     /** Returns a pointer to the first coefficient of the matrix or vector.
       *
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 7c67ffd..2920121 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -214,12 +214,12 @@
 template<typename Scalar>
 struct imag_ref_default_impl<Scalar, false>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Scalar run(Scalar&)
   {
     return Scalar(0);
   }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline const Scalar run(const Scalar&)
   {
     return Scalar(0);
@@ -485,14 +485,31 @@
     EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
 #if EIGEN_HAS_CXX11_MATH
     EIGEN_USING_STD(round);
+#endif
     return Scalar(round(x));
-#elif EIGEN_HAS_C99_MATH
-    if (is_same<Scalar, float>::value) {
-      return Scalar(::roundf(x));
-    } else {
-      return Scalar(round(x));
-    }
+  }
+};
+
+#if !EIGEN_HAS_CXX11_MATH
+#if EIGEN_HAS_C99_MATH
+// Use ::roundf for float.
+template<>
+struct round_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static inline float run(const float& x)
+  {
+    return ::roundf(x);
+  }
+};
 #else
+template<typename Scalar>
+struct round_using_floor_ceil_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+    // Without C99 round/roundf, resort to floor/ceil.
     EIGEN_USING_STD(floor);
     EIGEN_USING_STD(ceil);
     // If not enough precision to resolve a decimal at all, return the input.
@@ -502,10 +519,17 @@
       return x;
     }
     return (x > Scalar(0)) ? Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5)));
-#endif
   }
 };
 
+template<>
+struct round_impl<float> : round_using_floor_ceil_impl<float> {};
+
+template<>
+struct round_impl<double> : round_using_floor_ceil_impl<double> {};
+#endif // EIGEN_HAS_C99_MATH
+#endif // !EIGEN_HAS_CXX11_MATH
+
 template<typename Scalar>
 struct round_retval
 {
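When neither C++11 nor C99 math is available, rounding now lives in the dedicated round_using_floor_ceil_impl specializations shown above; the fallback rounds half away from zero. A standalone sketch of that behavior (illustration only, omitting the precision guard the real implementation keeps):

#include <cmath>

double round_half_away_from_zero(double x) {
  // round(2.5) == 3.0 and round(-2.5) == -3.0, matching the floor/ceil fallback.
  return (x > 0.0) ? std::floor(x + 0.5) : std::ceil(x - 0.5);
}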
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 053003c..f0e59a9 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -29,7 +29,7 @@
       required_alignment = unpacket_traits<PacketScalar>::alignment,
       packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0
     };
-    
+
 public:
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
@@ -44,7 +44,7 @@
     Options = _Options,
     InnerStrideAtCompileTime = 1,
     OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
-    
+
     // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
     EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
     Alignment = actual_alignment
@@ -297,24 +297,24 @@
       : Base(a0, a1, a2, a3, args...) {}
 
     /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
-      * 
+      *
       * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
-      * 
+      *
       * Example: \include Matrix_initializer_list_23_cxx11.cpp
       * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
-      * 
+      *
       * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
-      * 
+      *
       * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
       * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
       * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
-      * 
+      *
       * Example: \include Matrix_initializer_list_vector_cxx11.cpp
       * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
-      * 
+      *
       * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
       * and implicit transposition is allowed for compile-time vectors only.
-      * 
+      *
       * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
       */
     EIGEN_DEVICE_FUNC
@@ -351,7 +351,7 @@
       * This is useful for dynamic-size vectors. For fixed-size vectors,
       * it is redundant to pass these parameters, so one should use the default constructor
       * Matrix() instead.
-      * 
+      *
       * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
       * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
       * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
@@ -367,7 +367,7 @@
       * This is useful for dynamic-size matrices. For fixed-size matrices,
       * it is redundant to pass these parameters, so one should use the default constructor
       * Matrix() instead.
-      * 
+      *
       * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
       * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
       * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
@@ -376,7 +376,7 @@
       */
     EIGEN_DEVICE_FUNC
     Matrix(Index rows, Index cols);
-    
+
     /** \brief Constructs an initialized 2D vector with given coefficients
       * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
     Matrix(const Scalar& x, const Scalar& y);
@@ -423,8 +423,10 @@
       : Base(other.derived())
     { }
 
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
 
     /////////// Geometry module ///////////
 
@@ -463,14 +465,14 @@
   *
   * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
   * a fixed-size vector of 4 complex floats.
-  * 
+  *
   * With \cpp11, template alias are also defined for common sizes.
   * They follow the same pattern as above except that the scalar type suffix is replaced by a
   * template parameter, i.e.:
   *   - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
   *   - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
   *   - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
-  * 
+  *
   * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and `RowVector<Type,Size>`.
   *
   * \sa class Matrix
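The documentation touched above refers to the C++11 initializer-list constructors and the generic Vector<Type,Size> alias. A brief sketch of both (illustration only, assuming Eigen is built with C++11 or later):

#include <Eigen/Core>

int main() {
  Eigen::Matrix<int, 2, 3> m{{1, 2, 3},
                             {4, 5, 6}};          // one inner list per row
  Eigen::VectorXd v{{1.0, 2.0, 3.0, 4.0, 5.0}};   // single row implicitly transposed into a column vector
  Eigen::Vector<float, 4> w = Eigen::Vector<float, 4>::Zero();  // generic alias for Matrix<float,4,1>
  (void)m; (void)v; (void)w;
  return 0;
}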
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 239bbba..b427576 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -45,8 +45,8 @@
 
     EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
 
     EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 609e114..fdd4d4f 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -135,9 +135,18 @@
   * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
   *     value by the fuzzy comparison operators.
   * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">std::numeric_limits<T>::digits</a>
+  *     which is used as the default implementation if specialized.
   * \li digits10() function returning the number of decimal digits that can be represented without change. This is
   *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
   *     which is used as the default implementation if specialized.
+  * \li min_exponent() and max_exponent() functions returning the lowest and highest possible exponents, respectively,
+  *     such that the radix raised to the power exponent-1 is a normalized floating-point number. These are equivalent to
+  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">std::numeric_limits<T>::min_exponent</a>/
+  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">std::numeric_limits<T>::max_exponent</a>.
+  * \li infinity() function returning a representation of positive infinity, if available.
+  * \li quiet_NaN() function returning a non-signaling "not-a-number", if available.
   */
 
 template<typename T> struct GenericNumTraits
@@ -180,13 +189,24 @@
   }
 
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int min_exponent()
+  {
+    return numext::numeric_limits<T>::min_exponent;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int max_exponent()
+  {
+    return numext::numeric_limits<T>::max_exponent;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real dummy_precision()
   {
     // make sure to override this for floating-point types
     return Real(0);
   }
 
-
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T highest() {
     return (numext::numeric_limits<T>::max)();
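The new min_exponent()/max_exponent() hooks mirror their std::numeric_limits counterparts, which is what lets the StableNorm change further down query them through NumTraits. A small host-side sketch (illustration only, assuming numext::numeric_limits forwards to std::numeric_limits as it does on the host):

#include <Eigen/Core>
#include <cassert>
#include <limits>

int main() {
  assert(Eigen::NumTraits<float>::min_exponent() == std::numeric_limits<float>::min_exponent);
  assert(Eigen::NumTraits<float>::max_exponent() == std::numeric_limits<float>::max_exponent);
  assert(Eigen::NumTraits<double>::digits() == std::numeric_limits<double>::digits);
  return 0;
}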
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index e53ca1b..202ed71 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -13,10 +13,10 @@
 
 #if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO)
 # define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
 #elif defined(EIGEN_INITIALIZE_MATRICES_BY_NAN)
 # define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
 #else
 # undef EIGEN_INITIALIZE_COEFFS
 # define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@@ -139,10 +139,10 @@
     EIGEN_DEVICE_FUNC
     const Base& base() const { return *static_cast<const Base*>(this); }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); }
 
     /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
       * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
@@ -522,11 +522,11 @@
     /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients. \cpp11
       *
       * \only_for_vectors
-      * 
+      *
       * This constructor is for 1D arrays or vectors with more than 4 coefficients.
       * There exist C++98 analogue constructors for fixed-size arrays/vectors having 1, 2, 3, or 4 coefficients.
-      * 
-      * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this 
+      *
+      * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
       * constructor must match the fixed number of rows (resp. columns) of \c *this.
       */
     template <typename... ArgTypes>
@@ -540,7 +540,7 @@
       m_storage.data()[1] = a1;
       m_storage.data()[2] = a2;
       m_storage.data()[3] = a3;
-      int i = 4;
+      Index i = 4;
       auto x = {(m_storage.data()[i++] = args, 0)...};
       static_cast<void>(x);
     }
@@ -568,7 +568,7 @@
         eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
         eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
         resize(list.size(), list_size);
-       
+
         Index row_index = 0;
         for (const std::initializer_list<Scalar>& row : list) {
           eigen_assert(list_size == row.size());
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 13d5662..70a6c10 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -23,25 +23,25 @@
   typedef typename remove_all<Rhs>::type RhsCleaned;
   typedef traits<LhsCleaned> LhsTraits;
   typedef traits<RhsCleaned> RhsTraits;
-  
+
   typedef MatrixXpr XprKind;
-  
+
   typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
   typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
                                                 typename RhsTraits::StorageKind,
                                                 internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
   typedef typename promote_index_type<typename LhsTraits::StorageIndex,
                                       typename RhsTraits::StorageIndex>::type StorageIndex;
-  
+
   enum {
     RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
     ColsAtCompileTime    = RhsTraits::ColsAtCompileTime,
     MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
-    
+
     // FIXME: only needed by GeneralMatrixMatrixTriangular
     InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
-    
+
     // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
     Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit
           : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
@@ -74,10 +74,10 @@
                                                                                    internal::product_type<_Lhs,_Rhs>::ret>::ret>
 {
   public:
-    
+
     typedef _Lhs Lhs;
     typedef _Rhs Rhs;
-    
+
     typedef typename ProductImpl<
         Lhs, Rhs, Option,
         typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
@@ -98,10 +98,10 @@
         && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index rows() const { return m_lhs.rows(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const LhsNestedCleaned& lhs() const { return m_lhs; }
@@ -115,7 +115,7 @@
 };
 
 namespace internal {
-  
+
 template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
 class dense_product_base
  : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@@ -131,7 +131,7 @@
 public:
   using Base::derived;
   typedef typename Base::Scalar Scalar;
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
   {
     return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
@@ -153,25 +153,25 @@
   : public internal::dense_product_base<Lhs,Rhs,Option>
 {
     typedef Product<Lhs, Rhs, Option> Derived;
-    
+
   public:
-    
+
     typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
   protected:
     enum {
-      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && 
+      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) &&
                    (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
       EnableCoeff = IsOneByOne || Option==LazyProduct
     };
-    
+
   public:
-  
+
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
     {
       EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
       eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
-      
+
       return internal::evaluator<Derived>(derived()).coeff(row,col);
     }
 
@@ -179,11 +179,11 @@
     {
       EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
       eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
-      
+
       return internal::evaluator<Derived>(derived()).coeff(i);
     }
-    
-  
+
+
 };
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 9fca428..c2a37ea 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_REF_H
 #define EIGEN_REF_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -48,7 +48,7 @@
     };
     typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
   };
-  
+
 };
 
 template<typename Derived>
@@ -67,12 +67,12 @@
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  EIGEN_DEVICE_FUNC inline Index innerStride() const
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
   {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  EIGEN_DEVICE_FUNC inline Index outerStride() const
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
   {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
          : IsVectorAtCompileTime ? this->size()
@@ -86,7 +86,7 @@
       m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
                StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime)
   {}
-  
+
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase)
 
 protected:
@@ -94,25 +94,13 @@
   typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
 
   // Resolves inner stride if default 0.
-  static EIGEN_DEVICE_FUNC Index resolveInnerStride(Index inner) {
-    if (inner == 0) {
-      return 1;
-    }
-    return inner;
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) {
+    return inner == 0 ? 1 : inner;
   }
-  
+
   // Resolves outer stride if default 0.
-  static EIGEN_DEVICE_FUNC Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) {
-    if (outer == 0) {
-      if (isVectorAtCompileTime) {
-        outer = inner * rows * cols;
-      } else if (isRowMajor) {
-        outer = inner * cols;
-      } else {
-        outer = inner * rows;
-      }
-    }
-    return outer;
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) {
+    return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
   }
 
   // Returns true if construction is valid, false if there is a stride mismatch,
@@ -155,8 +143,8 @@
       (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows));
     eigen_assert(
       (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols));
-  
-  
+
+
     // If this is a vector, we might be transposing, which means that stride should swap.
     const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows());
     // If the storage format differs, we also need to swap the stride.
@@ -165,42 +153,42 @@
     const bool storage_differs =  (row_major != expr_row_major);
 
     const bool swap_stride = (transpose != storage_differs);
-    
+
     // Determine expr's actual strides, resolving any defaults if zero.
     const Index expr_inner_actual = resolveInnerStride(expr.innerStride());
-    const Index expr_outer_actual = resolveOuterStride(expr_inner_actual, 
+    const Index expr_outer_actual = resolveOuterStride(expr_inner_actual,
                                                        expr.outerStride(),
                                                        expr.rows(),
-                                                       expr.cols(), 
+                                                       expr.cols(),
                                                        Expression::IsVectorAtCompileTime != 0,
                                                        expr_row_major);
-                                                       
+
     // If this is a column-major row vector or row-major column vector, the inner-stride
     // is arbitrary, so set it to either the compile-time inner stride or 1.
     const bool row_vector = (rows == 1);
     const bool col_vector = (cols == 1);
-    const Index inner_stride = 
-        ( (!row_major && row_vector) || (row_major && col_vector) ) ? 
-            ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1) 
+    const Index inner_stride =
+        ( (!row_major && row_vector) || (row_major && col_vector) ) ?
+            ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1)
             : swap_stride ? expr_outer_actual : expr_inner_actual;
-              
+
     // If this is a column-major column vector or row-major row vector, the outer-stride
     // is arbitrary, so set it to either the compile-time outer stride or vector size.
-    const Index outer_stride = 
-      ( (!row_major && col_vector) || (row_major && row_vector) ) ? 
-          ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride) 
+    const Index outer_stride =
+      ( (!row_major && col_vector) || (row_major && row_vector) ) ?
+          ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride)
           : swap_stride ? expr_inner_actual : expr_outer_actual;
-  
+
     // Check if given inner/outer strides are compatible with compile-time strides.
     const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic)
         || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride);
     if (!inner_valid) {
       return false;
     }
-    
+
     const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic)
         || (resolveOuterStride(
-              inner_stride, 
+              inner_stride,
               Index(StrideType::OuterStrideAtCompileTime),
               rows, cols, PlainObjectType::IsVectorAtCompileTime != 0,
               row_major)
@@ -208,7 +196,7 @@
     if (!outer_valid) {
       return false;
     }
-    
+
     ::new (static_cast<Base*>(this)) Base(expr.data(), rows, cols);
     ::new (&m_stride) StrideBase(
       (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride,
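The construction helpers above now resolve default (zero) strides as single constexpr expressions; the rules themselves are unchanged. Spelled out with example values (illustration only, using stand-in names rather than the internal RefBase members):

using Index = long;  // stand-in for Eigen::Index

constexpr Index resolveInner(Index inner) { return inner == 0 ? 1 : inner; }
constexpr Index resolveOuter(Index inner, Index outer, Index rows, Index cols,
                             bool isVector, bool isRowMajor) {
  return outer == 0 ? (isVector ? inner * rows * cols
                                : (isRowMajor ? inner * cols : inner * rows))
                    : outer;
}

static_assert(resolveInner(0) == 1, "a default inner stride resolves to 1");
static_assert(resolveOuter(1, 0, 4, 3, false, false) == 4,
              "column-major: default outer stride is inner * rows");
static_assert(resolveOuter(1, 0, 4, 3, false, true) == 3,
              "row-major: default outer stride is inner * cols");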
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index 0b2d6d7..ab5be7e 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_REPLICATE_H
 #define EIGEN_REPLICATE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename MatrixType,int RowFactor,int ColFactor>
@@ -35,7 +35,7 @@
     IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
                : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
                : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    
+
     // FIXME enable DirectAccess with negative strides?
     Flags = IsRowMajor ? RowMajorBit : 0
   };
@@ -88,15 +88,15 @@
                           THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
     }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
 
     EIGEN_DEVICE_FUNC
     const _MatrixTypeNested& nestedExpression() const
-    { 
-      return m_matrix; 
+    {
+      return m_matrix;
     }
 
   protected:
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index 36b4f41..52de73b 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -240,14 +240,14 @@
     XprType& nestedExpression() { return m_xpr; }
 
     /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index innerStride() const
     {
       return m_xpr.innerStride();
     }
 
     /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
       return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index 11dc86d..4dad13e 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -60,8 +60,10 @@
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const
     { static_cast<const Derived*>(this)->evalTo(dst); }
-    EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
@@ -90,7 +92,7 @@
 // Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
 // when a ReturnByValue expression is assigned, the evaluator is not constructed.
 // TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
-  
+
 template<typename Derived>
 struct evaluator<ReturnByValue<Derived> >
   : public evaluator<typename internal::traits<Derived>::ReturnType>
@@ -98,7 +100,7 @@
   typedef ReturnByValue<Derived> XprType;
   typedef typename internal::traits<Derived>::ReturnType PlainObject;
   typedef evaluator<PlainObject> Base;
-  
+
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
     : m_result(xpr.rows(), xpr.cols())
   {
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 8530939..28cdd76 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -12,7 +12,7 @@
 #ifndef EIGEN_REVERSE_H
 #define EIGEN_REVERSE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -44,7 +44,7 @@
   static inline PacketType run(const PacketType& x) { return x; }
 };
 
-} // end namespace internal 
+} // end namespace internal
 
 /** \class Reverse
   * \ingroup Core_Module
@@ -89,8 +89,10 @@
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
@@ -98,7 +100,7 @@
     }
 
     EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() const 
+    nestedExpression() const
     {
       return m_matrix;
     }
@@ -161,7 +163,7 @@
 }
 
 namespace internal {
-  
+
 template<int Direction>
 struct vectorwise_reverse_inplace_impl;
 
diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h
index 7002f04..7c86bf8 100644
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELECT_H
 #define EIGEN_SELECT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Select
   * \ingroup Core_Module
@@ -67,8 +67,10 @@
       eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
     }
 
-    inline EIGEN_DEVICE_FUNC Index rows() const { return m_condition.rows(); }
-    inline EIGEN_DEVICE_FUNC Index cols() const { return m_condition.cols(); }
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); }
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); }
 
     inline EIGEN_DEVICE_FUNC
     const Scalar coeff(Index i, Index j) const
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 2173799..b7ed6f1 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELFADJOINTMATRIX_H
 #define EIGEN_SELFADJOINTMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class SelfAdjointView
   * \ingroup Core_Module
@@ -58,7 +58,7 @@
     typedef MatrixTypeNestedCleaned NestedExpression;
 
     /** \brief The type of coefficients in this matrix */
-    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
+    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
     typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
     typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
@@ -76,14 +76,14 @@
       EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
     }
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_matrix.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); }
 
     /** \sa MatrixBase::coeff()
       * \warning the coordinates must fit into the referenced triangular part
@@ -132,7 +132,7 @@
     {
       return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
     }
-    
+
     friend EIGEN_DEVICE_FUNC
     const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
     operator*(const Scalar& s, const SelfAdjointView& mat)
@@ -300,17 +300,17 @@
   using Base::m_src;
   using Base::m_functor;
 public:
-  
+
   typedef typename Base::DstEvaluatorType DstEvaluatorType;
   typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
   typedef typename Base::Scalar Scalar;
   typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
+
+
   EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
-  
+
   EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
     eigen_internal_assert(row!=col);
@@ -318,12 +318,12 @@
     m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
     m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
   }
-  
+
   EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
     Base::assignCoeff(id,id);
   }
-  
+
   EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
   { eigen_internal_assert(false && "should never be called"); }
 };
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index ec4b4a9..23d5cb7 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -13,7 +13,7 @@
 namespace Eigen {
 
 template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
-  
+
 /** \class Solve
   * \ingroup Core_Module
   *
@@ -64,13 +64,13 @@
 public:
   typedef typename internal::traits<Solve>::PlainObject PlainObject;
   typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
-  
+
   Solve(const Decomposition &dec, const RhsType &rhs)
     : m_dec(dec), m_rhs(rhs)
   {}
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
   EIGEN_DEVICE_FUNC const RhsType&       rhs() const { return m_rhs; }
@@ -87,14 +87,14 @@
   : public MatrixBase<Solve<Decomposition,RhsType> >
 {
   typedef Solve<Decomposition,RhsType> Derived;
-  
+
 public:
-  
+
   typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
 
 private:
-  
+
   Scalar coeff(Index row, Index col) const;
   Scalar coeff(Index i) const;
 };
@@ -119,15 +119,15 @@
   typedef evaluator<PlainObject> Base;
 
   enum { Flags = Base::Flags | EvalBeforeNestingBit };
-  
+
   EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
     : m_result(solve.rows(), solve.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
     solve.dec()._solve_impl(solve.rhs(), m_result);
   }
-  
-protected:  
+
+protected:
   PlainObject m_result;
 };
 
@@ -176,7 +176,7 @@
     Index dstCols = src.cols();
     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
       dst.resize(dstRows, dstCols);
-    
+
     src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
   }
 };
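
For context, Solve<> is the lazy expression returned by a decomposition's solve(); the evaluator changed above is what triggers _solve_impl() when the expression is assigned. A short usage sketch with standard Eigen API, not introduced by this patch:

#include <Eigen/Dense>
using namespace Eigen;

int main() {
  MatrixXd A = MatrixXd::Random(4, 4);
  VectorXd b = VectorXd::Random(4);
  // partialPivLu().solve(b) builds a Solve<> expression whose rows() come from
  // the decomposition; it is evaluated on assignment to x.
  VectorXd x = A.partialPivLu().solve(b);
  return (A * x - b).norm() < 1e-8 ? 0 : 1;
}
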
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index dc53b5e..3879444 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -213,8 +213,8 @@
     : m_triangularMatrix(tri), m_rhs(rhs)
   {}
 
-  inline Index rows() const { return m_rhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); }
+  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   template<typename Dest> inline void evalTo(Dest& dst) const
   {
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index 04fe0b3..4a3f0cc 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -134,9 +134,9 @@
   // statements can be replaced
   static const int ibeta = std::numeric_limits<RealScalar>::radix;  // base for floating-point numbers
   static const int it    = NumTraits<RealScalar>::digits();  // number of base-beta digits in mantissa
-  static const int iemin = std::numeric_limits<RealScalar>::min_exponent;  // minimum exponent
-  static const int iemax = std::numeric_limits<RealScalar>::max_exponent; // maximum exponent
-  static const RealScalar rbig   = (std::numeric_limits<RealScalar>::max)();  // largest floating-point number
+  static const int iemin = NumTraits<RealScalar>::min_exponent();  // minimum exponent
+  static const int iemax = NumTraits<RealScalar>::max_exponent();  // maximum exponent
+  static const RealScalar rbig   = NumTraits<RealScalar>::highest();  // largest floating-point number
   static const RealScalar b1     = RealScalar(pow(RealScalar(ibeta),RealScalar(-((1-iemin)/2))));  // lower boundary of midrange
   static const RealScalar b2     = RealScalar(pow(RealScalar(ibeta),RealScalar((iemax + 1 - it)/2)));  // upper boundary of midrange
   static const RealScalar s1m    = RealScalar(pow(RealScalar(ibeta),RealScalar((2-iemin)/2)));  // scaling factor for lower range
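
The StableNorm change swaps std::numeric_limits queries for their NumTraits counterparts so that custom scalar types can provide them by specializing NumTraits only. A small sketch of the correspondence, assuming the NumTraits members the patched code relies on (min_exponent(), max_exponent(), highest()) are available, as the new code implies:

#include <cassert>
#include <limits>
#include <Eigen/Core>

int main() {
  using Eigen::NumTraits;
  // For builtin doubles the two sides agree; a user-defined scalar would
  // specialize NumTraits<MyScalar> instead of std::numeric_limits.
  assert(NumTraits<double>::min_exponent() == std::numeric_limits<double>::min_exponent);
  assert(NumTraits<double>::max_exponent() == std::numeric_limits<double>::max_exponent);
  assert(NumTraits<double>::highest()      == (std::numeric_limits<double>::max)());
  return 0;
}
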
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index cbcb0a5..6494d51 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_STRIDE_H
 #define EIGEN_STRIDE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Stride
   * \ingroup Core_Module
@@ -26,7 +26,7 @@
   *
   * The outer stride is the pointer increment between two consecutive rows of a row-major matrix or
   * between two consecutive columns of a column-major matrix.
-  * 
+  *
   * These two values can be passed either at compile-time as template parameters, or at runtime as
   * arguments to the constructor.
   *
@@ -41,7 +41,7 @@
   * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime
   * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
   * not allowed).
-  * 
+  *
   * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
   */
 template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
@@ -78,10 +78,10 @@
     {}
 
     /** \returns the outer stride */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outer() const { return m_outer.value(); }
     /** \returns the inner stride */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index inner() const { return m_inner.value(); }
 
   protected:
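
Stride's outer()/inner() getters gain EIGEN_CONSTEXPR here; for readers unfamiliar with the class, a brief usage sketch of mapping external data with a custom stride (standard Eigen API, unrelated to the change itself):

#include <Eigen/Core>
using namespace Eigen;

int main() {
  // Eight floats laid out with an inner stride of 2: every other element forms a 4-vector.
  float data[8] = {0, 10, 1, 11, 2, 12, 3, 13};
  Map<VectorXf, 0, InnerStride<2> > v(data, 4);
  // v is [0, 1, 2, 3]; the Stride classes only describe pointer increments.
  return v.size() == 4 ? 0 : 1;
}
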
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 49804b0..2bc658f 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_TRANSPOSE_H
 #define EIGEN_TRANSPOSE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename MatrixType>
@@ -65,10 +65,10 @@
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index rows() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index cols() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
 
     /** \returns the nested expression */
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -336,7 +336,7 @@
   * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
   * If you just need the transpose of a matrix, use transpose().
   *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
   * This excludes (non-square) fixed-size matrices, block-expressions and maps.
   *
   * \sa transpose(), adjoint(), adjointInPlace() */
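
The note above about in-place transposition is easy to trip over in practice; a brief illustration of the aliasing issue it addresses (well-known Eigen behavior, not part of the whitespace fix):

#include <Eigen/Core>
using namespace Eigen;

int main() {
  MatrixXi m(2, 2);
  m << 1, 2,
       3, 4;
  // m = m.transpose();        // aliasing hazard: reads and writes overlap
  m = m.transpose().eval();    // safe: evaluate into a temporary first
  m.transposeInPlace();        // or transpose in place, as the doc suggests
  return 0;
}
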
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index f323a2b..38a7b01 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -349,12 +349,12 @@
 
     explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
 
-    EIGEN_DEVICE_FUNC
-    Index size() const { return m_transpositions.size(); }
-    EIGEN_DEVICE_FUNC
-    Index rows() const { return m_transpositions.size(); }
-    EIGEN_DEVICE_FUNC
-    Index cols() const { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
 
     /** \returns the \a matrix with the inverse transpositions applied to the columns.
       */
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 5e2f2de..779152f 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -11,12 +11,12 @@
 #ifndef EIGEN_TRIANGULARMATRIX_H
 #define EIGEN_TRIANGULARMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-  
+
 template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
-  
+
 }
 
 /** \class TriangularBase
@@ -34,16 +34,16 @@
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
       MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
-      
+
       SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
                                                    internal::traits<Derived>::ColsAtCompileTime>::ret),
       /**< This is equal to the number of coefficients, i.e. the number of
           * rows times the number of columns, or to \a Dynamic if this is not
           * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
-      
+
       MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
                                                    internal::traits<Derived>::MaxColsAtCompileTime>::ret)
-        
+
     };
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
@@ -55,15 +55,15 @@
     EIGEN_DEVICE_FUNC
     inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return derived().rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return derived().cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return derived().outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return derived().innerStride(); }
-    
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+
     // dummy resize function
     EIGEN_DEVICE_FUNC
     void resize(Index rows, Index cols)
@@ -156,7 +156,7 @@
   * \param MatrixType the type of the object in which we are taking the triangular part
   * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
   *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
-  *             This is in fact a bit field; it must have either #Upper or #Lower, 
+  *             This is in fact a bit field; it must have either #Upper or #Lower,
   *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
   *
   * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
@@ -199,7 +199,7 @@
 
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
     typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
-    
+
   public:
 
     typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
@@ -218,15 +218,15 @@
     EIGEN_DEVICE_FUNC
     explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
     {}
-    
+
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
 
     /** \copydoc EigenBase::rows() */
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
     /** \copydoc EigenBase::cols() */
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     /** \returns a const reference to the nested expression */
     EIGEN_DEVICE_FUNC
@@ -235,7 +235,7 @@
     /** \returns a reference to the nested expression */
     EIGEN_DEVICE_FUNC
     NestedExpression& nestedExpression() { return m_matrix; }
-    
+
     typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
     /** \sa MatrixBase::conjugate() const */
     EIGEN_DEVICE_FUNC
@@ -269,7 +269,7 @@
       typename MatrixType::TransposeReturnType tmp(m_matrix);
       return TransposeReturnType(tmp);
     }
-    
+
     typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
     /** \sa MatrixBase::transpose() const */
     EIGEN_DEVICE_FUNC
@@ -280,10 +280,10 @@
 
     template<typename Other>
     EIGEN_DEVICE_FUNC
-    inline const Solve<TriangularView, Other> 
+    inline const Solve<TriangularView, Other>
     solve(const MatrixBase<Other>& other) const
     { return Solve<TriangularView, Other>(*this, other.derived()); }
-    
+
   // workaround MSVC ICE
   #if EIGEN_COMP_MSVC
     template<int Side, typename Other>
@@ -327,7 +327,7 @@
       else
         return m_matrix.diagonal().prod();
     }
-      
+
   protected:
 
     MatrixTypeNested m_matrix;
@@ -389,7 +389,7 @@
       internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
       return derived();
     }
-    
+
     /** \sa MatrixBase::operator*=() */
     EIGEN_DEVICE_FUNC
     TriangularViewType&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
@@ -715,7 +715,7 @@
 
 namespace internal {
 
-  
+
 // TODO currently a triangular expression has the form TriangularView<.,.>
 //      in the future triangular-ness should be defined by the expression traits
 //      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
@@ -744,7 +744,7 @@
 
 template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;
 
- 
+
 /** \internal Specialization of the dense assignment kernel for triangular matrices.
   * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
   * \tparam UpLo must be either Lower or Upper
@@ -761,17 +761,17 @@
   using Base::m_src;
   using Base::m_functor;
 public:
-  
+
   typedef typename Base::DstEvaluatorType DstEvaluatorType;
   typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
   typedef typename Base::Scalar Scalar;
   typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
+
+
   EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
-  
+
 #ifdef EIGEN_INTERNAL_DEBUGGING
   EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
@@ -781,16 +781,16 @@
 #else
   using Base::assignCoeff;
 #endif
-  
+
   EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
          if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
     else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
     else if(Mode==0)                       Base::assignCoeff(id,id);
   }
-  
+
   EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
-  { 
+  {
     eigen_internal_assert(row!=col);
     if(SetOpposite)
       m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
@@ -811,17 +811,17 @@
   if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
     dst.resize(dstRows, dstCols);
   DstEvaluatorType dstEvaluator(dst);
-    
+
   typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
                                               DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
   Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
+
   enum {
       unroll = DstXprType::SizeAtCompileTime != Dynamic
             && SrcEvaluatorType::CoeffReadCost < HugeCost
             && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
     };
-  
+
   triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
 }
 
@@ -843,8 +843,8 @@
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
     eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
-    
-    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
   }
 };
 
@@ -853,7 +853,7 @@
 {
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
-    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);  
+    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);
   }
 };
 
@@ -862,7 +862,7 @@
 {
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
-    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
   }
 };
 
@@ -873,19 +873,19 @@
   // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
   typedef typename DstEvaluatorType::XprType DstXprType;
-  
+
   enum {
     col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
     row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
   };
-  
+
   typedef typename Kernel::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
   static inline void run(Kernel &kernel)
   {
     triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
-    
+
     if(row==col)
       kernel.assignDiagonalCoeff(row);
     else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
@@ -928,10 +928,10 @@
       }
       else
         i = maxi;
-      
+
       if(i<kernel.rows()) // then i==j
         kernel.assignDiagonalCoeff(i++);
-      
+
       if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
       {
         for(; i < kernel.rows(); ++i)
@@ -955,7 +955,7 @@
 }
 
 namespace internal {
-  
+
 // Triangular = Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
@@ -968,7 +968,7 @@
     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
       dst.resize(dstRows, dstCols);
 
-    dst._assignProduct(src, 1, 0);
+    dst._assignProduct(src, Scalar(1), false);
   }
 };
 
@@ -979,7 +979,7 @@
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
   {
-    dst._assignProduct(src, 1, 1);
+    dst._assignProduct(src, Scalar(1), true);
   }
 };
 
@@ -990,7 +990,7 @@
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
   {
-    dst._assignProduct(src, -1, 1);
+    dst._assignProduct(src, Scalar(-1), true);
   }
 };
 
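The last three hunks of this file replace the integer literals passed to _assignProduct() with an explicit Scalar cast and a bool flag. A hedged sketch of why that matters for custom scalar types; the helper below is hypothetical and only mimics the shape of those call sites:

#include <cstdio>

// Hypothetical scalar with only explicit construction from int, standing in
// for custom types (e.g. multiprecision scalars) used with Eigen.
struct MyScalar {
  double v;
  explicit MyScalar(int i) : v(i) {}
};

// Hypothetical helper mirroring the changed call sites.
template <typename Scalar>
void assign_product(const Scalar& alpha, bool accumulate) {
  std::printf("alpha=%g accumulate=%d\n", alpha.v, int(accumulate));
}

int main() {
  // assign_product<MyScalar>(1, 0);    // would not compile: no implicit int -> MyScalar
  assign_product(MyScalar(1), false);   // explicit Scalar(1) and a bool, as in the patch
  assign_product(MyScalar(-1), true);
  return 0;
}
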
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 91a6c03..870f4f1 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -65,10 +65,10 @@
     explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
       : m_matrix(mat), m_functor(func) {}
 
-    EIGEN_DEVICE_FUNC
-    Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
-    EIGEN_DEVICE_FUNC
-    Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
 
     EIGEN_DEVICE_FUNC
     typename MatrixType::Nested nestedExpression() const { return m_matrix; }
@@ -134,7 +134,7 @@
   typedef typename result_of<
                      BinaryOp(const Scalar&,const Scalar&)
                    >::type  result_type;
-  
+
   enum { Vectorizable = functor_traits<BinaryOp>::PacketAccess };
   template<int Size> struct Cost { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
   EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
@@ -162,17 +162,17 @@
   * where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each
   * column of `A` and then re-assemble the outputs in a matrix expression:
   * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode
-  * 
+  *
   * Example: \include MatrixBase_colwise.cpp
   * Output: \verbinclude MatrixBase_colwise.out
   *
   * The begin() and end() methods are obviously exceptions to the previous rule as they
   * return STL-compatible begin/end iterators to the rows or columns of the nested expression.
   * Typical use cases include for-range-loop and calls to STL algorithms:
-  * 
+  *
   * Example: \include MatrixBase_colwise_iterator_cxx11.cpp
   * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
-  * 
+  *
   * For a partial reduction on an empty input, some rules apply.
   * For the sake of clarity, let's consider a vertical reduction:
   *   - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
@@ -180,7 +180,7 @@
   *       - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
   *       - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
   *       - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op))
-  * 
+  *
   * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
   */
 template<typename ExpressionType, int Direction> class VectorwiseOp
@@ -216,7 +216,7 @@
     };
 
   protected:
-  
+
     template<typename OtherDerived> struct ExtendedType {
       typedef Replicate<OtherDerived,
                         isVertical   ? 1 : ExpressionType::RowsAtCompileTime,
@@ -328,7 +328,7 @@
       *
       * \warning the size along the reduction direction must be strictly positive,
       *          otherwise an assertion is triggered.
-      * 
+      *
       * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
       */
     template<typename BinaryOp>
@@ -365,7 +365,7 @@
       *
       * \warning the size along the reduction direction must be strictly positive,
       *          otherwise an assertion is triggered.
-      * 
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_minCoeff.cpp
@@ -384,7 +384,7 @@
       *
       * \warning the size along the reduction direction must be strictly positive,
       *          otherwise an assertion is triggered.
-      * 
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_maxCoeff.cpp
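
The doc block above spells out the rules for partial reductions on empty inputs; a small example of the vertical-reduction cases it lists (standard documented behavior, shown here only for illustration):

#include <iostream>
#include <Eigen/Core>
using namespace Eigen;

int main() {
  MatrixXd a(0, 3);                        // zero rows, three columns
  std::cout << a.colwise().sum()  << "\n"; // 1x3 row of zeros (sum-like reduction)
  std::cout << a.colwise().prod() << "\n"; // 1x3 row of ones  (product reduction)
  // a.colwise().maxCoeff() would trigger an assertion on this empty input.
  return 0;
}
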
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 67a69c5..07a2e42 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_VISITOR_H
 #define EIGEN_VISITOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -70,22 +70,22 @@
 public:
   EIGEN_DEVICE_FUNC
   explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
-  
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  
+
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
     CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
   };
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); }
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   { return m_evaluator.coeff(row, col); }
-  
+
 protected:
   internal::evaluator<XprType> m_evaluator;
   const XprType &m_xpr;
@@ -106,7 +106,7 @@
   *
   * \note compared to one or two \em for \em loops, visitors offer automatic
   * unrolling for small fixed size matrix.
-  * 
+  *
   * \note if the matrix is empty, then the visitor is left unchanged.
   *
   * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
@@ -118,10 +118,10 @@
 {
   if(size()==0)
     return;
-  
+
   typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
-  
+
   enum {
     unroll =  SizeAtCompileTime != Dynamic
            && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost <= EIGEN_UNROLLING_LIMIT
@@ -157,7 +157,7 @@
   *
   * \sa DenseBase::minCoeff(Index*, Index*)
   */
-template <typename Derived>
+template <typename Derived, int NaNPropagation>
 struct min_coeff_visitor : coeff_visitor<Derived>
 {
   typedef typename Derived::Scalar Scalar;
@@ -173,8 +173,40 @@
   }
 };
 
-template<typename Scalar>
-struct functor_traits<min_coeff_visitor<Scalar> > {
+template <typename Derived>
+struct min_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res))
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct min_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(value) || value < this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar, int NaNPropagation>
+struct functor_traits<min_coeff_visitor<Scalar, NaNPropagation> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost
   };
@@ -185,10 +217,10 @@
   *
   * \sa DenseBase::maxCoeff(Index*, Index*)
   */
-template <typename Derived>
+template <typename Derived, int NaNPropagation>
 struct max_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Scalar Scalar; 
+  typedef typename Derived::Scalar Scalar;
   EIGEN_DEVICE_FUNC
   void operator() (const Scalar& value, Index i, Index j)
   {
@@ -201,8 +233,40 @@
   }
 };
 
-template<typename Scalar>
-struct functor_traits<max_coeff_visitor<Scalar> > {
+template <typename Derived>
+struct max_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res))
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct max_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(value) || value > this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar, int NaNPropagation>
+struct functor_traits<max_coeff_visitor<Scalar, NaNPropagation> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost
   };
@@ -212,22 +276,24 @@
 
 /** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
   * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
-  * 
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
   * \warning the matrix must be not empty, otherwise an assertion is triggered.
-  * 
-  * \warning the result is undefined if \c *this contains NaN.
   *
   * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
   eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
 
-  internal::min_coeff_visitor<Derived> minVisitor;
+  internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
   this->visit(minVisitor);
   *rowId = minVisitor.row;
   if (colId) *colId = minVisitor.col;
@@ -235,15 +301,17 @@
 }
 
 /** \returns the minimum of all coefficients of *this and puts in *index its location.
-  * 
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
   * \warning the matrix must be not empty, otherwise an assertion is triggered.
-  * 
-  * \warning the result is undefined if \c *this contains NaN. 
   *
   * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
@@ -251,7 +319,7 @@
   eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
 
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::min_coeff_visitor<Derived> minVisitor;
+  internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
   this->visit(minVisitor);
   *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
   return minVisitor.res;
@@ -259,22 +327,24 @@
 
 /** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
   * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
-  * 
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
   * \warning the matrix must be not empty, otherwise an assertion is triggered.
-  * 
-  * \warning the result is undefined if \c *this contains NaN. 
   *
   * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
   eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
 
-  internal::max_coeff_visitor<Derived> maxVisitor;
+  internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
   this->visit(maxVisitor);
   *rowPtr = maxVisitor.row;
   if (colPtr) *colPtr = maxVisitor.col;
@@ -282,15 +352,17 @@
 }
 
 /** \returns the maximum of all coefficients of *this and puts in *index its location.
-  * 
-  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   *
-  * \warning the result is undefined if \c *this contains NaN.
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   *
   * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
@@ -298,7 +370,7 @@
   eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
 
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::max_coeff_visitor<Derived> maxVisitor;
+  internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
   this->visit(maxVisitor);
   *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row;
   return maxVisitor.res;
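
The Visitor.h changes thread a NaNPropagation template parameter through the coefficient visitors and minCoeff/maxCoeff. A usage sketch of the documented options, assuming the PropagateFast/PropagateNaN/PropagateNumbers enumerators referenced by the new code are visible in the Eigen namespace:

#include <iostream>
#include <limits>
#include <Eigen/Core>
using namespace Eigen;

int main() {
  Matrix2f m;
  m << 1.f, std::numeric_limits<float>::quiet_NaN(),
       3.f, 4.f;
  Index r, c;
  // PropagateNumbers: NaNs are skipped, the largest finite entry wins.
  float a = m.maxCoeff<PropagateNumbers>(&r, &c);   // a == 4, (r,c) == (1,1)
  // PropagateNaN: any NaN makes the result NaN.
  float b = m.maxCoeff<PropagateNaN>(&r, &c);       // b is NaN
  std::cout << a << " " << b << "\n";
  return 0;
}
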
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index d21e139..c6cb59e 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -31,6 +31,52 @@
 {
   EIGEN_STRONG_INLINE explicit Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
+  {
+    Packet4f v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+    // Get the imaginary parts of a
+    v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+    // multiply a_re * b
+    v1 = vec_madd(v1, b.v, p4f_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(v2, b.v, p4f_ZERO);
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+    // permute back to a proper order
+    v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
+
+    return Packet2cf(padd<Packet4f>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    v = pmul(Packet2cf(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(vec_neg(v));
+  }
+
   Packet4f  v;
 };
 
@@ -81,6 +127,25 @@
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
 
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>* from0, const std::complex<float>* from1)
+{
+  Packet4f res0, res1;
+#ifdef __VSX__
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0));
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1));
+#ifdef _BIG_ENDIAN
+  __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+#else
+  __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+#endif
+#else
+  *reinterpret_cast<std::complex<float> *>(&res0) = *from0;
+  *reinterpret_cast<std::complex<float> *>(&res1) = *from1;
+  res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
+#endif
+  return Packet2cf(res0);
+}
+
 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
   EIGEN_ALIGN16 std::complex<float> af[2];
@@ -101,25 +166,6 @@
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet4f v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
-  // Get the imaginary parts of a
-  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
-  // multiply a_re * b 
-  v1 = vec_madd(v1, b.v, p4f_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
-  // permute back to a proper order
-  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
-  
-  return Packet2cf(padd<Packet4f>(v1, v2));
-}
-
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
@@ -239,6 +285,51 @@
 {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
+  {
+    Packet2d a_re, a_im, v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+    // Get the imaginary parts of a
+    a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+    // multiply a_re * b
+    v1 = vec_madd(a_re, b.v, p2d_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(a_im, b.v, p2d_ZERO);
+    v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
+
+    return Packet1cd(padd<Packet2d>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    v = pmul(Packet1cd(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(vec_neg(v));
+  }
+
   Packet2d v;
 };
 
@@ -290,24 +381,6 @@
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  Packet2d a_re, a_im, v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
-  // Get the imaginary parts of a
-  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
-  // multiply a_re * b
-  v1 = vec_madd(a_re, b.v, p2d_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
-  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
-
-  return Packet1cd(padd<Packet2d>(v1, v2));
-}
-
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
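
This Complex.h change moves pmul into the packet structs and layers the arithmetic operators on top of it. A stripped-down sketch of that pattern on a plain struct — no AltiVec intrinsics, purely illustrative:

#include <cstdio>

struct Packet {
  float v;
  // Core kernel, analogous to the member pmul() added above.
  Packet mul(const Packet& a, const Packet& b) const { return Packet{a.v * b.v}; }

  // Compound assignment reuses the kernel...
  Packet& operator*=(const Packet& b) { v = mul(*this, b).v; return *this; }
  // ...and the binary operator reuses compound assignment on a copy.
  Packet operator*(const Packet& b) const { return Packet(*this) *= b; }
};

int main() {
  Packet a{2.f}, b{3.f};
  Packet c = a * b;                       // c.v == 6
  std::printf("%g\n", double(c.v));
  return 0;
}
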
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 03d474a..e3ba061 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,6 +13,7 @@
 
 #include "MatrixProductCommon.h"
 
+// Since LLVM doesn't support dynamic dispatching, always force either MMA or VSX
 #if EIGEN_COMP_LLVM
 #if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY)
 #ifdef __MMA__
@@ -34,10 +36,8 @@
 
 /**************************************************************************************************
  * TODO                                                                                           *
- * - Check StorageOrder on lhs_pack (the innermost second loop seems unvectorized when it could). *
+ * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could). *
  * - Check the possibility of transposing as GETREAL and GETIMAG when needed.                     *
- * - Check if change conjugation to xor instead of mul gains any performance.                     *
- * - Remove IsComplex template argument from complex packing.                                     *
  **************************************************************************************************/
 namespace Eigen {
 
@@ -46,13 +46,11 @@
 /**************************
  * Constants and typedefs *
  **************************/
-const int QuadRegisterCount = 8;
-
 template<typename Scalar>
 struct quad_traits
 {
   typedef typename packet_traits<Scalar>::type    vectortype;
-  typedef PacketBlock<vectortype, 4>                    type;
+  typedef PacketBlock<vectortype,4>                     type;
   typedef vectortype                                 rhstype;
   enum
   {
@@ -66,7 +64,7 @@
 struct quad_traits<double>
 {
   typedef Packet2d                        vectortype;
-  typedef PacketBlock<vectortype, 4>            type;
+  typedef PacketBlock<vectortype,4>             type;
   typedef PacketBlock<Packet2d,2>            rhstype;
   enum
   {
@@ -79,9 +77,6 @@
 // MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector, this turned out
 // to be faster than Eigen's usual approach of having real/imaginary pairs on a single vector. This constants then
 // are responsible to extract from convert between Eigen's and MatrixProduct approach.
-const static Packet4f p4f_CONJUGATE = {float(-1.0), float(-1.0), float(-1.0), float(-1.0)};
-
-const static Packet2d p2d_CONJUGATE = {-1.0, -1.0};
 
 const static Packet16uc p16uc_GETREAL32 = {  0,  1,  2,  3,
                                              8,  9, 10, 11,
@@ -109,7 +104,7 @@
  * conjugated. There's no PanelMode available for symm packing.
  *
  * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using 
- * it's respective rank-update instructions. The float32/64 versions are different because at this moment
+ * its respective rank-update instructions. The float32/64 versions are different because at this moment
  * the size of the accumulator is fixed at 512-bits so you can't have a 4x4 accumulator of 64-bit elements.
  * 
  * As mentioned earlier MatrixProduct breaks complex numbers into a real vector and a complex vector so packing has
@@ -123,156 +118,148 @@
   std::complex<Scalar> v;
   if(i < j)
   {
-    v.real(dt(j,i).real());
+    v.real( dt(j,i).real());
     v.imag(-dt(j,i).imag());
   } else if(i > j)
   {
-    v.real(dt(i,j).real());
-    v.imag(dt(i,j).imag());
+    v.real( dt(i,j).real());
+    v.imag( dt(i,j).imag());
   } else {
-    v.real(dt(i,j).real());
+    v.real( dt(i,j).real());
     v.imag((Scalar)0.0);
   }
   return v;
 }
 
 template<typename Scalar, typename Index, int StorageOrder, int N>
-EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar> *blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
 {
   const Index depth = k2 + rows;
   const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
-  const int vectorSize = N*quad_traits<Scalar>::vectorsize;
+  const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
+  const Index vectorDelta = vectorSize * rows;
   Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);
 
-  Index ri = 0, j = 0;
+  Index rir = 0, rii, j = 0;
   for(; j + vectorSize <= cols; j+=vectorSize)
   {
-      Index i = k2;
-      for(; i < depth; i++)
-      {
-        for(Index k = 0; k < vectorSize; k++)
-        {
-          std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
-          blockBf[ri + k] = v.real();
-        }
-        ri += vectorSize;
-      }
+    rii = rir + vectorDelta;
 
-      i = k2;
-
-      for(; i < depth; i++)
+    for(Index i = k2; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
       {
-        for(Index k = 0; k < vectorSize; k++)
-        {
-          std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
-          blockBf[ri + k] = v.imag();
-        }
-        ri += vectorSize;
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
+
+        blockBf[rir + k] = v.real();
+        blockBf[rii + k] = v.imag();
       }
+      rir += vectorSize;
+      rii += vectorSize;
+    }
+
+    rir += vectorDelta;
   }
-  for(Index i = k2; i < depth; i++)
+  if (j < cols)
   {
+    rii = rir + ((cols - j) * rows);
+
+    for(Index i = k2; i < depth; i++)
+    {
       Index k = j;
       for(; k < cols; k++)
       {
         std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, k, rhs);
-        blockBf[ri] = v.real();
-        ri += 1;
+
+        blockBf[rir] = v.real();
+        blockBf[rii] = v.imag();
+
+        rir += 1;
+        rii += 1;
       }
-  }
-  for(Index i = k2; i < depth; i++)
-  {
-      Index k = j;
-      for(; k < cols; k++)
-      {
-        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, k, rhs);
-        blockBf[ri] = v.imag();
-        ri += 1;
-      }
+    }
   }
 }
 
 template<typename Scalar, typename Index, int StorageOrder>
-EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar> *blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
+EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
 {
   const Index depth = cols;
   const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
-  const int vectorSize = quad_traits<Scalar>::vectorsize;
-  Index ri = 0, j = 0;
-  Scalar *blockAf = (Scalar *)(blockA);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+  const Index vectorDelta = vectorSize * depth;
+  Scalar* blockAf = (Scalar *)(blockA);
 
+  Index rir = 0, rii, j = 0;
   for(; j + vectorSize <= rows; j+=vectorSize)
   {
-      Index i = 0;
+    rii = rir + vectorDelta;
 
-      for(; i < depth; i++)
+    for(Index i = 0; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
       {
-        for(int k = 0; k < vectorSize; k++)
-        {
-          std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
-          blockAf[ri + k] = v.real();
-        }
-        ri += vectorSize;
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
+
+        blockAf[rir + k] = v.real();
+        blockAf[rii + k] = v.imag();
       }
-      i = 0;
-      for(; i < depth; i++)
-      {
-        for(int k = 0; k < vectorSize; k++)
-        {
-          std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
-          blockAf[ri + k] = v.imag();
-        }
-        ri += vectorSize;
-      }
+      rir += vectorSize;
+      rii += vectorSize;
+    }
+
+    rir += vectorDelta;
   }
 
-  for(Index i = 0; i < depth; i++)
+  if (j < rows)
   {
+    rii = rir + ((rows - j) * depth);
+
+    for(Index i = 0; i < depth; i++)
+    {
       Index k = j;
       for(; k < rows; k++)
       {
-          std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
-          blockAf[ri] = v.real();
-          ri += 1;
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
+
+        blockAf[rir] = v.real();
+        blockAf[rii] = v.imag();
+
+        rir += 1;
+        rii += 1;
       }
-  }
-  for(Index i = 0; i < depth; i++)
-  {
-      Index k = j;
-      for(; k < rows; k++)
-      {
-          std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
-          blockAf[ri] = v.imag();
-          ri += 1;
-      }
+    }
   }
 }
 
 template<typename Scalar, typename Index, int StorageOrder, int N>
-EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar *blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
 {
   const Index depth = k2 + rows;
   const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
-  const int vectorSize = quad_traits<Scalar>::vectorsize;
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
 
   Index ri = 0, j = 0;
   for(; j + N*vectorSize <= cols; j+=N*vectorSize)
   {
-      Index i = k2;
-      for(; i < depth; i++)
+    Index i = k2;
+    for(; i < depth; i++)
+    {
+      for(Index k = 0; k < N*vectorSize; k++)
       {
-        for(int k = 0; k < N*vectorSize; k++)
-        {
-          if(i <= j+k)
-            blockB[ri + k] = rhs(j+k, i);
-          else
-            blockB[ri + k] = rhs(i, j+k);
-        }
-        ri += N*vectorSize;
+        if(i <= j+k)
+          blockB[ri + k] = rhs(j+k, i);
+        else
+          blockB[ri + k] = rhs(i, j+k);
       }
+      ri += N*vectorSize;
+    }
   }
-  for(Index i = k2; i < depth; i++)
+
+  if (j < cols)
   {
+    for(Index i = k2; i < depth; i++)
+    {
       Index k = j;
       for(; k < cols; k++)
       {
@@ -282,45 +269,49 @@
           blockB[ri] = rhs(k, i);
         ri += 1;
       }
+    }
   }
 }
 
 template<typename Scalar, typename Index, int StorageOrder>
-EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar *blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
 {
   const Index depth = cols;
   const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
-  const int vectorSize = quad_traits<Scalar>::vectorsize;
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+
   Index ri = 0, j = 0;
-
-  for(j = 0; j + vectorSize <= rows; j+=vectorSize)
+  for(; j + vectorSize <= rows; j+=vectorSize)
   {
-      Index i = 0;
+    Index i = 0;
 
-      for(; i < depth; i++)
+    for(; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
       {
-        for(int k = 0; k < vectorSize; k++)
-        {
-          if(i <= j+k)
-            blockA[ri + k] = lhs(j+k, i);
-          else
-            blockA[ri + k] = lhs(i, j+k);
-        }
-        ri += vectorSize;
+        if(i <= j+k)
+          blockA[ri + k] = lhs(j+k, i);
+        else
+          blockA[ri + k] = lhs(i, j+k);
       }
+      ri += vectorSize;
+    }
   }
 
-  for(Index i = 0; i < depth; i++)
+  if (j < rows)
   {
+    for(Index i = 0; i < depth; i++)
+    {
       Index k = j;
       for(; k < rows; k++)
       {
-          if(i <= k)
-            blockA[ri] = lhs(k, i);
-          else
-            blockA[ri] = lhs(i, k);
-          ri += 1;
+        if(i <= k)
+          blockA[ri] = lhs(k, i);
+        else
+          blockA[ri] = lhs(i, k);
+        ri += 1;
       }
+    }
   }
 }
 
@@ -338,7 +329,7 @@
 {
   void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
   {
-   symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+    symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
@@ -368,7 +359,7 @@
 {
   void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
   {
-   symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+    symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
@@ -377,7 +368,7 @@
 {
   void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
   {
-   symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+    symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
@@ -396,7 +387,7 @@
 {
   void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
   {
-   symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+    symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
@@ -411,517 +402,176 @@
  * and offset and behaves accordingly.
  **/
 
-// General template for lhs complex packing.
-template<typename Scalar, bool IsComplex, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct lhs_cpack {
+template<typename Scalar, typename Packet, typename Index>
+EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
+{
+  const Index size = 16 / sizeof(Scalar);
+  pstore<Scalar>(to + (0 * size), block.packet[0]);
+  pstore<Scalar>(to + (1 * size), block.packet[1]);
+  pstore<Scalar>(to + (2 * size), block.packet[2]);
+  pstore<Scalar>(to + (3 * size), block.packet[3]);
+}
+
+template<typename Scalar, typename Packet, typename Index>
+EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
+{
+  const Index size = 16 / sizeof(Scalar);
+  pstore<Scalar>(to + (0 * size), block.packet[0]);
+  pstore<Scalar>(to + (1 * size), block.packet[1]);
+}
+
+// General template for lhs & rhs complex packing.
+template<typename Scalar, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
+struct dhs_cpack {
   EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
-    const int vectorSize = quad_traits<Scalar>::vectorsize;
-    Index ri = 0, j = 0;
-    Scalar *blockAt  = reinterpret_cast<Scalar *>(blockA);
-    Packet conj = pset1<Packet>((Scalar)-1.0);
+    const Index vectorSize = quad_traits<Scalar>::vectorsize;
+    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
+    Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);
+    Index j = 0;
 
-    for(j = 0; j + vectorSize <= rows; j+=vectorSize)
+    for(; j + vectorSize <= rows; j+=vectorSize)
     {
       Index i = 0;
 
-      if(PanelMode) ri += vectorSize*offset;
+      rii = rir + vectorDelta;
 
       for(; i + vectorSize <= depth; i+=vectorSize)
       {
-        PacketBlock<Packet, 4> block;
+        PacketBlock<Packet,4> blockr, blocki;
+        PacketBlock<PacketC,8> cblock;
 
-        PacketBlock<PacketC, 8> cblock;
-        if(StorageOrder == ColMajor)
-        {
-          cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0);
-          cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1);
-          cblock.packet[2] = lhs.template loadPacket<PacketC>(j, i + 2);
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j, i + 3);
-
-          cblock.packet[4] = lhs.template loadPacket<PacketC>(j + 2, i + 0);
-          cblock.packet[5] = lhs.template loadPacket<PacketC>(j + 2, i + 1);
-          cblock.packet[6] = lhs.template loadPacket<PacketC>(j + 2, i + 2);
-          cblock.packet[7] = lhs.template loadPacket<PacketC>(j + 2, i + 3);
+        if (UseLhs) {
+          bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, j, i);
         } else {
-          cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
-          cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
-          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 2, i);
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 3, i);
-
-          cblock.packet[4] = lhs.template loadPacket<PacketC>(j + 0, i + 2);
-          cblock.packet[5] = lhs.template loadPacket<PacketC>(j + 1, i + 2);
-          cblock.packet[6] = lhs.template loadPacket<PacketC>(j + 2, i + 2);
-          cblock.packet[7] = lhs.template loadPacket<PacketC>(j + 3, i + 2);
+          bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, i, j);
         }
 
-        block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
-        block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
-        block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
-        block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
+        blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
+        blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
+        blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
 
-        if(StorageOrder == RowMajor) ptranspose(block);
-
-        pstore<Scalar>(blockAt + ri     , block.packet[0]);
-        pstore<Scalar>(blockAt + ri +  4, block.packet[1]);
-        pstore<Scalar>(blockAt + ri +  8, block.packet[2]);
-        pstore<Scalar>(blockAt + ri + 12, block.packet[3]);
-
-        ri += 4*vectorSize;
-      }
-      for(; i < depth; i++)
-      {
-        blockAt[ri + 0] = lhs(j + 0, i).real();
-        blockAt[ri + 1] = lhs(j + 1, i).real();
-        blockAt[ri + 2] = lhs(j + 2, i).real();
-        blockAt[ri + 3] = lhs(j + 3, i).real();
-
-        ri += vectorSize;
-      }
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
-      
-      i = 0;
-
-      if(PanelMode) ri += vectorSize*offset;
-
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<PacketC, 8> cblock;
-        if(StorageOrder == ColMajor)
-        {
-          cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0);
-          cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1);
-          cblock.packet[2] = lhs.template loadPacket<PacketC>(j, i + 2);
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j, i + 3);
-
-          cblock.packet[4] = lhs.template loadPacket<PacketC>(j + 2, i + 0);
-          cblock.packet[5] = lhs.template loadPacket<PacketC>(j + 2, i + 1);
-          cblock.packet[6] = lhs.template loadPacket<PacketC>(j + 2, i + 2);
-          cblock.packet[7] = lhs.template loadPacket<PacketC>(j + 2, i + 3);
-        } else {
-          cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
-          cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
-          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 2, i);
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 3, i);
-
-          cblock.packet[4] = lhs.template loadPacket<PacketC>(j + 0, i + 2);
-          cblock.packet[5] = lhs.template loadPacket<PacketC>(j + 1, i + 2);
-          cblock.packet[6] = lhs.template loadPacket<PacketC>(j + 2, i + 2);
-          cblock.packet[7] = lhs.template loadPacket<PacketC>(j + 3, i + 2);
-        }
-
-        PacketBlock<Packet, 4> block;
-        block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
-        block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
-        block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
-        block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
+        blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
+        blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
+        blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
 
         if(Conjugate)
         {
-          block.packet[0] *= conj;
-          block.packet[1] *= conj;
-          block.packet[2] *= conj;
-          block.packet[3] *= conj;
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
+          blocki.packet[2] = -blocki.packet[2];
+          blocki.packet[3] = -blocki.packet[3];
         }
 
-        if(StorageOrder == RowMajor) ptranspose(block);
+        if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
+        {
+          ptranspose(blockr);
+          ptranspose(blocki);
+        }
 
-        pstore<Scalar>(blockAt + ri     , block.packet[0]);
-        pstore<Scalar>(blockAt + ri +  4, block.packet[1]);
-        pstore<Scalar>(blockAt + ri +  8, block.packet[2]);
-        pstore<Scalar>(blockAt + ri + 12, block.packet[3]);
+        storeBlock<Scalar, Packet, Index>(blockAt + rir, blockr);
+        storeBlock<Scalar, Packet, Index>(blockAt + rii, blocki);
 
-        ri += 4*vectorSize;
+        rir += 4*vectorSize;
+        rii += 4*vectorSize;
       }
       for(; i < depth; i++)
       {
+        PacketBlock<Packet,1> blockr, blocki;
+        PacketBlock<PacketC,2> cblock;
+
+        if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
+        {
+          if (UseLhs) {
+            cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
+            cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
+          } else {
+            cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
+            cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
+          }
+        } else {
+          const std::complex<Scalar> *lhs0, *lhs1;
+          if (UseLhs) {
+            lhs0 = &lhs(j + 0, i);
+            lhs1 = &lhs(j + 1, i);
+            cblock.packet[0] = pload2(lhs0, lhs1);
+            lhs0 = &lhs(j + 2, i);
+            lhs1 = &lhs(j + 3, i);
+            cblock.packet[1] = pload2(lhs0, lhs1);
+          } else {
+            lhs0 = &lhs(i, j + 0);
+            lhs1 = &lhs(i, j + 1);
+            cblock.packet[0] = pload2(lhs0, lhs1);
+            lhs0 = &lhs(i, j + 2);
+            lhs1 = &lhs(i, j + 3);
+            cblock.packet[1] = pload2(lhs0, lhs1);
+          }
+        }
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
+
         if(Conjugate)
         {
-          blockAt[ri + 0] = -lhs(j + 0, i).imag();
-          blockAt[ri + 1] = -lhs(j + 1, i).imag();
-          blockAt[ri + 2] = -lhs(j + 2, i).imag();
-          blockAt[ri + 3] = -lhs(j + 3, i).imag();
-        } else {
-          blockAt[ri + 0] = lhs(j + 0, i).imag();
-          blockAt[ri + 1] = lhs(j + 1, i).imag();
-          blockAt[ri + 2] = lhs(j + 2, i).imag();
-          blockAt[ri + 3] = lhs(j + 3, i).imag();
+          blocki.packet[0] = -blocki.packet[0];
         }
 
-        ri += vectorSize;
+        pstore<Scalar>(blockAt + rir, blockr.packet[0]);
+        pstore<Scalar>(blockAt + rii, blocki.packet[0]);
+
+        rir += vectorSize;
+        rii += vectorSize;
       }
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+
+      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
     }
 
-    if(PanelMode) ri += offset*(rows - j);
-
-    for(Index i = 0; i < depth; i++)
-    {
-      Index k = j;
-      for(; k < rows; k++)
-      {
-        blockAt[ri] = lhs(k, i).real();
-        ri += 1;
-      }
-    }
-
-    if(PanelMode) ri += (rows - j)*(stride - offset - depth);
-
-    if(PanelMode) ri += offset*(rows - j);
-
-    for(Index i = 0; i < depth; i++)
-    {
-      Index k = j;
-      for(; k < rows; k++)
-      {
-        if(Conjugate)
-          blockAt[ri] = -lhs(k, i).imag();
-        else
-          blockAt[ri] = lhs(k, i).imag();
-        ri += 1;
-      }
-    }
-
-    if(PanelMode) ri += (rows - j)*(stride - offset - depth);
-  }
-};
-
-// General template for lhs packing.
-template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode>
-struct lhs_pack{
-  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-  {
-    const int vectorSize = quad_traits<Scalar>::vectorsize;
-    Index ri = 0, j = 0;
-
-    for(j = 0; j + vectorSize <= rows; j+=vectorSize)
-    {
-      Index i = 0;
-
-      if(PanelMode) ri += vectorSize*offset;
-
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<Packet, 4> block;
-
-        if(StorageOrder == RowMajor)
-        {
-          block.packet[0] = lhs.template loadPacket<Packet>(j + 0, i);
-          block.packet[1] = lhs.template loadPacket<Packet>(j + 1, i);
-          block.packet[2] = lhs.template loadPacket<Packet>(j + 2, i);
-          block.packet[3] = lhs.template loadPacket<Packet>(j + 3, i);
-
-          ptranspose(block);
-        } else {
-          block.packet[0] = lhs.template loadPacket<Packet>(j, i + 0);
-          block.packet[1] = lhs.template loadPacket<Packet>(j, i + 1);
-          block.packet[2] = lhs.template loadPacket<Packet>(j, i + 2);
-          block.packet[3] = lhs.template loadPacket<Packet>(j, i + 3);
-        }
-
-        pstore<Scalar>(blockA + ri     , block.packet[0]);
-        pstore<Scalar>(blockA + ri +  4, block.packet[1]);
-        pstore<Scalar>(blockA + ri +  8, block.packet[2]);
-        pstore<Scalar>(blockA + ri + 12, block.packet[3]);
-
-        ri += 4*vectorSize;
-      }
-      for(; i < depth; i++)
-      {
-        if(StorageOrder == RowMajor)
-        {
-          blockA[ri+0] = lhs(j+0, i);
-          blockA[ri+1] = lhs(j+1, i);
-          blockA[ri+2] = lhs(j+2, i);
-          blockA[ri+3] = lhs(j+3, i);
-        } else {
-          Packet lhsV = lhs.template loadPacket<Packet>(j, i);
-          pstore<Scalar>(blockA + ri, lhsV);
-        }
-
-        ri += vectorSize;
-      }
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
-    }
-
-    if(PanelMode) ri += offset*(rows - j);
-
     if (j < rows)
     {
+      if(PanelMode) rir += (offset*(rows - j - vectorSize));
+      rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
+
       for(Index i = 0; i < depth; i++)
       {
         Index k = j;
         for(; k < rows; k++)
         {
-          blockA[ri] = lhs(k, i);
-          ri += 1;
-        }
-      }
-    }
+          if (UseLhs) {
+            blockAt[rir] = lhs(k, i).real();
 
-    if(PanelMode) ri += (rows - j)*(stride - offset - depth);
-  }
-};
-
-// General template for rhs complex packing.
-template<typename Scalar, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct rhs_cpack
-{
-  EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-  {
-    const int vectorSize = quad_traits<Scalar>::vectorsize;
-    Scalar *blockBt = reinterpret_cast<Scalar *>(blockB);
-    Packet conj = pset1<Packet>((Scalar)-1.0);
-
-    Index ri = 0, j = 0;
-    for(; j + vectorSize <= cols; j+=vectorSize)
-    {
-        Index i = 0;
-
-        if(PanelMode) ri += offset*vectorSize;
-
-        for(; i + vectorSize <= depth; i+=vectorSize)
-        {
-            PacketBlock<PacketC, 8> cblock;
-            if(StorageOrder == ColMajor)
-            {
-              cblock.packet[0] = rhs.template loadPacket<PacketC>(i, j + 0);
-              cblock.packet[1] = rhs.template loadPacket<PacketC>(i, j + 1);
-              cblock.packet[2] = rhs.template loadPacket<PacketC>(i, j + 2);
-              cblock.packet[3] = rhs.template loadPacket<PacketC>(i, j + 3);
-
-              cblock.packet[4] = rhs.template loadPacket<PacketC>(i + 2, j + 0);
-              cblock.packet[5] = rhs.template loadPacket<PacketC>(i + 2, j + 1);
-              cblock.packet[6] = rhs.template loadPacket<PacketC>(i + 2, j + 2);
-              cblock.packet[7] = rhs.template loadPacket<PacketC>(i + 2, j + 3);
-            } else {
-              cblock.packet[0] = rhs.template loadPacket<PacketC>(i + 0, j);
-              cblock.packet[1] = rhs.template loadPacket<PacketC>(i + 1, j);
-              cblock.packet[2] = rhs.template loadPacket<PacketC>(i + 2, j);
-              cblock.packet[3] = rhs.template loadPacket<PacketC>(i + 3, j);
-
-              cblock.packet[4] = rhs.template loadPacket<PacketC>(i + 0, j + 2);
-              cblock.packet[5] = rhs.template loadPacket<PacketC>(i + 1, j + 2);
-              cblock.packet[6] = rhs.template loadPacket<PacketC>(i + 2, j + 2);
-              cblock.packet[7] = rhs.template loadPacket<PacketC>(i + 3, j + 2);
-            }
-
-            PacketBlock<Packet, 4> block;
-            block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
-            block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
-            block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
-            block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
-
-            if(StorageOrder == ColMajor) ptranspose(block);
-
-            pstore<Scalar>(blockBt + ri     , block.packet[0]);
-            pstore<Scalar>(blockBt + ri +  4, block.packet[1]);
-            pstore<Scalar>(blockBt + ri +  8, block.packet[2]);
-            pstore<Scalar>(blockBt + ri + 12, block.packet[3]);
-
-            ri += 4*vectorSize;
-        }
-        for(; i < depth; i++)
-        {
-            blockBt[ri+0] = rhs(i, j+0).real();
-            blockBt[ri+1] = rhs(i, j+1).real();
-            blockBt[ri+2] = rhs(i, j+2).real();
-            blockBt[ri+3] = rhs(i, j+3).real();
-            ri += vectorSize;
-        }
-
-        if(PanelMode) ri += vectorSize*(stride - offset - depth);
-
-        i = 0;
-
-        if(PanelMode) ri += offset*vectorSize;
-
-        for(; i + vectorSize <= depth; i+=vectorSize)
-        {
-          PacketBlock<PacketC, 8> cblock;
-          if(StorageOrder == ColMajor)
-          {
-
-            cblock.packet[0] = rhs.template loadPacket<PacketC>(i, j + 0);
-            cblock.packet[1] = rhs.template loadPacket<PacketC>(i, j + 1);
-            cblock.packet[2] = rhs.template loadPacket<PacketC>(i, j + 2);
-            cblock.packet[3] = rhs.template loadPacket<PacketC>(i, j + 3);
-
-            cblock.packet[4] = rhs.template loadPacket<PacketC>(i + 2, j + 0);
-            cblock.packet[5] = rhs.template loadPacket<PacketC>(i + 2, j + 1);
-            cblock.packet[6] = rhs.template loadPacket<PacketC>(i + 2, j + 2);
-            cblock.packet[7] = rhs.template loadPacket<PacketC>(i + 2, j + 3);
-          } else {
-            cblock.packet[0] = rhs.template loadPacket<PacketC>(i + 0, j);
-            cblock.packet[1] = rhs.template loadPacket<PacketC>(i + 1, j);
-            cblock.packet[2] = rhs.template loadPacket<PacketC>(i + 2, j);
-            cblock.packet[3] = rhs.template loadPacket<PacketC>(i + 3, j);
-
-            cblock.packet[4] = rhs.template loadPacket<PacketC>(i + 0, j + 2);
-            cblock.packet[5] = rhs.template loadPacket<PacketC>(i + 1, j + 2);
-            cblock.packet[6] = rhs.template loadPacket<PacketC>(i + 2, j + 2);
-            cblock.packet[7] = rhs.template loadPacket<PacketC>(i + 3, j + 2);
-          }
-
-          PacketBlock<Packet, 4> block;
-          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
-          block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
-          block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
-          block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
-
-          if(Conjugate)
-          {
-            block.packet[0] *= conj;
-            block.packet[1] *= conj;
-            block.packet[2] *= conj;
-            block.packet[3] *= conj;
-          }
-
-          if(StorageOrder == ColMajor) ptranspose(block);
-
-          pstore<Scalar>(blockBt + ri     , block.packet[0]);
-          pstore<Scalar>(blockBt + ri +  4, block.packet[1]);
-          pstore<Scalar>(blockBt + ri +  8, block.packet[2]);
-          pstore<Scalar>(blockBt + ri + 12, block.packet[3]);
-
-          ri += 4*vectorSize;
-        }
-        for(; i < depth; i++)
-        {
             if(Conjugate)
-            {
-              blockBt[ri+0] = -rhs(i, j+0).imag();
-              blockBt[ri+1] = -rhs(i, j+1).imag();
-              blockBt[ri+2] = -rhs(i, j+2).imag();
-              blockBt[ri+3] = -rhs(i, j+3).imag();
-            } else {
-              blockBt[ri+0] = rhs(i, j+0).imag();
-              blockBt[ri+1] = rhs(i, j+1).imag();
-              blockBt[ri+2] = rhs(i, j+2).imag();
-              blockBt[ri+3] = rhs(i, j+3).imag();
-            }
-            ri += vectorSize;
-        }
-
-        if(PanelMode) ri += vectorSize*(stride - offset - depth);
-    }
-
-    if(PanelMode) ri += offset*(cols - j);
-
-    for(Index i = 0; i < depth; i++)
-    {
-        Index k = j;
-        for(; k < cols; k++)
-        {
-            blockBt[ri] = rhs(i, k).real();
-            ri += 1;
-        }
-    }
-    if(PanelMode) ri += (cols - j)*(stride - offset - depth);
-
-    if(PanelMode) ri += offset*(cols - j);
-
-    for(Index i = 0; i < depth; i++)
-    {
-        Index k = j;
-        for(; k < cols; k++)
-        {
-            if(Conjugate)
-              blockBt[ri] = -rhs(i, k).imag();
+              blockAt[rii] = -lhs(k, i).imag();
             else
-              blockBt[ri] = rhs(i, k).imag();
-            ri += 1;
+              blockAt[rii] =  lhs(k, i).imag();
+          } else {
+            blockAt[rir] = lhs(i, k).real();
+
+            if(Conjugate)
+              blockAt[rii] = -lhs(i, k).imag();
+            else
+              blockAt[rii] =  lhs(i, k).imag();
+          }
+
+          rir += 1;
+          rii += 1;
         }
+      }
     }
-    if(PanelMode) ri += (cols - j)*(stride - offset - depth);
   }
 };
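The unified dhs_cpack keeps each packed panel split in two halves: for every vectorSize-wide panel it writes all real parts first and all imaginary parts vectorDelta elements later, with rir and rii as the two write cursors (in PanelMode the jump of vectorSize*(2*stride - depth) also skips the imaginary half plus padding). A standalone scalar sketch of that split layout, without PanelMode or SIMD and with illustrative names only:

#include <complex>
#include <vector>
#include <cstdio>

int main()
{
  const int rows = 4, depth = 3, vectorSize = 4;
  std::vector<std::complex<float>> lhs(rows * depth);
  for (int i = 0; i < depth; ++i)
    for (int r = 0; r < rows; ++r)
      lhs[i * rows + r] = {float(r), float(10 + r)}; // column-major source

  const int vectorDelta = vectorSize * depth;        // distance from real to imag half
  std::vector<float> blockA(2 * rows * depth);
  int rir = 0, rii = vectorDelta;                    // two write cursors
  for (int i = 0; i < depth; ++i) {
    for (int r = 0; r < vectorSize; ++r) {
      blockA[rir + r] = lhs[i * rows + r].real();    // real parts, first half
      blockA[rii + r] = lhs[i * rows + r].imag();    // imaginary parts, second half
    }
    rir += vectorSize;
    rii += vectorSize;
  }
  std::printf("%g %g\n", blockA[0], blockA[vectorDelta]);  // 0 and 10
}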
 
-// General template for rhs packing.
-template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode>
-struct rhs_pack {
-  EIGEN_STRONG_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+// General template for lhs & rhs packing.
+template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
+struct dhs_pack{
+  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
-    const int vectorSize = quad_traits<Scalar>::vectorsize;
+    const Index vectorSize = quad_traits<Scalar>::vectorsize;
     Index ri = 0, j = 0;
 
-    for(; j + vectorSize <= cols; j+=vectorSize)
-    {
-      Index i = 0;
-
-      if(PanelMode) ri += offset*vectorSize;
-
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<Packet, 4> block;
-        if(StorageOrder == ColMajor)
-        {
-          block.packet[0] = rhs.template loadPacket<Packet>(i, j + 0);
-          block.packet[1] = rhs.template loadPacket<Packet>(i, j + 1);
-          block.packet[2] = rhs.template loadPacket<Packet>(i, j + 2);
-          block.packet[3] = rhs.template loadPacket<Packet>(i, j + 3);
-
-          ptranspose(block);
-        } else {
-          block.packet[0] = rhs.template loadPacket<Packet>(i + 0, j);
-          block.packet[1] = rhs.template loadPacket<Packet>(i + 1, j);
-          block.packet[2] = rhs.template loadPacket<Packet>(i + 2, j);
-          block.packet[3] = rhs.template loadPacket<Packet>(i + 3, j);
-        }
-
-        pstore<Scalar>(blockB + ri     , block.packet[0]);
-        pstore<Scalar>(blockB + ri +  4, block.packet[1]);
-        pstore<Scalar>(blockB + ri +  8, block.packet[2]);
-        pstore<Scalar>(blockB + ri + 12, block.packet[3]);
-
-        ri += 4*vectorSize;
-      }
-      for(; i < depth; i++)
-      {
-        if(StorageOrder == ColMajor)
-        {
-          blockB[ri+0] = rhs(i, j+0);
-          blockB[ri+1] = rhs(i, j+1);
-          blockB[ri+2] = rhs(i, j+2);
-          blockB[ri+3] = rhs(i, j+3);
-        } else {
-          Packet rhsV = rhs.template loadPacket<Packet>(i, j);
-          pstore<Scalar>(blockB + ri, rhsV);
-        }
-        ri += vectorSize;
-      }
-
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
-    }
-
-    if(PanelMode) ri += offset*(cols - j);
-
-    if (j < cols)
-    {
-      for(Index i = 0; i < depth; i++)
-      {
-        Index k = j;
-        for(; k < cols; k++)
-        {
-          blockB[ri] = rhs(i, k);
-          ri += 1;
-        }
-      }
-    }
-    if(PanelMode) ri += (cols - j)*(stride - offset - depth);
-  }
-};
-
-// General template for lhs packing, float64 specialization.
-template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
-struct lhs_pack<double,Index, DataMapper, Packet2d, StorageOrder, PanelMode>
-{
-  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-  {
-    const int vectorSize = quad_traits<double>::vectorsize;
-    Index ri = 0, j = 0;
-
-    for(j = 0; j + vectorSize <= rows; j+=vectorSize)
+    for(; j + vectorSize <= rows; j+=vectorSize)
     {
       Index i = 0;
 
@@ -929,7 +579,92 @@
 
       for(; i + vectorSize <= depth; i+=vectorSize)
       {
-        PacketBlock<Packet2d, 2> block;
+        PacketBlock<Packet,4> block;
+
+        if (UseLhs) {
+          bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, j, i);
+        } else {
+          bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, i, j);
+        }
+        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
+        {
+          ptranspose(block);
+        }
+
+        storeBlock<Scalar, Packet, Index>(blockA + ri, block);
+
+        ri += 4*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
+        {
+          if (UseLhs) {
+            blockA[ri+0] = lhs(j+0, i);
+            blockA[ri+1] = lhs(j+1, i);
+            blockA[ri+2] = lhs(j+2, i);
+            blockA[ri+3] = lhs(j+3, i);
+          } else {
+            blockA[ri+0] = lhs(i, j+0);
+            blockA[ri+1] = lhs(i, j+1);
+            blockA[ri+2] = lhs(i, j+2);
+            blockA[ri+3] = lhs(i, j+3);
+          }
+        } else {
+          Packet lhsV;
+          if (UseLhs) {
+            lhsV = lhs.template loadPacket<Packet>(j, i);
+          } else {
+            lhsV = lhs.template loadPacket<Packet>(i, j);
+          }
+          pstore<Scalar>(blockA + ri, lhsV);
+        }
+
+        ri += vectorSize;
+      }
+
+      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+    }
+
+    if (j < rows)
+    {
+      if(PanelMode) ri += offset*(rows - j);
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          if (UseLhs) {
+            blockA[ri] = lhs(k, i);
+          } else {
+            blockA[ri] = lhs(i, k);
+          }
+          ri += 1;
+        }
+      }
+    }
+  }
+};
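dhs_pack covers both operands with one code path: UseLhs only decides which mapper index is the panel dimension (lhs(j+k, i) versus rhs(i, j+k)) and flips the storage-order test that guards the transpose. A tiny scalar sketch of that index swap, with a hypothetical column-major accessor in place of the DataMapper:

#include <vector>
#include <cstdio>

float at(const std::vector<float>& m, int r, int c, int ld) { return m[c * ld + r]; }

template<bool UseLhs>
void packPanel(std::vector<float>& block, const std::vector<float>& src,
               int ld, int panel, int depth, int j /* panel start */)
{
  for (int i = 0; i < depth; ++i)
    for (int k = 0; k < panel; ++k)
      block.push_back(UseLhs ? at(src, j + k, i, ld)    // lhs(j+k, i)
                             : at(src, i, j + k, ld));  // rhs(i, j+k)
}

int main()
{
  std::vector<float> a = {1, 2, 3, 4, 5, 6};            // 2x3, column-major, ld=2
  std::vector<float> bl, br;
  packPanel<true>(bl, a, 2, 2, 3, 0);                   // packed as lhs: rows x depth
  packPanel<false>(br, a, 2, 3, 2, 0);                  // packed as rhs: depth x cols
  std::printf("%zu %zu\n", bl.size(), br.size());       // both 6
}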
+
+// General template for lhs packing, float64 specialization.
+template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, true>
+{
+  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      if(PanelMode) ri += vectorSize*offset;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet2d,2> block;
         if(StorageOrder == RowMajor)
         {
           block.packet[0] = lhs.template loadPacket<Packet2d>(j + 0, i);
@@ -941,8 +676,7 @@
           block.packet[1] = lhs.template loadPacket<Packet2d>(j, i + 1);
         }
 
-        pstore<double>(blockA + ri    , block.packet[0]);
-        pstore<double>(blockA + ri + 2, block.packet[1]);
+        storeBlock<double, Packet2d, Index>(blockA + ri, block);
 
         ri += 2*vectorSize;
       }
@@ -959,44 +693,48 @@
 
         ri += vectorSize;
       }
+
       if(PanelMode) ri += vectorSize*(stride - offset - depth);
     }
 
-    if(PanelMode) ri += offset*(rows - j);
-
-    for(Index i = 0; i < depth; i++)
+    if (j < rows)
     {
-      Index k = j;
-      for(; k < rows; k++)
+      if(PanelMode) ri += offset*(rows - j);
+
+      for(Index i = 0; i < depth; i++)
       {
-        blockA[ri] = lhs(k, i);
-        ri += 1;
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          blockA[ri] = lhs(k, i);
+          ri += 1;
+        }
       }
     }
-
-    if(PanelMode) ri += (rows - j)*(stride - offset - depth);
   }
 };
 
 // General template for rhs packing, float64 specialization.
 template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
-struct rhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode>
+struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, false>
 {
   EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
   {
-    const int vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorSize = quad_traits<double>::vectorsize;
     Index ri = 0, j = 0;
+
     for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
     {
       Index i = 0;
 
       if(PanelMode) ri += offset*(2*vectorSize);
+
       for(; i + vectorSize <= depth; i+=vectorSize)
       {
-        PacketBlock<Packet2d, 4> block;
+        PacketBlock<Packet2d,4> block;
         if(StorageOrder == ColMajor)
         {
-          PacketBlock<Packet2d, 2> block1, block2;
+          PacketBlock<Packet2d,2> block1, block2;
           block1.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 0);
           block1.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 1);
           block2.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 2);
@@ -1015,10 +753,7 @@
           block.packet[2] = rhs.template loadPacket<Packet2d>(i + 1, j + 0); //[b1 b2]
           block.packet[3] = rhs.template loadPacket<Packet2d>(i + 1, j + 2); //[b3 b4]
 
-          pstore<double>(blockB + ri    , block.packet[0]);
-          pstore<double>(blockB + ri + 2, block.packet[1]);
-          pstore<double>(blockB + ri + 4, block.packet[2]);
-          pstore<double>(blockB + ri + 6, block.packet[3]);
+          storeBlock<double, Packet2d, Index>(blockB + ri, block);
         }
 
         ri += 4*vectorSize;
@@ -1049,43 +784,46 @@
       if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
     }
 
-    if(PanelMode) ri += offset*(cols - j);
-
-    for(Index i = 0; i < depth; i++)
+    if (j < cols)
     {
-      Index k = j;
-      for(; k < cols; k++)
+      if(PanelMode) ri += offset*(cols - j);
+
+      for(Index i = 0; i < depth; i++)
       {
-        blockB[ri] = rhs(i, k);
-        ri += 1;
+        Index k = j;
+        for(; k < cols; k++)
+        {
+          blockB[ri] = rhs(i, k);
+          ri += 1;
+        }
       }
     }
-    if(PanelMode) ri += (cols - j)*(stride - offset - depth);
   }
 };
 
 // General template for lhs complex packing, float64 specialization.
-template<bool IsComplex, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct lhs_cpack<double, IsComplex, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode>
+template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
 {
   EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
-    const int vectorSize = quad_traits<double>::vectorsize;
-    Index ri = 0, j = 0;
-    double *blockAt  = reinterpret_cast<double *>(blockA);
-    Packet conj = pset1<Packet>(-1.0);
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
+    double* blockAt = reinterpret_cast<double *>(blockA);
+    Index j = 0;
 
-    for(j = 0; j + vectorSize <= rows; j+=vectorSize)
+    for(; j + vectorSize <= rows; j+=vectorSize)
     {
       Index i = 0;
 
-      if(PanelMode) ri += vectorSize*offset;
+      rii = rir + vectorDelta;
 
       for(; i + vectorSize <= depth; i+=vectorSize)
       {
-        PacketBlock<Packet, 2> block;
+        PacketBlock<Packet,2> blockr, blocki;
+        PacketBlock<PacketC,4> cblock;
 
-        PacketBlock<PacketC, 4> cblock;
         if(StorageOrder == ColMajor)
         {
           cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0); //[a1 a1i]
@@ -1094,219 +832,157 @@
           cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0); //[a2 a2i]
           cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
 
-          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
-          block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
+          blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+
+          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
+          blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
         } else {
           cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i); //[a1 a1i]
           cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i); //[a2 a2i]
 
           cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1); //[b1 b1i]
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
+          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
 
-          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
-          block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
+          blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+
+          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+          blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
         }
 
-        pstore<double>(blockAt + ri     , block.packet[0]);
-        pstore<double>(blockAt + ri +  2, block.packet[1]);
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
+        }
 
-        ri += 2*vectorSize;
+        storeBlock<double, Packet, Index>(blockAt + rir, blockr);
+        storeBlock<double, Packet, Index>(blockAt + rii, blocki);
+
+        rir += 2*vectorSize;
+        rii += 2*vectorSize;
       }
       for(; i < depth; i++)
       {
-        blockAt[ri + 0] = lhs(j + 0, i).real();
-        blockAt[ri + 1] = lhs(j + 1, i).real();
-        ri += vectorSize;
-      }
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
-      
-      i = 0;
+        PacketBlock<Packet,1> blockr, blocki;
+        PacketBlock<PacketC,2> cblock;
 
-      if(PanelMode) ri += vectorSize*offset;
+        cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
+        cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
 
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<Packet, 2> block;
-
-        PacketBlock<PacketC, 4> cblock;
-        if(StorageOrder == ColMajor)
-        {
-          cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0);
-          cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1);
-
-          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0);
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
-
-          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
-          block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
-        } else {
-          cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
-          cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
-
-          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1);
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
-
-          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
-          block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
-        }
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
 
         if(Conjugate)
         {
-          block.packet[0] *= conj;
-          block.packet[1] *= conj;
+          blocki.packet[0] = -blocki.packet[0];
         }
 
-        pstore<double>(blockAt + ri     , block.packet[0]);
-        pstore<double>(blockAt + ri +  2, block.packet[1]);
+        pstore<double>(blockAt + rir, blockr.packet[0]);
+        pstore<double>(blockAt + rii, blocki.packet[0]);
 
-        ri += 2*vectorSize;
+        rir += vectorSize;
+        rii += vectorSize;
       }
-      for(; i < depth; i++)
-      {
-        if(Conjugate)
-        {
-          blockAt[ri + 0] = -lhs(j + 0, i).imag();
-          blockAt[ri + 1] = -lhs(j + 1, i).imag();
-        } else {
-          blockAt[ri + 0] = lhs(j + 0, i).imag();
-          blockAt[ri + 1] = lhs(j + 1, i).imag();
-        }
 
-        ri += vectorSize;
-      }
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
     }
 
-    if(PanelMode) ri += offset*(rows - j);
-
-    for(Index i = 0; i < depth; i++)
+    if (j < rows)
     {
-      Index k = j;
-      for(; k < rows; k++)
+      if(PanelMode) rir += (offset*(rows - j - vectorSize));
+      rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
+
+      for(Index i = 0; i < depth; i++)
       {
-        blockAt[ri] = lhs(k, i).real();
-        ri += 1;
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          blockAt[rir] = lhs(k, i).real();
+
+          if(Conjugate)
+            blockAt[rii] = -lhs(k, i).imag();
+          else
+            blockAt[rii] =  lhs(k, i).imag();
+
+          rir += 1;
+          rii += 1;
+        }
       }
     }
-
-    if(PanelMode) ri += (rows - j)*(stride - offset - depth);
-
-    if(PanelMode) ri += offset*(rows - j);
-
-    for(Index i = 0; i < depth; i++)
-    {
-      Index k = j;
-      for(; k < rows; k++)
-      {
-        if(Conjugate)
-          blockAt[ri] = -lhs(k, i).imag();
-        else
-          blockAt[ri] = lhs(k, i).imag();
-        ri += 1;
-      }
-    }
-
-    if(PanelMode) ri += (rows - j)*(stride - offset - depth);
   }
 };
 
 // General template for rhs complex packing, float64 specialization.
 template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct rhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode>
+struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
 {
   EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
   {
-    const int vectorSize = quad_traits<double>::vectorsize;
-    double *blockBt = reinterpret_cast<double *>(blockB);
-    Packet conj = pset1<Packet>(-1.0);
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
+    double* blockBt = reinterpret_cast<double *>(blockB);
+    Index j = 0;
 
-    Index ri = 0, j = 0;
     for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
     {
       Index i = 0;
 
-      if(PanelMode) ri += offset*(2*vectorSize);
+      rii = rir + vectorDelta;
 
       for(; i < depth; i++)
       {
-        PacketBlock<PacketC, 4> cblock;
-        PacketBlock<Packet, 2> block;
+        PacketBlock<PacketC,4> cblock;
+        PacketBlock<Packet,2> blockr, blocki;
 
-        cblock.packet[0] = rhs.template loadPacket<PacketC>(i, j + 0);
-        cblock.packet[1] = rhs.template loadPacket<PacketC>(i, j + 1);
-        cblock.packet[2] = rhs.template loadPacket<PacketC>(i, j + 2);
-        cblock.packet[3] = rhs.template loadPacket<PacketC>(i, j + 3);
+        bload<DataMapper, PacketC, Index, 2, 0, ColMajor>(cblock, rhs, i, j);
 
-        block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
-        block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
+        blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
 
-        pstore<double>(blockBt + ri    , block.packet[0]);
-        pstore<double>(blockBt + ri + 2, block.packet[1]);
-
-        ri += 2*vectorSize;
-      }
-
-      if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
-
-      i = 0;
-
-      if(PanelMode) ri += offset*(2*vectorSize);
-
-      for(; i < depth; i++)
-      {
-        PacketBlock<PacketC, 4> cblock;
-        PacketBlock<Packet, 2> block;
-
-        cblock.packet[0] = rhs.template loadPacket<PacketC>(i, j + 0); //[a1 a1i]
-        cblock.packet[1] = rhs.template loadPacket<PacketC>(i, j + 1); //[b1 b1i]
-        cblock.packet[2] = rhs.template loadPacket<PacketC>(i, j + 2); //[c1 c1i]
-        cblock.packet[3] = rhs.template loadPacket<PacketC>(i, j + 3); //[d1 d1i]
-
-        block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
-        block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+        blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
 
         if(Conjugate)
         {
-          block.packet[0] *= conj;
-          block.packet[1] *= conj;
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
         }
 
-        pstore<double>(blockBt + ri     , block.packet[0]);
-        pstore<double>(blockBt + ri +  2, block.packet[1]);
+        storeBlock<double, Packet, Index>(blockBt + rir, blockr);
+        storeBlock<double, Packet, Index>(blockBt + rii, blocki);
 
-        ri += 2*vectorSize;
+        rir += 2*vectorSize;
+        rii += 2*vectorSize;
       }
-      if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
+
+      rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
     }
 
-    if(PanelMode) ri += offset*(cols - j);
-
-    for(Index i = 0; i < depth; i++)
+    if (j < cols)
     {
-      Index k = j;
-      for(; k < cols; k++)
+      if(PanelMode) rir += (offset*(cols - j - 2*vectorSize));
+      rii = rir + (((PanelMode) ? stride : depth) * (cols - j));
+
+      for(Index i = 0; i < depth; i++)
       {
-        blockBt[ri] = rhs(i, k).real();
-        ri += 1;
+        Index k = j;
+        for(; k < cols; k++)
+        {
+          blockBt[rir] = rhs(i, k).real();
+
+          if(Conjugate)
+            blockBt[rii] = -rhs(i, k).imag();
+          else
+            blockBt[rii] =  rhs(i, k).imag();
+
+          rir += 1;
+          rii += 1;
+        }
       }
     }
-    if(PanelMode) ri += (cols - j)*(stride - offset - depth);
-
-    if(PanelMode) ri += offset*(cols - j);
-
-    for(Index i = 0; i < depth; i++)
-    {
-      Index k = j;
-      for(; k < cols; k++)
-      {
-        if(Conjugate)
-          blockBt[ri] = -rhs(i, k).imag();
-        else
-          blockBt[ri] = rhs(i, k).imag();
-        ri += 1;
-      }
-    }
-    if(PanelMode) ri += (cols - j)*(stride - offset - depth);
   }
 };
 
@@ -1315,53 +991,9 @@
  **************/
 
 // 512-bit rank-1 update of acc. It can accumulate either positively or negatively (useful for complex gemm).
-template<typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,4>* acc, const Scalar* lhs, const Packet* rhsV)
+template<typename Packet, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& lhsV, const Packet* rhsV)
 {
-  asm("#pger begin");
-  Packet lhsV = pload<Packet>(lhs);
-
-  if(NegativeAccumulate)
-  {
-    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
-    acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
-    acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
-    acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
-  } else {
-    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
-    acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
-    acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
-    acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
-  }
-  asm("#pger end");
-}
-
-template<typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,1>* acc, const Scalar* lhs, const Packet* rhsV)
-{
-  Packet lhsV = pload<Packet>(lhs);
-
-  if(NegativeAccumulate)
-  {
-    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
-  } else {
-    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
-  }
-}
-
-template<typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,4>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
-{
-#ifdef _ARCH_PWR9
-  Packet lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
-#else
-  Packet lhsV;
-  Index i = 0;
-  do {
-    lhsV[i] = lhs[i];
-  } while (++i < remaining_rows);
-#endif
-
   if(NegativeAccumulate)
   {
     acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
@@ -1376,19 +1008,9 @@
   }
 }
 
-template<typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,1>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+template<typename Packet, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& lhsV, const Packet* rhsV)
 {
-#ifdef _ARCH_PWR9
-  Packet lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
-#else
-  Packet lhsV;
-  Index i = 0;
-  do {
-    lhsV[i] = lhs[i];
-  } while (++i < remaining_rows);
-#endif
-
   if(NegativeAccumulate)
   {
     acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
@@ -1397,84 +1019,97 @@
   }
 }
 
+template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
+{
+  Packet lhsV = pload<Packet>(lhs);
+
+  pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
+}
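pger_common performs one rank-1 update of the accumulator tile: every accumulator column receives lhsV scaled by one broadcast rhs value, added via vec_madd or subtracted via vec_nmsub when NegativeAccumulate is set. A plain scalar sketch of the same update on a 4x4 tile (no intrinsics, illustrative names):

#include <cstdio>

// acc[c][r] +=/-= lhs[r] * rhs[c]: one rank-1 update of a 4x4 accumulator tile.
template<bool NegativeAccumulate>
void pgerSketch(float acc[4][4], const float lhs[4], const float rhs[4])
{
  for (int c = 0; c < 4; ++c)
    for (int r = 0; r < 4; ++r)
      acc[c][r] += (NegativeAccumulate ? -1.0f : 1.0f) * lhs[r] * rhs[c];
}

int main()
{
  float acc[4][4] = {};
  const float lhs[4] = {1, 2, 3, 4};
  const float rhs[4] = {10, 20, 30, 40};
  pgerSketch<false>(acc, lhs, rhs);
  std::printf("%g %g\n", acc[0][0], acc[3][3]);  // 10 and 160
}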
+
+template<typename Scalar, typename Packet, typename Index>
+EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
+{
+#ifdef _ARCH_PWR9
+  lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
+#else
+  Index i = 0;
+  do {
+    lhsV[i] = lhs[i];
+  } while (++i < remaining_rows);
+#endif
+}
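loadPacketRemaining fills only the first remaining_rows lanes of the packet, either with the POWER9 length-limited load or with the scalar fallback loop; the upper lanes are left as-is and are masked or ignored later. A scalar sketch with a fixed 4-lane packet modelled as a plain array (assumes remaining >= 1, like the do/while above):

#include <cstdio>

// Copy only the first `remaining` lanes; upper lanes keep their previous value.
void loadRemainingSketch(float lanes[4], const float* src, int remaining)
{
  int i = 0;
  do {
    lanes[i] = src[i];
  } while (++i < remaining);
}

int main()
{
  float lanes[4] = {-1, -1, -1, -1};
  const float src[3] = {7, 8, 9};
  loadRemainingSketch(lanes, src, 3);
  std::printf("%g %g %g %g\n", lanes[0], lanes[1], lanes[2], lanes[3]);  // 7 8 9 -1
}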
+
+template<int N, typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+{
+  Packet lhsV;
+  loadPacketRemaining<Scalar, Packet, Index>(lhs, lhsV, remaining_rows);
+
+  pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
+}
+
 // 512-bit rank-1 update of complex acc. It takes decoupled accumulators as entries. It also takes care of mixed types real * complex and complex * real.
-template<typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet, 4>& accReal, PacketBlock<Packet,4>& accImag, const Scalar *rhs_ptr, const Scalar *rhs_ptr_imag, const Scalar *lhs_ptr, const Scalar* lhs_ptr_imag, Packet& conj)
+template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
 {
-  Packet lhsV  = *((Packet *) lhs_ptr);
-  Packet rhsV1 = pset1<Packet>(rhs_ptr[0]);
-  Packet rhsV2 = pset1<Packet>(rhs_ptr[1]);
-  Packet rhsV3 = pset1<Packet>(rhs_ptr[2]);
-  Packet rhsV4 = pset1<Packet>(rhs_ptr[3]);
-
-  Packet lhsVi;
-  if(!LhsIsReal) lhsVi = *((Packet *) lhs_ptr_imag);
-  Packet rhsV1i, rhsV2i, rhsV3i, rhsV4i;
-  if(!RhsIsReal)
-  {
-    rhsV1i = pset1<Packet>(rhs_ptr_imag[0]);
-    rhsV2i = pset1<Packet>(rhs_ptr_imag[1]);
-    rhsV3i = pset1<Packet>(rhs_ptr_imag[2]);
-    rhsV4i = pset1<Packet>(rhs_ptr_imag[3]);
-  }
-
-  if(ConjugateLhs && !LhsIsReal) lhsVi = pmul<Packet>(lhsVi,conj);
-  if(ConjugateRhs && !RhsIsReal)
-  {
-    rhsV1i = pmul<Packet>(rhsV1i,conj);
-    rhsV2i = pmul<Packet>(rhsV2i,conj);
-    rhsV3i = pmul<Packet>(rhsV3i,conj);
-    rhsV4i = pmul<Packet>(rhsV4i,conj);
-  }
-
+  pger_common<Packet, false>(accReal, lhsV, rhsV);
   if(LhsIsReal)
   {
-    accReal.packet[0] = pmadd<Packet>(rhsV1, lhsV, accReal.packet[0]);
-    accReal.packet[1] = pmadd<Packet>(rhsV2, lhsV, accReal.packet[1]);
-    accReal.packet[2] = pmadd<Packet>(rhsV3, lhsV, accReal.packet[2]);
-    accReal.packet[3] = pmadd<Packet>(rhsV4, lhsV, accReal.packet[3]);
-
-    accImag.packet[0] = pmadd<Packet>(rhsV1i, lhsV, accImag.packet[0]);
-    accImag.packet[1] = pmadd<Packet>(rhsV2i, lhsV, accImag.packet[1]);
-    accImag.packet[2] = pmadd<Packet>(rhsV3i, lhsV, accImag.packet[2]);
-    accImag.packet[3] = pmadd<Packet>(rhsV4i, lhsV, accImag.packet[3]);
-  } else if(RhsIsReal) {
-    accReal.packet[0] = pmadd<Packet>(rhsV1, lhsV, accReal.packet[0]);
-    accReal.packet[1] = pmadd<Packet>(rhsV2, lhsV, accReal.packet[1]);
-    accReal.packet[2] = pmadd<Packet>(rhsV3, lhsV, accReal.packet[2]);
-    accReal.packet[3] = pmadd<Packet>(rhsV4, lhsV, accReal.packet[3]);
-
-    accImag.packet[0] = pmadd<Packet>(rhsV1, lhsVi, accImag.packet[0]);
-    accImag.packet[1] = pmadd<Packet>(rhsV2, lhsVi, accImag.packet[1]);
-    accImag.packet[2] = pmadd<Packet>(rhsV3, lhsVi, accImag.packet[2]);
-    accImag.packet[3] = pmadd<Packet>(rhsV4, lhsVi, accImag.packet[3]);
+    pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
+    EIGEN_UNUSED_VARIABLE(lhsVi);
   } else {
-    accReal.packet[0] = pmadd<Packet>(rhsV1, lhsV, accReal.packet[0]);
-    accReal.packet[1] = pmadd<Packet>(rhsV2, lhsV, accReal.packet[1]);
-    accReal.packet[2] = pmadd<Packet>(rhsV3, lhsV, accReal.packet[2]);
-    accReal.packet[3] = pmadd<Packet>(rhsV4, lhsV, accReal.packet[3]);
-
-    accImag.packet[0] = pmadd<Packet>(rhsV1i, lhsV, accImag.packet[0]);
-    accImag.packet[1] = pmadd<Packet>(rhsV2i, lhsV, accImag.packet[1]);
-    accImag.packet[2] = pmadd<Packet>(rhsV3i, lhsV, accImag.packet[2]);
-    accImag.packet[3] = pmadd<Packet>(rhsV4i, lhsV, accImag.packet[3]);
-
-    accReal.packet[0] = psub<Packet>(accReal.packet[0], pmul<Packet>(rhsV1i, lhsVi));
-    accReal.packet[1] = psub<Packet>(accReal.packet[1], pmul<Packet>(rhsV2i, lhsVi));
-    accReal.packet[2] = psub<Packet>(accReal.packet[2], pmul<Packet>(rhsV3i, lhsVi));
-    accReal.packet[3] = psub<Packet>(accReal.packet[3], pmul<Packet>(rhsV4i, lhsVi));
-
-    accImag.packet[0] = pmadd<Packet>(rhsV1, lhsVi, accImag.packet[0]);
-    accImag.packet[1] = pmadd<Packet>(rhsV2, lhsVi, accImag.packet[1]);
-    accImag.packet[2] = pmadd<Packet>(rhsV3, lhsVi, accImag.packet[2]);
-    accImag.packet[3] = pmadd<Packet>(rhsV4, lhsVi, accImag.packet[3]);
+    if (!RhsIsReal) {
+      pger_common<Packet, ConjugateLhs == ConjugateRhs>(accReal, lhsVi, rhsVi);
+      pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
+    } else {
+      EIGEN_UNUSED_VARIABLE(rhsVi);
+    }
+    pger_common<Packet, ConjugateLhs>(accImag, lhsVi, rhsV);
   }
 }
 
-template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar *lhs)
+template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
 {
-  return *((Packet *)lhs);
+  Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
+  Packet lhsVi;
+  if(!LhsIsReal) lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+
+  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
+}
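pgerc keeps the accumulator decoupled into real and imaginary tiles and expresses the complex product through real rank-1 updates: accReal accumulates lr*rr - li*ri and accImag accumulates lr*ri + li*rr, with the conjugation flags flipping the signs of the imaginary contributions. A scalar sketch of the non-conjugated case for a single accumulator entry:

#include <complex>
#include <cstdio>

// Split complex multiply-accumulate: the four real products the decoupled
// accumulators are built from (no conjugation handled here).
void pgercSketch(float& accReal, float& accImag, std::complex<float> l, std::complex<float> r)
{
  accReal += l.real() * r.real() - l.imag() * r.imag();
  accImag += l.real() * r.imag() + l.imag() * r.real();
}

int main()
{
  float re = 0, im = 0;
  pgercSketch(re, im, {1, 2}, {3, 4});
  std::printf("%g %g\n", re, im);  // -5 and 10, i.e. (1+2i)*(3+4i)
}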
+
+template<typename Scalar, typename Packet, typename Index, bool LhsIsReal>
+EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows)
+{
+#ifdef _ARCH_PWR9
+  lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar));
+  if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows * sizeof(Scalar));
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+#else
+  Index i = 0;
+  do {
+    lhsV[i] = lhs_ptr[i];
+    if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i];
+  } while (++i < remaining_rows);
+  if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+#endif
+}
+
+template<int N, typename Scalar, typename Packet, typename Index, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows)
+{
+  Packet lhsV, lhsVi;
+  loadPacketRemaining<Scalar, Packet, Index, LhsIsReal>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows);
+
+  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
+}
+
+template<typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs)
+{
+  return *reinterpret_cast<Packet *>(const_cast<Scalar *>(lhs));
 }
 
 // Zero the accumulator on PacketBlock.
@@ -1509,53 +1144,99 @@
   acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
 }
 
-// Complex version of PacketBlock scaling.
 template<typename Packet>
-EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag)
+EIGEN_STRONG_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
 {
-  cReal.packet[0] = pmul<Packet>(aReal.packet[0], bReal);
-  cReal.packet[1] = pmul<Packet>(aReal.packet[1], bReal);
-  cReal.packet[2] = pmul<Packet>(aReal.packet[2], bReal);
-  cReal.packet[3] = pmul<Packet>(aReal.packet[3], bReal);
+  acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
+  acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
+  acc.packet[2] = pmul<Packet>(accZ.packet[2], pAlpha);
+  acc.packet[3] = pmul<Packet>(accZ.packet[3], pAlpha);
+}
 
-  cImag.packet[0] = pmul<Packet>(aImag.packet[0], bReal);
-  cImag.packet[1] = pmul<Packet>(aImag.packet[1], bReal);
-  cImag.packet[2] = pmul<Packet>(aImag.packet[2], bReal);
-  cImag.packet[3] = pmul<Packet>(aImag.packet[3], bReal);
+template<typename Packet>
+EIGEN_STRONG_INLINE void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
+{
+  acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
+}
 
-  cReal.packet[0] = psub<Packet>(cReal.packet[0], pmul<Packet>(aImag.packet[0], bImag));
-  cReal.packet[1] = psub<Packet>(cReal.packet[1], pmul<Packet>(aImag.packet[1], bImag));
-  cReal.packet[2] = psub<Packet>(cReal.packet[2], pmul<Packet>(aImag.packet[2], bImag));
-  cReal.packet[3] = psub<Packet>(cReal.packet[3], pmul<Packet>(aImag.packet[3], bImag));
+// Complex version of PacketBlock scaling.
+template<typename Packet, int N>
+EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
+{
+  bscalec_common<Packet>(cReal, aReal, bReal);
 
-  cImag.packet[0] = pmadd<Packet>(aReal.packet[0], bImag, cImag.packet[0]);
-  cImag.packet[1] = pmadd<Packet>(aReal.packet[1], bImag, cImag.packet[1]);
-  cImag.packet[2] = pmadd<Packet>(aReal.packet[2], bImag, cImag.packet[2]);
-  cImag.packet[3] = pmadd<Packet>(aReal.packet[3], bImag, cImag.packet[3]);
+  bscalec_common<Packet>(cImag, aImag, bReal);
+
+  pger_common<Packet, true>(&cReal, bImag, aImag.packet);
+
+  pger_common<Packet, false>(&cImag, bImag, aReal.packet);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
+{
+  acc.packet[0] = pand(acc.packet[0], pMask);
+  acc.packet[1] = pand(acc.packet[1], pMask);
+  acc.packet[2] = pand(acc.packet[2], pMask);
+  acc.packet[3] = pand(acc.packet[3], pMask);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag, const Packet& pMask)
+{
+  band<Packet>(aReal, pMask);
+  band<Packet>(aImag, pMask);
+
+  bscalec<Packet,4>(aReal, aImag, bReal, bImag, cReal, cImag);
 }
 
 // Load a PacketBlock, the N parameters make tuning gemm easier so we can add more accumulators as needed.
-template<typename DataMapper, typename Packet, typename Index, int N>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col, Index accCols)
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col)
 {
-  acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
-  acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
-  acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
-  acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
+  if (StorageOrder == RowMajor) {
+    acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
+    acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
+    acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
+    acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
+  } else {
+    acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
+    acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
+    acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
+    acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
+  }
 }
 
 // An overload of bload when you have a PacketBlock with 8 vectors.
-template<typename DataMapper, typename Packet, typename Index, int N>
-EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col, Index accCols)
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col)
+{
+  if (StorageOrder == RowMajor) {
+    acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
+    acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
+    acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
+    acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
+    acc.packet[4] = res.template loadPacket<Packet>(row + 0, col + (N+1)*accCols);
+    acc.packet[5] = res.template loadPacket<Packet>(row + 1, col + (N+1)*accCols);
+    acc.packet[6] = res.template loadPacket<Packet>(row + 2, col + (N+1)*accCols);
+    acc.packet[7] = res.template loadPacket<Packet>(row + 3, col + (N+1)*accCols);
+  } else {
+    acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
+    acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
+    acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
+    acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
+    acc.packet[4] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
+    acc.packet[5] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 1);
+    acc.packet[6] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 2);
+    acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
+  }
+}
+
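+// bload overload for two packets: loads one column from two consecutive accCols row blocks.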
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,2>& acc, const DataMapper& res, Index row, Index col)
 {
   acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
-  acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
-  acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
-  acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
-  acc.packet[4] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
-  acc.packet[5] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 1);
-  acc.packet[6] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 2);
-  acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
+  acc.packet[1] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
 }
 
 const static Packet4i mask41 = { -1,  0,  0,  0 };
@@ -1568,7 +1249,7 @@
 EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows)
 {
   if (remaining_rows == 0) {
-    return pset1<Packet>(float(0.0));
+    return pset1<Packet>(float(0.0));  // Not used
   } else {
     switch (remaining_rows) {
       case 1:  return Packet(mask41);
@@ -1582,7 +1263,7 @@
 EIGEN_STRONG_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
 {
   if (remaining_rows == 0) {
-    return pset1<Packet2d>(double(0.0));
+    return pset1<Packet2d>(double(0.0));  // Not used
   } else {
     return Packet2d(mask21);
   }
@@ -1591,14 +1272,30 @@
 template<typename Packet>
 EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha, const Packet& pMask)
 {
-  acc.packet[0] = pmadd(pAlpha, pand(accZ.packet[0], pMask), acc.packet[0]);
-  acc.packet[1] = pmadd(pAlpha, pand(accZ.packet[1], pMask), acc.packet[1]);
-  acc.packet[2] = pmadd(pAlpha, pand(accZ.packet[2], pMask), acc.packet[2]);
-  acc.packet[3] = pmadd(pAlpha, pand(accZ.packet[3], pMask), acc.packet[3]);
+  band<Packet>(accZ, pMask);
+
+  bscale<Packet>(acc, accZ, pAlpha);
+}
+
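+// pbroadcast4_old broadcasts a[0..3] into a0..a3; the Packet2d specialization does it with
+// two vector loads followed by splats.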
+template<typename Packet>
+EIGEN_STRONG_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  pbroadcast4<Packet>(a, a0, a1, a2, a3);
+}
+
+template<>
+EIGEN_STRONG_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a3 = pload<Packet2d>(a + 2);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
 }
 
 // PEEL loop factor.
-#define PEEL 10
+#define PEEL 7
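+// (Number of depth (k) iterations handled per pass of the peeled loop.)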
 
 template<typename Scalar, typename Packet, typename Index>
 EIGEN_STRONG_INLINE void MICRO_EXTRA_COL(
@@ -1610,7 +1307,7 @@
 {
   Packet rhsV[1];
   rhsV[0] = pset1<Packet>(rhs_ptr[0]);
-  pger<Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
+  pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
   lhs_ptr += remaining_rows;
   rhs_ptr += remaining_cols;
 }
@@ -1618,8 +1315,8 @@
 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
 EIGEN_STRONG_INLINE void gemm_extra_col(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
@@ -1629,9 +1326,9 @@
   Index remaining_cols,
   const Packet& pAlpha)
 {
-  const Scalar *rhs_ptr = rhs_base;
-  const Scalar *lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
-  PacketBlock<Packet,1> accZero, acc;
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
+  PacketBlock<Packet,1> accZero;
 
   bsetzero<Scalar, Packet>(accZero);
 
@@ -1639,8 +1336,8 @@
   Index k = 0;
   for(; k + PEEL <= remaining_depth; k+= PEEL)
   {
-    prefetch(rhs_ptr);
-    prefetch(lhs_ptr);
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    EIGEN_POWER_PREFETCH(lhs_ptr);
     for (int l = 0; l < PEEL; l++) {
       MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
     }
@@ -1653,14 +1350,14 @@
   {
     Packet rhsV[1];
     rhsV[0] = pset1<Packet>(rhs_ptr[0]);
-    pger<Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
+    pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
     lhs_ptr += remaining_rows;
     rhs_ptr += remaining_cols;
   }
 
-  acc.packet[0] = vec_mul(pAlpha, accZero.packet[0]);
-  for(Index i = 0; i < remaining_rows; i++){
-    res(row + i, col) += acc.packet[0][i];
+  accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]);
+  for(Index i = 0; i < remaining_rows; i++) {
+    res(row + i, col) += accZero.packet[0][i];
   }
 }
 
@@ -1673,28 +1370,29 @@
 {
   Packet rhsV[4];
   pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
-  pger<Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
+  pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
   lhs_ptr += remaining_rows;
   rhs_ptr += accRows;
 }
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
 EIGEN_STRONG_INLINE void gemm_extra_row(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
   Index row,
   Index col,
+  Index rows,
   Index cols,
   Index remaining_rows,
   const Packet& pAlpha,
   const Packet& pMask)
 {
-  const Scalar *rhs_ptr = rhs_base;
-  const Scalar *lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
   PacketBlock<Packet,4> accZero, acc;
 
   bsetzero<Scalar, Packet>(accZero);
@@ -1703,8 +1401,8 @@
   Index k = 0;
   for(; k + PEEL <= remaining_depth; k+= PEEL)
   {
-    prefetch(rhs_ptr);
-    prefetch(lhs_ptr);
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    EIGEN_POWER_PREFETCH(lhs_ptr);
     for (int l = 0; l < PEEL; l++) {
       MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
     }
@@ -1714,78 +1412,103 @@
     MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
   }
 
-  if (remaining_depth == depth)
+  if ((remaining_depth == depth) && (rows >= accCols))
   {
-    for(Index j = 0; j < 4; j++){
+    for(Index j = 0; j < 4; j++) {
       acc.packet[j] = res.template loadPacket<Packet>(row, col + j);
     }
-    bscale(acc, accZero, pAlpha, pMask);
-    res.template storePacketBlock<Packet, 4>(row, col, acc);
+    bscale<Packet>(acc, accZero, pAlpha, pMask);
+    res.template storePacketBlock<Packet,4>(row, col, acc);
   } else {
     for(; k < depth; k++)
     {
       Packet rhsV[4];
       pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
-      pger<Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
+      pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
       lhs_ptr += remaining_rows;
       rhs_ptr += accRows;
     }
 
-    for(Index j = 0; j < 4; j++){
-      acc.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
+    for(Index j = 0; j < 4; j++) {
+      accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
     }
-    for(Index j = 0; j < 4; j++){
-      for(Index i = 0; i < remaining_rows; i++){
-        res(row + i, col + j) += acc.packet[j][i];
+    for(Index j = 0; j < 4; j++) {
+      for(Index i = 0; i < remaining_rows; i++) {
+        res(row + i, col + j) += accZero.packet[j][i];
       }
     }
   }
 }
 
-#define MICRO_DST \
-  PacketBlock<Packet,4> *accZero0, PacketBlock<Packet,4> *accZero1, PacketBlock<Packet,4> *accZero2, \
-  PacketBlock<Packet,4> *accZero3, PacketBlock<Packet,4> *accZero4, PacketBlock<Packet,4> *accZero5, \
-  PacketBlock<Packet,4> *accZero6, PacketBlock<Packet,4> *accZero7
-
-#define MICRO_COL_DST \
-  PacketBlock<Packet,1> *accZero0, PacketBlock<Packet,1> *accZero1, PacketBlock<Packet,1> *accZero2, \
-  PacketBlock<Packet,1> *accZero3, PacketBlock<Packet,1> *accZero4, PacketBlock<Packet,1> *accZero5, \
-  PacketBlock<Packet,1> *accZero6, PacketBlock<Packet,1> *accZero7
-
-#define MICRO_SRC \
-  const Scalar **lhs_ptr0, const Scalar **lhs_ptr1, const Scalar **lhs_ptr2, \
-  const Scalar **lhs_ptr3, const Scalar **lhs_ptr4, const Scalar **lhs_ptr5, \
-  const Scalar **lhs_ptr6, const Scalar **lhs_ptr7
-
-#define MICRO_ONE \
-  MICRO<unroll_factor, Scalar, Packet, accRows, accCols>(\
-    &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \
-    rhs_ptr, \
-    &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7);
-
-#define MICRO_COL_ONE \
-  MICRO_COL<unroll_factor, Scalar, Packet, Index, accCols>(\
-    &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \
-    rhs_ptr, \
-    &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7, \
-    remaining_cols);
-
-#define MICRO_WORK_ONE(iter) \
-  if (N > iter) { \
-    pger<Scalar, Packet, false>(accZero##iter, *lhs_ptr##iter, rhsV); \
-    *lhs_ptr##iter += accCols; \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(accZero##iter); \
-    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
-  }
-
 #define MICRO_UNROLL(func) \
   func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
 
-#define MICRO_WORK MICRO_UNROLL(MICRO_WORK_ONE)
+#define MICRO_UNROLL_WORK(func, func2, peel) \
+    MICRO_UNROLL(func2); \
+    func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
+    func(4,peel) func(5,peel) func(6,peel) func(7,peel)
+
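+// The MICRO_* helpers below unroll over up to 8 LHS accumulators (the "iter" index) and
+// up to PEEL broadcast RHS values (the "peel" index); iterations beyond unroll_factor or
+// PEEL are guarded off and their variables marked unused.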
+#define MICRO_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
+    lhs_ptr##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+  }
+
+#define MICRO_WORK_ONE(iter, peel) \
+  if (unroll_factor > iter) { \
+    pger_common<Packet, false>(&accZero##iter, lhsV##iter, rhsV##peel); \
+  }
+
+#define MICRO_TYPE_PEEL4(func, func2, peel) \
+  if (PEEL > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    pbroadcast4<Packet>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+    MICRO_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_TYPE_PEEL1(func, func2, peel) \
+  if (PEEL > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    rhsV##peel[0] = pset1<Packet>(rhs_ptr[remaining_cols * peel]); \
+    MICRO_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
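+// Ten rhsV slots are declared so any PEEL up to 10 works; slots past PEEL are flagged
+// unused by the guards in MICRO_TYPE_PEEL4/MICRO_TYPE_PEEL1.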
+#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
+  func(func1,func2,0); func(func1,func2,1); \
+  func(func1,func2,2); func(func1,func2,3); \
+  func(func1,func2,4); func(func1,func2,5); \
+  func(func1,func2,6); func(func1,func2,7); \
+  func(func1,func2,8); func(func1,func2,9);
+
+#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
+  Packet rhsV0[M]; \
+  func(func1,func2,0);
+
+#define MICRO_ONE_PEEL4 \
+  MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += (accRows * PEEL);
+
+#define MICRO_ONE4 \
+  MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += accRows;
+
+#define MICRO_ONE_PEEL1 \
+  MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += (remaining_cols * PEEL);
+
+#define MICRO_ONE1 \
+  MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += remaining_cols;
 
 #define MICRO_DST_PTR_ONE(iter) \
-  if (unroll_factor > iter){ \
+  if (unroll_factor > iter) { \
     bsetzero<Scalar, Packet>(accZero##iter); \
   } else { \
     EIGEN_UNUSED_VARIABLE(accZero##iter); \
@@ -1803,52 +1526,38 @@
 #define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)
 
 #define MICRO_PREFETCH_ONE(iter) \
-  if (unroll_factor > iter){ \
-    prefetch(lhs_ptr##iter); \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
   }
 
 #define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
 
 #define MICRO_STORE_ONE(iter) \
-  if (unroll_factor > iter){ \
+  if (unroll_factor > iter) { \
     acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
     acc.packet[1] = res.template loadPacket<Packet>(row + iter*accCols, col + 1); \
     acc.packet[2] = res.template loadPacket<Packet>(row + iter*accCols, col + 2); \
     acc.packet[3] = res.template loadPacket<Packet>(row + iter*accCols, col + 3); \
-    bscale(acc, accZero##iter, pAlpha); \
-    res.template storePacketBlock<Packet, 4>(row + iter*accCols, col, acc); \
+    bscale<Packet>(acc, accZero##iter, pAlpha); \
+    res.template storePacketBlock<Packet,4>(row + iter*accCols, col, acc); \
   }
 
 #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
 
 #define MICRO_COL_STORE_ONE(iter) \
-  if (unroll_factor > iter){ \
+  if (unroll_factor > iter) { \
     acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
-    bscale(acc, accZero##iter, pAlpha); \
-    res.template storePacketBlock<Packet, 1>(row + iter*accCols, col, acc); \
+    bscale<Packet>(acc, accZero##iter, pAlpha); \
+    res.template storePacketBlock<Packet,1>(row + iter*accCols, col, acc); \
   }
 
 #define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE)
 
-template<int N, typename Scalar, typename Packet, const Index accRows, const Index accCols>
-EIGEN_STRONG_INLINE void MICRO(
-  MICRO_SRC,
-  const Scalar* &rhs_ptr,
-  MICRO_DST)
-  {
-    Packet rhsV[4];
-    pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
-    asm("#unrolled pger? begin");
-    MICRO_WORK
-    asm("#unrolled pger? end");
-    rhs_ptr += accRows;
-  }
-
 template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
 EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
@@ -1856,56 +1565,37 @@
   Index col,
   const Packet& pAlpha)
 {
-  const Scalar *rhs_ptr = rhs_base;
-  const Scalar *lhs_ptr0, *lhs_ptr1, *lhs_ptr2, *lhs_ptr3, *lhs_ptr4, *lhs_ptr5, *lhs_ptr6, *lhs_ptr7;
+asm("#gemm begin");
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7;
   PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
   PacketBlock<Packet,4> acc;
 
-  asm("#unrolled start");
   MICRO_SRC_PTR
-  asm("#unrolled zero?");
   MICRO_DST_PTR
 
   Index k = 0;
   for(; k + PEEL <= depth; k+= PEEL)
   {
-    prefetch(rhs_ptr);
+    EIGEN_POWER_PREFETCH(rhs_ptr);
     MICRO_PREFETCH
-    asm("#unrolled inner loop?");
-    for (int l = 0; l < PEEL; l++) {
-      MICRO_ONE
-    }
-    asm("#unrolled inner loop end?");
+    MICRO_ONE_PEEL4
   }
   for(; k < depth; k++)
   {
-    MICRO_ONE
+    MICRO_ONE4
   }
   MICRO_STORE
 
   row += unroll_factor*accCols;
+asm("#gemm end");
 }
 
-template<int N, typename Scalar, typename Packet, typename Index, const Index accCols>
-EIGEN_STRONG_INLINE void MICRO_COL(
-  MICRO_SRC,
-  const Scalar* &rhs_ptr,
-  MICRO_COL_DST,
-  Index remaining_rows)
-  {
-    Packet rhsV[1];
-    rhsV[0] = pset1<Packet>(rhs_ptr[0]);
-    asm("#unrolled pger? begin");
-    MICRO_WORK
-    asm("#unrolled pger? end");
-    rhs_ptr += remaining_rows;
-  }
-
 template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
 EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
@@ -1914,8 +1604,8 @@
   Index remaining_cols,
   const Packet& pAlpha)
 {
-  const Scalar *rhs_ptr = rhs_base;
-  const Scalar *lhs_ptr0, *lhs_ptr1, *lhs_ptr2, *lhs_ptr3, *lhs_ptr4, *lhs_ptr5, *lhs_ptr6, *lhs_ptr7;
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7;
   PacketBlock<Packet,1> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
   PacketBlock<Packet,1> acc;
 
@@ -1925,15 +1615,13 @@
   Index k = 0;
   for(; k + PEEL <= depth; k+= PEEL)
   {
-    prefetch(rhs_ptr);
+    EIGEN_POWER_PREFETCH(rhs_ptr);
     MICRO_PREFETCH
-    for (int l = 0; l < PEEL; l++) {
-      MICRO_COL_ONE
-    }
+    MICRO_ONE_PEEL1
   }
   for(; k < depth; k++)
   {
-    MICRO_COL_ONE
+    MICRO_ONE1
   }
   MICRO_COL_STORE
 
@@ -1943,8 +1631,8 @@
 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
 EIGEN_STRONG_INLINE void gemm_unrolled_col(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
@@ -1955,10 +1643,10 @@
   const Packet& pAlpha)
 {
 #define MAX_UNROLL 6
-  while(row + MAX_UNROLL*accCols <= rows){
+  while(row + MAX_UNROLL*accCols <= rows) {
     gemm_unrolled_col_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
   }
-  switch( (rows-row)/accCols ){
+  switch( (rows-row)/accCols ) {
 #if MAX_UNROLL > 7
     case 7:
       gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
@@ -2018,16 +1706,15 @@
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        const Scalar *rhs_base = blockB + col*strideB + accRows*offsetB;
-        const Scalar *lhs_base = blockA;
+        const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
+        const Scalar* lhs_base = blockA;
         Index row = 0;
 
-        asm("#jump table");
 #define MAX_UNROLL 6
-        while(row + MAX_UNROLL*accCols <= rows){
+        while(row + MAX_UNROLL*accCols <= rows) {
           gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
         }
-        switch( (rows-row)/accCols ){
+        switch( (rows-row)/accCols ) {
 #if MAX_UNROLL > 7
           case 7:
             gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
@@ -2067,18 +1754,17 @@
             break;
         }
 #undef MAX_UNROLL
-        asm("#jump table end");
 
         if(remaining_rows > 0)
         {
-          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, cols, remaining_rows, pAlpha, pMask);
+          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
         }
     }
 
     if(remaining_cols > 0)
     {
-      const Scalar *rhs_base = blockB + col*strideB + remaining_cols*offsetB;
-      const Scalar *lhs_base = blockA;
+      const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
+      const Scalar* lhs_base = blockA;
 
       for(; col < cols; col++)
       {
@@ -2095,316 +1781,623 @@
     }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const int accRows, const int accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
-          Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
-      const int remaining_rows = rows % accCols;
-      const int remaining_cols = cols % accRows;
-      const int accColsC = accCols / 2;
-      int advanceCols = 2;
-      int advanceRows = 2;
+#define accColsC (accCols / 2)
+#define advanceRows ((LhsIsReal) ? 1 : 2)
+#define advanceCols ((RhsIsReal) ? 1 : 2)
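+// accColsC is the number of complex values per result packet (half of accCols, since each
+// complex value takes two scalars). advanceRows/advanceCols are addressing multipliers for
+// the packed blocks: 1 when the corresponding operand is real, 2 when it carries a separate
+// imaginary block.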
 
-      if(LhsIsReal) advanceRows = 1;
-      if(RhsIsReal) advanceCols = 1;
+// PEEL_COMPLEX loop factor.
+#define PEEL_COMPLEX 3
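+// (Number of depth (k) iterations handled per pass of the peeled loop in the complex kernels.)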
+
+template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_COL(
+  const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
+  const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
+  PacketBlock<Packet,1> &accReal, PacketBlock<Packet,1> &accImag,
+  Index remaining_rows,
+  Index remaining_cols)
+{
+  Packet rhsV[1], rhsVi[1];
+  rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
+  if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
+  pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
+  lhs_ptr_real += remaining_rows;
+  if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  rhs_ptr_real += remaining_cols;
+  if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index remaining_rows,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
+  const Scalar* lhs_ptr_imag;
+  if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  PacketBlock<Packet,1> accReal, accImag;
+  PacketBlock<Packet,1> taccReal, taccImag;
+  PacketBlock<Packetc,1> acc0, acc1;
+
+  bsetzero<Scalar, Packet>(accReal);
+  bsetzero<Scalar, Packet>(accImag);
+
+  Index remaining_depth = (depth & -accRows);
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    EIGEN_POWER_PREFETCH(lhs_ptr_real);
+    if(!LhsIsReal) {
+      EIGEN_POWER_PREFETCH(lhs_ptr_imag);
+    }
+    for (int l = 0; l < PEEL_COMPLEX; l++) {
+      MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
+    }
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
+  }
+
+  for(; k < depth; k++)
+  {
+    Packet rhsV[1], rhsVi[1];
+    rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
+    if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
+    pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
+    lhs_ptr_real += remaining_rows;
+    if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+    rhs_ptr_real += remaining_cols;
+    if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
+  }
+
+  bscalec<Packet,1>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
+  bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
+
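+  // With float data and a single remaining row only the first complex lane of acc0 is valid,
+  // so accumulate it as a scalar instead of storing a whole packet.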
+  if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
+  {
+    res(row + 0, col + 0) += pfirst<Packetc>(acc0.packet[0]);
+  } else {
+    acc0.packet[0] += res.template loadPacket<Packetc>(row + 0, col + 0);
+    res.template storePacketBlock<Packetc,1>(row + 0, col + 0, acc0);
+    if(remaining_rows > accColsC) {
+      res(row + accColsC, col + 0) += pfirst<Packetc>(acc1.packet[0]);
+    }
+  }
+}
+
+template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_ROW(
+  const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
+  const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
+  PacketBlock<Packet,4> &accReal, PacketBlock<Packet,4> &accImag,
+  Index remaining_rows)
+{
+  Packet rhsV[4], rhsVi[4];
+  pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+  if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
+  pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
+  lhs_ptr_real += remaining_rows;
+  if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  rhs_ptr_real += accRows;
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask)
+{
+asm("#gemm_complex begin");
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
+  const Scalar* lhs_ptr_imag;
+  if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  PacketBlock<Packet,4> accReal, accImag;
+  PacketBlock<Packet,4> taccReal, taccImag;
+  PacketBlock<Packetc,4> acc0, acc1;
+  PacketBlock<Packetc,8> tRes;
+
+  bsetzero<Scalar, Packet>(accReal);
+  bsetzero<Scalar, Packet>(accImag);
+
+  Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    EIGEN_POWER_PREFETCH(lhs_ptr_real);
+    if(!LhsIsReal) {
+      EIGEN_POWER_PREFETCH(lhs_ptr_imag);
+    }
+    for (int l = 0; l < PEEL_COMPLEX; l++) {
+      MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
+    }
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
+  }
+
+  if ((remaining_depth == depth) && (rows >= accCols))
+  {
+    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row, col);
+    bscalec<Packet>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1);
+    res.template storePacketBlock<Packetc,4>(row + 0, col, acc0);
+    res.template storePacketBlock<Packetc,4>(row + accColsC, col, acc1);
+  } else {
+    for(; k < depth; k++)
+    {
+      Packet rhsV[4], rhsVi[4];
+      pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+      if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
+      pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
+      lhs_ptr_real += remaining_rows;
+      if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+      rhs_ptr_real += accRows;
+      if(!RhsIsReal) rhs_ptr_imag += accRows;
+    }
+
+    bscalec<Packet,4>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
+    bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
+
+    if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
+    {
+      for(Index j = 0; j < 4; j++) {
+        res(row + 0, col + j) += pfirst<Packetc>(acc0.packet[j]);
+      }
+    } else {
+      for(Index j = 0; j < 4; j++) {
+        PacketBlock<Packetc,1> acc2;
+        acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, col + j) + acc0.packet[j];
+        res.template storePacketBlock<Packetc,1>(row + 0, col + j, acc2);
+        if(remaining_rows > accColsC) {
+          res(row + accColsC, col + j) += pfirst<Packetc>(acc1.packet[j]);
+        }
+      }
+    }
+  }
+asm("#gemm_complex end");
+}
+
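+// Complex counterparts of the MICRO_* helpers: they unroll over up to 5 accumulator pairs
+// (real and imaginary) and up to PEEL_COMPLEX broadcast RHS values.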
+#define MICRO_COMPLEX_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4)
+
+#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+    MICRO_COMPLEX_UNROLL(func2); \
+    func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel)
+
+#define MICRO_COMPLEX_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
+    lhs_ptr_real##iter += accCols; \
+    if(!LhsIsReal) { \
+      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
+      lhs_ptr_imag##iter += accCols; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+    } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+  }
+
+#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
+  if (unroll_factor > iter) { \
+    pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \
+  if (unroll_factor > iter) { \
+    pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
+  if (PEEL_COMPLEX > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
+    pbroadcast4_old<Packet>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+    if(!RhsIsReal) { \
+      pbroadcast4_old<Packet>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+    } \
+    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \
+  if (PEEL_COMPLEX > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
+    rhsV##peel[0] = pset1<Packet>(rhs_ptr_real[remaining_cols * peel]); \
+    if(!RhsIsReal) { \
+      rhsVi##peel[0] = pset1<Packet>(rhs_ptr_imag[remaining_cols * peel]); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+    } \
+    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
+  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \
+  func(func1,func2,0); func(func1,func2,1); \
+  func(func1,func2,2); func(func1,func2,3); \
+  func(func1,func2,4); func(func1,func2,5); \
+  func(func1,func2,6); func(func1,func2,7); \
+  func(func1,func2,8); func(func1,func2,9);
+
+#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
+  Packet rhsV0[M], rhsVi0[M];\
+  func(func1,func2,0);
+
+#define MICRO_COMPLEX_ONE_PEEL4 \
+  MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += (accRows * PEEL_COMPLEX); \
+  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX);
+
+#define MICRO_COMPLEX_ONE4 \
+  MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += accRows; \
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+
+#define MICRO_COMPLEX_ONE_PEEL1 \
+  MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \
+  if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX);
+
+#define MICRO_COMPLEX_ONE1 \
+  MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += remaining_cols; \
+  if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
+
+#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzero<Scalar, Packet>(accReal##iter); \
+    bsetzero<Scalar, Packet>(accImag##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accReal##iter); \
+    EIGEN_UNUSED_VARIABLE(accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)
+
+#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
+    if(!LhsIsReal) { \
+      lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
+    } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
+  }
+
+#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
+
+#define MICRO_COMPLEX_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
+    if(!LhsIsReal) { \
+      EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
+    } \
+  }
+
+#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
+
+#define MICRO_COMPLEX_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
+    bscalec<Packet,4>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
+    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
+    res.template storePacketBlock<Packetc,4>(row + iter*accCols + 0, col, acc0); \
+    res.template storePacketBlock<Packetc,4>(row + iter*accCols + accColsC, col, acc1); \
+  }
+
+#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
+
+#define MICRO_COMPLEX_COL_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
+    bscalec<Packet,1>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
+    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
+    res.template storePacketBlock<Packetc,1>(row + iter*accCols + 0, col, acc0); \
+    res.template storePacketBlock<Packetc,1>(row + iter*accCols + accColsC, col, acc1); \
+  }
+
+#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index col,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+asm("#gemm_complex_unrolled begin");
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) {
+    rhs_ptr_imag = rhs_base + accRows*strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  }
+  const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1;
+  const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3;
+  const Scalar* lhs_ptr_real4, * lhs_ptr_imag4;
+  PacketBlock<Packet,4> accReal0, accImag0, accReal1, accImag1;
+  PacketBlock<Packet,4> accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet,4> accReal4, accImag4;
+  PacketBlock<Packet,4> taccReal, taccImag;
+  PacketBlock<Packetc,4> acc0, acc1;
+  PacketBlock<Packetc,8> tRes;
+
+  MICRO_COMPLEX_SRC_PTR
+  MICRO_COMPLEX_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_PREFETCH
+    MICRO_COMPLEX_ONE_PEEL4
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COMPLEX_ONE4
+  }
+  MICRO_COMPLEX_STORE
+
+  row += unroll_factor*accCols;
+asm("#gemm_complex_unrolled end");
+}
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) {
+    rhs_ptr_imag = rhs_base + remaining_cols*strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  }
+  const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1;
+  const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3;
+  const Scalar* lhs_ptr_real4, * lhs_ptr_imag4;
+  PacketBlock<Packet,1> accReal0, accImag0, accReal1, accImag1;
+  PacketBlock<Packet,1> accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet,1> accReal4, accImag4;
+  PacketBlock<Packet,1> taccReal, taccImag;
+  PacketBlock<Packetc,1> acc0, acc1;
+  PacketBlock<Packetc,2> tRes;
+
+  MICRO_COMPLEX_SRC_PTR
+  MICRO_COMPLEX_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_PREFETCH
+    MICRO_COMPLEX_ONE_PEEL1
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COMPLEX_ONE1
+  }
+  MICRO_COMPLEX_COL_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index rows,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+#define MAX_COMPLEX_UNROLL 3
+  while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
+    gemm_complex_unrolled_col_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_UNROLL > 4
+   case 4:
+     gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+     break;
+#endif
+#if MAX_COMPLEX_UNROLL > 3
+   case 3:
+     gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+     break;
+#endif
+#if MAX_COMPLEX_UNROLL > 2
+   case 2:
+     gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+     break;
+#endif
+#if MAX_COMPLEX_UNROLL > 1
+   case 1:
+     gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+     break;
+#endif
+   default:
+     break;
+  }
+#undef MAX_COMPLEX_UNROLL
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+      const Index remaining_cols = cols % accRows;
 
       if( strideA == -1 ) strideA = depth;
       if( strideB == -1 ) strideB = depth;
 
       const Packet pAlphaReal = pset1<Packet>(alpha.real());
       const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+      const Packet pMask = bmask<Packet>((const int)(remaining_rows));
 
-      const Scalar *blockA = (Scalar *) blockAc;
-      const Scalar *blockB = (Scalar *) blockBc;
-
-      Packet conj = pset1<Packet>((Scalar)-1.0);
+      const Scalar* blockA = (Scalar *) blockAc;
+      const Scalar* blockB = (Scalar *) blockBc;
 
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows     )*strideB*accRows;
-        const Scalar *lhs_base = blockA;
-
+        const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
+        const Scalar* lhs_base = blockA;
         Index row = 0;
-        for(; row + accCols <= rows; row += accCols)
-        {
-#define MICRO() \
-            pgerc<Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal1, accImag1, rhs_ptr, rhs_ptr_imag, lhs_ptr1, lhs_ptr_imag1, conj); \
-            lhs_ptr1 += accCols; \
-            rhs_ptr += accRows; \
-            if(!LhsIsReal) \
-              lhs_ptr_imag1 += accCols; \
-            if(!RhsIsReal) \
-              rhs_ptr_imag += accRows;
 
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
-          const Scalar *lhs_ptr1 = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
-          const Scalar *lhs_ptr_imag1 = lhs_ptr1 + accCols*strideA;
-
-          PacketBlock<Packet,4> accReal1, accImag1;
-          bsetzero<Scalar, Packet>(accReal1);
-          bsetzero<Scalar, Packet>(accImag1);
-
-          lhs_ptr1 += accCols*offsetA;
-          if(!LhsIsReal)
-            lhs_ptr_imag1 += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-          if(!RhsIsReal)
-            rhs_ptr_imag += accRows*offsetB;
-          Index k = 0;
-          for(; k + PEEL <= depth; k+=PEEL)
-          {
-            prefetch(rhs_ptr);
-            prefetch(rhs_ptr_imag);
-            prefetch(lhs_ptr1);
-            prefetch(lhs_ptr_imag1);
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-#if PEEL > 8
-            MICRO();
-            MICRO();
+#define MAX_COMPLEX_UNROLL 3
+        while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
+          gemm_complex_unrolled_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+        }
+        switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_UNROLL > 4
+          case 4:
+            gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
 #endif
-          }
-          for(; k < depth; k++)
-          {
-            MICRO();
-          }
-          PacketBlock<Packet,4> taccReal, taccImag;
-          bscalec<Packet>(accReal1, accImag1, pAlphaReal, pAlphaImag, taccReal, taccImag);
+#if MAX_COMPLEX_UNROLL > 3
+          case 3:
+            gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_UNROLL > 2
+          case 2:
+            gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_UNROLL > 1
+          case 1:
+            gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+          default:
+            break;
+        }
+#undef MAX_COMPLEX_UNROLL
 
-          PacketBlock<Packetc, 8> tRes;
-          bload<DataMapper, Packetc, Index, 0>(tRes, res, row, col, accColsC);
-
-          PacketBlock<Packetc, 4> acc1, acc2;
-          bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc1, acc2);
-
-          res.template storePacketBlock<Packetc, 4>(row + 0, col, acc1);
-          res.template storePacketBlock<Packetc, 4>(row + accColsC, col, acc2);
-#undef MICRO
-      }
-          if(remaining_rows > 0)
-          {
-            const Scalar *rhs_ptr  = rhs_base;
-            const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
-            const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
-            const Scalar *lhs_ptr_imag = lhs_ptr + remaining_rows*strideA;
-
-            lhs_ptr += remaining_rows*offsetA;
-            if(!LhsIsReal)
-              lhs_ptr_imag += remaining_rows*offsetA;
-            rhs_ptr += accRows*offsetB;
-            if(!RhsIsReal)
-              rhs_ptr_imag += accRows*offsetB;
-            for(Index k = 0; k < depth; k++)
-            {
-              for(Index arow = 0; arow < remaining_rows; arow++)
-              {
-                Scalar lhs_real = lhs_ptr[arow];
-                Scalar lhs_imag;
-                if(!LhsIsReal) lhs_imag = lhs_ptr_imag[arow];
-
-                Scalarc lhsc;
-
-                lhsc.real(lhs_real);
-                if(!LhsIsReal)
-                {
-                  if(ConjugateLhs)
-                    lhsc.imag(-lhs_imag);
-                  else
-                    lhsc.imag(lhs_imag);
-                } else {
-                  //Lazy approach for now
-                  lhsc.imag((Scalar)0);
-                }
-
-                for(int acol = 0; acol < accRows; acol++ )
-                {
-                  Scalar rhs_real = rhs_ptr[acol];
-                  Scalar rhs_imag;
-                  if(!RhsIsReal) rhs_imag = rhs_ptr_imag[acol];
-                  Scalarc rhsc;
-
-                  rhsc.real(rhs_real);
-                  if(!RhsIsReal)
-                  {
-                    if(ConjugateRhs)
-                      rhsc.imag(-rhs_imag);
-                    else
-                      rhsc.imag(rhs_imag);
-                  } else {
-                    //Lazy approach for now
-                    rhsc.imag((Scalar)0);
-                  }
-                  res(row + arow, col + acol) += alpha*lhsc*rhsc;
-                }
-              }
-              rhs_ptr += accRows;
-              lhs_ptr += remaining_rows;
-              if(!LhsIsReal)
-                lhs_ptr_imag += remaining_rows;
-              if(!RhsIsReal)
-                rhs_ptr_imag += accRows;
-            }
-          }
+        if(remaining_rows > 0)
+        {
+          gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+        }
       }
 
       if(remaining_cols > 0)
       {
-        const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows     )*strideB*accRows;
-        const Scalar *lhs_base = blockA;
-        Index row = 0;
+        const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
+        const Scalar* lhs_base = blockA;
 
-        for(; row + accCols <= rows; row += accCols)
+        for(; col < cols; col++)
         {
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *rhs_ptr_imag = rhs_ptr + remaining_cols*strideB;
-          const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
-          const Scalar *lhs_ptr_imag = lhs_ptr + accCols*strideA;
+          Index row = 0;
 
-          lhs_ptr += accCols*offsetA;
-          if(!LhsIsReal)
-            lhs_ptr_imag += accCols*offsetA;
-          rhs_ptr += remaining_cols*offsetB;
-          if(!RhsIsReal)
-            rhs_ptr_imag += remaining_cols*offsetB;
-          Scalarc scalarAcc[4][4];
-          for(Index arow = 0; arow < 4; arow++ )
+          gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
+
+          if (remaining_rows > 0)
           {
-            for(Index acol = 0; acol < 4; acol++ )
-            {
-              scalarAcc[arow][acol].real((Scalar)0.0);
-              scalarAcc[arow][acol].imag((Scalar)0.0);
-            }
+            gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
           }
-          for(Index k = 0; k < depth; k++)
-          {
-            for(Index arow = 0; arow < accCols; arow++)
-            {
-              Scalar lhs_real = lhs_ptr[arow];
-              Scalar lhs_imag;
-              if(!LhsIsReal)
-              {
-                lhs_imag = lhs_ptr_imag[arow];
-
-                if(ConjugateLhs)
-                  lhs_imag *= -1;
-              } else {
-                lhs_imag = (Scalar)0;
-              }
-
-              for(int acol = 0; acol < remaining_cols; acol++ )
-              {
-                Scalar rhs_real = rhs_ptr[acol];
-                Scalar rhs_imag;
-                if(!RhsIsReal)
-                {
-                  rhs_imag = rhs_ptr_imag[acol];
-
-                  if(ConjugateRhs)
-                    rhs_imag *= -1;
-                } else {
-                  rhs_imag = (Scalar)0;
-                }
-
-                scalarAcc[arow][acol].real(scalarAcc[arow][acol].real() + lhs_real*rhs_real - lhs_imag*rhs_imag);
-                scalarAcc[arow][acol].imag(scalarAcc[arow][acol].imag() + lhs_imag*rhs_real + lhs_real*rhs_imag);
-              }
-            }
-            rhs_ptr += remaining_cols;
-            lhs_ptr += accCols;
-            if(!RhsIsReal)
-              rhs_ptr_imag += remaining_cols;
-            if(!LhsIsReal)
-              lhs_ptr_imag += accCols;
-          }
-          for(int arow = 0; arow < accCols; arow++ )
-          {
-            for(int acol = 0; acol < remaining_cols; acol++ )
-            {
-              Scalar accR = scalarAcc[arow][acol].real();
-              Scalar accI = scalarAcc[arow][acol].imag();
-              Scalar aR = alpha.real();
-              Scalar aI = alpha.imag();
-              Scalar resR = res(row + arow, col + acol).real();
-              Scalar resI = res(row + arow, col + acol).imag();
-
-              res(row + arow, col + acol).real(resR + accR*aR - accI*aI);
-              res(row + arow, col + acol).imag(resI + accR*aI + accI*aR);
-            }
-          }
-        }
-
-        if(remaining_rows > 0)
-        {
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *rhs_ptr_imag = rhs_ptr + remaining_cols*strideB;
-          const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
-          const Scalar *lhs_ptr_imag = lhs_ptr + remaining_rows*strideA;
-
-          lhs_ptr += remaining_rows*offsetA;
-          if(!LhsIsReal)
-            lhs_ptr_imag += remaining_rows*offsetA;
-          rhs_ptr += remaining_cols*offsetB;
-          if(!RhsIsReal)
-            rhs_ptr_imag += remaining_cols*offsetB;
-          for(Index k = 0; k < depth; k++)
-          {
-            for(Index arow = 0; arow < remaining_rows; arow++)
-            {
-              Scalar lhs_real = lhs_ptr[arow];
-              Scalar lhs_imag;
-              if(!LhsIsReal) lhs_imag = lhs_ptr_imag[arow];
-              Scalarc lhsc;
-
-              lhsc.real(lhs_real);
-              if(!LhsIsReal)
-              {
-                if(ConjugateLhs)
-                  lhsc.imag(-lhs_imag);
-                else
-                  lhsc.imag(lhs_imag);
-              } else {
-                lhsc.imag((Scalar)0);
-              }
-
-              for(Index acol = 0; acol < remaining_cols; acol++ )
-              {
-                Scalar rhs_real = rhs_ptr[acol];
-                Scalar rhs_imag;
-                if(!RhsIsReal) rhs_imag = rhs_ptr_imag[acol];
-                Scalarc rhsc;
-
-                rhsc.real(rhs_real);
-                if(!RhsIsReal)
-                {
-                  if(ConjugateRhs)
-                    rhsc.imag(-rhs_imag);
-                  else
-                    rhsc.imag(rhs_imag);
-                } else {
-                  rhsc.imag((Scalar)0);
-                }
-                res(row + arow, col + acol) += alpha*lhsc*rhsc;
-              }
-            }
-            rhs_ptr += remaining_cols;
-            lhs_ptr += remaining_rows;
-            if(!LhsIsReal)
-              lhs_ptr_imag += remaining_rows;
-            if(!RhsIsReal)
-              rhs_ptr_imag += remaining_cols;
-          }
+          rhs_base++;
         }
       }
 }
 
+#undef accColsC
+#undef advanceCols
+#undef advanceRows
+
 /************************************
  * ppc64le template specializations *
  * **********************************/
@@ -2418,7 +2411,7 @@
 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-    lhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode> pack;
+    dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
     pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2432,7 +2425,7 @@
 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-    lhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode> pack;
+    dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
     pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2446,7 +2439,7 @@
 void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  rhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode> pack;
+  dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2460,7 +2453,7 @@
 void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  rhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode> pack;
+  dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2474,7 +2467,7 @@
 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  lhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode> pack;
+  dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2488,7 +2481,7 @@
 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  lhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode> pack;
+  dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
@@ -2501,7 +2494,7 @@
 void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  lhs_cpack<float, true, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode> pack;
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2515,7 +2508,7 @@
 void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  lhs_cpack<float, true, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode> pack;
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2529,7 +2522,7 @@
 void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  rhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode> pack;
+  dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2543,7 +2536,7 @@
 void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  rhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode> pack;
+  dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2557,7 +2550,7 @@
 void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  rhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode> pack;
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2571,7 +2564,7 @@
 void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  rhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode> pack;
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2585,7 +2578,7 @@
 void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  lhs_cpack<double, true, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode> pack;
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2599,7 +2592,7 @@
 void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  lhs_cpack<double, true, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode> pack;
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2613,7 +2606,7 @@
 void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  rhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode> pack;
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2627,7 +2620,7 @@
 void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  rhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode> pack;
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2687,8 +2680,8 @@
                Index rows, Index depth, Index cols, std::complex<float> alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
-    const int accRows = quad_traits<float>::rows;
-    const int accCols = quad_traits<float>::size;
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
     void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
           Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
 
@@ -2726,8 +2719,8 @@
                Index rows, Index depth, Index cols, std::complex<float> alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
-    const int accRows = quad_traits<float>::rows;
-    const int accCols = quad_traits<float>::size;
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
     void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
           Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
@@ -2764,8 +2757,8 @@
                Index rows, Index depth, Index cols, std::complex<float> alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
-    const int accRows = quad_traits<float>::rows;
-    const int accCols = quad_traits<float>::size;
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
     void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
           Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
@@ -2839,8 +2832,8 @@
                Index rows, Index depth, Index cols, std::complex<double> alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
-    const int accRows = quad_traits<double>::rows;
-    const int accCols = quad_traits<double>::size;
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
     void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
           Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
@@ -2877,8 +2870,8 @@
                Index rows, Index depth, Index cols, std::complex<double> alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
-    const int accRows = quad_traits<double>::rows;
-    const int accCols = quad_traits<double>::size;
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
     void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
           Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
@@ -2915,8 +2908,8 @@
                Index rows, Index depth, Index cols, std::complex<double> alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
-    const int accRows = quad_traits<double>::rows;
-    const int accCols = quad_traits<double>::size;
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
     void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
           Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
index a1799c0..6e74116 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -1,3 +1,10 @@
+//#define EIGEN_POWER_USE_PREFETCH  // Use prefetching in gemm routines
+#ifdef EIGEN_POWER_USE_PREFETCH
+#define EIGEN_POWER_PREFETCH(p)  prefetch(p)
+#else
+#define EIGEN_POWER_PREFETCH(p)
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -5,8 +12,8 @@
 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
 EIGEN_STRONG_INLINE void gemm_extra_col(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
@@ -16,16 +23,17 @@
   Index remaining_cols,
   const Packet& pAlpha);
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
 EIGEN_STRONG_INLINE void gemm_extra_row(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
   Index row,
   Index col,
+  Index rows,
   Index cols,
   Index remaining_rows,
   const Packet& pAlpha,
@@ -34,8 +42,8 @@
 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
 EIGEN_STRONG_INLINE void gemm_unrolled_col(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
@@ -48,6 +56,71 @@
 template<typename Packet>
 EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows);
 
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index remaining_rows,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag);
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask);
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index rows,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag);
+
+template<typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs);
+
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col);
+
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_STRONG_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col);
+
+template<typename Packet>
+EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha);
+
+template<typename Packet, int N>
+EIGEN_STRONG_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
+
 const static Packet16uc p16uc_SETCOMPLEX32_FIRST = {  0,  1,  2,  3,
                                                      16, 17, 18, 19,
                                                       4,  5,  6,  7,
@@ -68,7 +141,7 @@
 
 // Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
 template<typename Packet, typename Packetc>
-EIGEN_STRONG_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
+EIGEN_STRONG_INLINE void bcouple_common(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
 {
   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
   acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST);
@@ -79,6 +152,12 @@
   acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND);
   acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND);
   acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND);
+}
+
+template<typename Packet, typename Packetc>
+EIGEN_STRONG_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
+{
+  bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
 
   acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
   acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
@@ -91,8 +170,26 @@
   acc2.packet[3] = padd<Packetc>(tRes.packet[7], acc2.packet[3]);
 }
 
+template<typename Packet, typename Packetc>
+EIGEN_STRONG_INLINE void bcouple_common(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
+{
+  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
+
+  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
+}
+
+template<typename Packet, typename Packetc>
+EIGEN_STRONG_INLINE void bcouple(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,2>& tRes, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
+{
+  bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
+
+  acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
+
+  acc2.packet[0] = padd<Packetc>(tRes.packet[1], acc2.packet[0]);
+}
+
 template<>
-EIGEN_STRONG_INLINE void bcouple<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd,8>& tRes, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
+EIGEN_STRONG_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
 {
   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
   acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST);
@@ -103,23 +200,21 @@
   acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND);
   acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND);
   acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND);
+}
 
-  acc1.packet[0] = padd<Packet1cd>(tRes.packet[0], acc1.packet[0]);
-  acc1.packet[1] = padd<Packet1cd>(tRes.packet[1], acc1.packet[1]);
-  acc1.packet[2] = padd<Packet1cd>(tRes.packet[2], acc1.packet[2]);
-  acc1.packet[3] = padd<Packet1cd>(tRes.packet[3], acc1.packet[3]);
+template<>
+EIGEN_STRONG_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,1>& taccReal, PacketBlock<Packet2d,1>& taccImag, PacketBlock<Packet1cd, 1>& acc1, PacketBlock<Packet1cd, 1>& acc2)
+{
+  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
 
-  acc2.packet[0] = padd<Packet1cd>(tRes.packet[4], acc2.packet[0]);
-  acc2.packet[1] = padd<Packet1cd>(tRes.packet[5], acc2.packet[1]);
-  acc2.packet[2] = padd<Packet1cd>(tRes.packet[6], acc2.packet[2]);
-  acc2.packet[3] = padd<Packet1cd>(tRes.packet[7], acc2.packet[3]);
+  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
 }
 
 // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar *rhs)
+EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar* rhs)
 {
-    return *((Packet *)rhs);
+  return *reinterpret_cast<Packet *>(const_cast<Scalar *>(rhs));
 }
 
 } // end namespace internal
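
For reference, a minimal scalar model of the real/imaginary coupling performed per packet by bcouple_common/bcouple above, under the assumption that the SETCOMPLEX32 permute masks pair lanes 0-1 and lanes 2-3; bcouple_lane_sketch is a hypothetical helper for illustration only, not part of Eigen:

#include <complex>

// Hypothetical sketch: p16uc_SETCOMPLEX32_FIRST interleaves lanes 0-1 of the real
// and imaginary accumulators into complex pairs (acc1), p16uc_SETCOMPLEX32_SECOND
// does the same for lanes 2-3 (acc2); bcouple then adds the previously loaded
// result block on top of each coupled pair.
void bcouple_lane_sketch(const float taccReal[4], const float taccImag[4],
                         const std::complex<float> tRes1[2],
                         const std::complex<float> tRes2[2],
                         std::complex<float> acc1[2], std::complex<float> acc2[2])
{
  for (int l = 0; l < 2; ++l) {
    acc1[l] = std::complex<float>(taccReal[l],     taccImag[l])     + tRes1[l];
    acc2[l] = std::complex<float>(taccReal[l + 2], taccImag[l + 2]) + tRes2[l];
  }
}
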
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 64f1172..8edf79c 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,149 +23,41 @@
 
 namespace internal {
 
-const static Packet16uc MMA_p16uc_SETCOMPLEX32_FIRST = {  0,  1,  2,  3,
-                                                         16, 17, 18, 19,
-                                                          4,  5,  6,  7,
-                                                         20, 21, 22, 23};
-
-const static Packet16uc MMA_p16uc_SETCOMPLEX32_SECOND = {  8,  9, 10, 11,
-                                                          24, 25, 26, 27,
-                                                          12, 13, 14, 15,
-                                                          28, 29, 30, 31};
-//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64
-const static Packet16uc MMA_p16uc_SETCOMPLEX64_FIRST = {  0,  1,  2,  3,  4,  5,  6,  7,
-                                                         16, 17, 18, 19, 20, 21, 22, 23};
-
-//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64
-const static Packet16uc MMA_p16uc_SETCOMPLEX64_SECOND = {  8,  9, 10, 11, 12, 13, 14, 15,
-                                                          24, 25, 26, 27, 28, 29, 30, 31};
-
-
-// Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
-template<typename Packet, typename Packetc>
-EIGEN_STRONG_INLINE void bcoupleMMA(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
-{
-  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], MMA_p16uc_SETCOMPLEX32_FIRST);
-  acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], MMA_p16uc_SETCOMPLEX32_FIRST);
-  acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], MMA_p16uc_SETCOMPLEX32_FIRST);
-  acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], MMA_p16uc_SETCOMPLEX32_FIRST);
-
-  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], MMA_p16uc_SETCOMPLEX32_SECOND);
-  acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], MMA_p16uc_SETCOMPLEX32_SECOND);
-  acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], MMA_p16uc_SETCOMPLEX32_SECOND);
-  acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], MMA_p16uc_SETCOMPLEX32_SECOND);
-
-  acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
-  acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
-  acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
-  acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
-
-  acc2.packet[0] = padd<Packetc>(tRes.packet[4], acc2.packet[0]);
-  acc2.packet[1] = padd<Packetc>(tRes.packet[5], acc2.packet[1]);
-  acc2.packet[2] = padd<Packetc>(tRes.packet[6], acc2.packet[2]);
-  acc2.packet[3] = padd<Packetc>(tRes.packet[7], acc2.packet[3]);
-}
-
-template<>
-EIGEN_STRONG_INLINE void bcoupleMMA<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd,8>& tRes, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
-{
-  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], MMA_p16uc_SETCOMPLEX64_FIRST);
-  acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], MMA_p16uc_SETCOMPLEX64_FIRST);
-  acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], MMA_p16uc_SETCOMPLEX64_FIRST);
-  acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], MMA_p16uc_SETCOMPLEX64_FIRST);
-
-  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], MMA_p16uc_SETCOMPLEX64_SECOND);
-  acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], MMA_p16uc_SETCOMPLEX64_SECOND);
-  acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], MMA_p16uc_SETCOMPLEX64_SECOND);
-  acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], MMA_p16uc_SETCOMPLEX64_SECOND);
-
-  acc1.packet[0] = padd<Packet1cd>(tRes.packet[0], acc1.packet[0]);
-  acc1.packet[1] = padd<Packet1cd>(tRes.packet[1], acc1.packet[1]);
-  acc1.packet[2] = padd<Packet1cd>(tRes.packet[2], acc1.packet[2]);
-  acc1.packet[3] = padd<Packet1cd>(tRes.packet[3], acc1.packet[3]);
-
-  acc2.packet[0] = padd<Packet1cd>(tRes.packet[4], acc2.packet[0]);
-  acc2.packet[1] = padd<Packet1cd>(tRes.packet[5], acc2.packet[1]);
-  acc2.packet[2] = padd<Packet1cd>(tRes.packet[6], acc2.packet[2]);
-  acc2.packet[3] = padd<Packet1cd>(tRes.packet[7], acc2.packet[3]);
-}
-
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadLhsMMA(const Scalar *lhs)
-{
-  return *((Packet *)lhs);
-}
-
-template<typename Packet>
-EIGEN_STRONG_INLINE PacketBlock<Packet,2> pmul(const PacketBlock<Packet,2>& a, const Packet& b)
-{
-  PacketBlock<Packet,2> pb;
-  pb.packet[0] = a.packet[0]*b;
-  pb.packet[1] = a.packet[1]*b;
-  return pb;
-}
-
-template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void bsetzeroMMA(__vector_quad *acc)
+EIGEN_STRONG_INLINE void bsetzeroMMA(__vector_quad* acc)
 {
   __builtin_mma_xxsetaccz(acc);
 }
 
-template<typename DataMapper, typename Index, typename Packet>
-EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad *acc)
+template<typename DataMapper, typename Index, typename Packet, const Index accCols>
+EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
 {
   PacketBlock<Packet, 4> result;
   __builtin_mma_disassemble_acc(&result.packet, acc);
 
-  result.packet[0] = pmadd<Packet>(alpha, result.packet[0], data.template loadPacket<Packet>(i, j + 0));
-  result.packet[1] = pmadd<Packet>(alpha, result.packet[1], data.template loadPacket<Packet>(i, j + 1));
-  result.packet[2] = pmadd<Packet>(alpha, result.packet[2], data.template loadPacket<Packet>(i, j + 2));
-  result.packet[3] = pmadd<Packet>(alpha, result.packet[3], data.template loadPacket<Packet>(i, j + 3));
+  PacketBlock<Packet, 4> tRes;
+  bload<DataMapper, Packet, Index, accCols, 0, ColMajor>(tRes, data, i, j);
 
-  data.template storePacketBlock<Packet, 4>(i, j, result);
+  bscale<Packet>(tRes, result, alpha);
+
+  data.template storePacketBlock<Packet, 4>(i, j, tRes);
 }
 
-template<typename DataMapper, typename Index, typename Packet, typename Packetc, int N>
-EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad *accReal, __vector_quad *accImag, const int accColsC)
+template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC, int N>
+EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
 {
   PacketBlock<Packet, 4> resultReal, resultImag;
   __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
   __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
 
-  PacketBlock<Packet,4> taccReal, taccImag;
-  taccReal.packet[0] = pmul<Packet>(resultReal.packet[0], alphaReal);
-  taccReal.packet[1] = pmul<Packet>(resultReal.packet[1], alphaReal);
-  taccReal.packet[2] = pmul<Packet>(resultReal.packet[2], alphaReal);
-  taccReal.packet[3] = pmul<Packet>(resultReal.packet[3], alphaReal);
-
-  taccImag.packet[0] = pmul<Packet>(resultImag.packet[0], alphaReal);
-  taccImag.packet[1] = pmul<Packet>(resultImag.packet[1], alphaReal);
-  taccImag.packet[2] = pmul<Packet>(resultImag.packet[2], alphaReal);
-  taccImag.packet[3] = pmul<Packet>(resultImag.packet[3], alphaReal);
-
-  taccReal.packet[0] = psub<Packet>(taccReal.packet[0], pmul<Packet>(resultImag.packet[0], alphaImag));
-  taccReal.packet[1] = psub<Packet>(taccReal.packet[1], pmul<Packet>(resultImag.packet[1], alphaImag));
-  taccReal.packet[2] = psub<Packet>(taccReal.packet[2], pmul<Packet>(resultImag.packet[2], alphaImag));
-  taccReal.packet[3] = psub<Packet>(taccReal.packet[3], pmul<Packet>(resultImag.packet[3], alphaImag));
-
-  taccImag.packet[0] = pmadd<Packet>(resultReal.packet[0], alphaImag, taccImag.packet[0]);
-  taccImag.packet[1] = pmadd<Packet>(resultReal.packet[1], alphaImag, taccImag.packet[1]);
-  taccImag.packet[2] = pmadd<Packet>(resultReal.packet[2], alphaImag, taccImag.packet[2]);
-  taccImag.packet[3] = pmadd<Packet>(resultReal.packet[3], alphaImag, taccImag.packet[3]);
-
   PacketBlock<Packetc, 8> tRes;
-  tRes.packet[0] = data.template loadPacket<Packetc>(i + N*accColsC, j + 0);
-  tRes.packet[1] = data.template loadPacket<Packetc>(i + N*accColsC, j + 1);
-  tRes.packet[2] = data.template loadPacket<Packetc>(i + N*accColsC, j + 2);
-  tRes.packet[3] = data.template loadPacket<Packetc>(i + N*accColsC, j + 3);
+  bload<DataMapper, Packetc, Index, accColsC, N, ColMajor>(tRes, data, i, j);
 
-  tRes.packet[4] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 0);
-  tRes.packet[5] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 1);
-  tRes.packet[6] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 2);
-  tRes.packet[7] = data.template loadPacket<Packetc>(i + (N+1)*accColsC, j + 3);
+  PacketBlock<Packet,4> taccReal, taccImag;
+  bscalec<Packet,4>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag);
 
   PacketBlock<Packetc, 4> acc1, acc2;
-  bcoupleMMA<Packet, Packetc>(taccReal, taccImag, tRes, acc1, acc2);
+  bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc1, acc2);
 
   data.template storePacketBlock<Packetc, 4>(i + N*accColsC, j, acc1);
   data.template storePacketBlock<Packetc, 4>(i + (N+1)*accColsC, j, acc2);
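// Note: bscalec<Packet,4> above is assumed to perform, per lane, the same complex
// scaling of the disassembled accumulator by alpha that the deleted explicit
// pmul/psub/pmadd sequence did. Scalar sketch of that per-lane arithmetic
// (bscalec_lane_sketch is a hypothetical helper shown for clarity only):
void bscalec_lane_sketch(float accR, float accI, float aR, float aI,
                         float& taccR, float& taccI)
{
  taccR = accR * aR - accI * aI;  // real part of (accR + i*accI) * (aR + i*aI)
  taccI = accI * aR + accR * aI;  // imaginary part
}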
@@ -172,7 +65,7 @@
 
 // Defaults to float32; since Eigen still supports C++03 we can't use default template arguments
 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pgerMMA(__vector_quad *acc, const RhsPacket& a, const LhsPacket& b)
+EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
 {
   if(NegativeAccumulate)
   {
@@ -182,110 +75,148 @@
   }
 }
 
-template<>
-EIGEN_STRONG_INLINE void pgerMMA<Packet2d, PacketBlock<Packet2d, 2>, false>(__vector_quad *acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
 {
-  __vector_pair *a0 = (__vector_pair *)(&a.packet[0]);
-  __builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b);
+  __vector_pair* a0 = (__vector_pair *)(&a.packet[0]);
+  if(NegativeAccumulate)
+  {
+    __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b);
+  }
 }
 
-template<>
-EIGEN_STRONG_INLINE void pgerMMA<Packet2d, PacketBlock<Packet2d, 2>, true>(__vector_quad *acc, const PacketBlock<Packet2d, 2>& a, const Packet2d& b)
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
 {
-  __vector_pair *a0 = (__vector_pair *)(&a.packet[0]);
-  __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b);
+  if(NegativeAccumulate)
+  {
+    __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  }
 }
 
-template<>
-EIGEN_STRONG_INLINE void pgerMMA<Packet2d, __vector_pair, false>(__vector_quad *acc, const __vector_pair& a, const Packet2d& b)
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet4f& b)
 {
-  __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  EIGEN_UNUSED_VARIABLE(acc); // Just for compilation
+  EIGEN_UNUSED_VARIABLE(a);
+  EIGEN_UNUSED_VARIABLE(b);
 }
 
-template<>
-EIGEN_STRONG_INLINE void pgerMMA<Packet2d, __vector_pair, true>(__vector_quad *acc, const __vector_pair& a, const Packet2d& b)
+template<typename Scalar, typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi)
 {
-  __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
-}
-
-template<>
-EIGEN_STRONG_INLINE void pgerMMA<Packet4f, __vector_pair, false>(__vector_quad *acc, const __vector_pair& a, const Packet4f& b)
-{
-  // Just for compilation
-  EIGEN_UNUSED_VARIABLE(acc)
-  EIGEN_UNUSED_VARIABLE(a)
-  EIGEN_UNUSED_VARIABLE(b)
-}
-
-template<>
-EIGEN_STRONG_INLINE void pgerMMA<Packet4f, __vector_pair, true>(__vector_quad *acc, const __vector_pair& a, const Packet4f& b)
-{
-  // Just for compilation
-  EIGEN_UNUSED_VARIABLE(acc)
-  EIGEN_UNUSED_VARIABLE(a)
-  EIGEN_UNUSED_VARIABLE(b)
+  pgerMMA<Packet, RhsPacket, false>(accReal,  rhsV,  lhsV);
+  if(LhsIsReal) {
+    pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi,  lhsV);
+  } else {
+    if(!RhsIsReal) {
+      pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
+      pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi,  lhsV);
+    } else {
+      EIGEN_UNUSED_VARIABLE(rhsVi);
+    }
+    pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag,  rhsV, lhsVi);
+  }
 }
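
// Sketch (an assumption for illustration, not the library API): the sign choices
// in pgercMMA above amount to the following per-lane complex rank-1 update for the
// complex-times-complex case, where ConjugateLhs/ConjugateRhs flip the sign of the
// respective imaginary inputs. pgerc_lane_sketch is a hypothetical helper.
template<bool ConjugateLhs, bool ConjugateRhs>
void pgerc_lane_sketch(float lhsR, float lhsI, float rhsR, float rhsI,
                       float& accR, float& accI)
{
  const float lI = ConjugateLhs ? -lhsI : lhsI;
  const float rI = ConjugateRhs ? -rhsI : rhsI;
  accR += lhsR * rhsR - lI * rI;  // real part of (lhsR + i*lI) * (rhsR + i*rI)
  accI += lI * rhsR + lhsR * rI;  // imaginary part
}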
 
 // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE void ploadRhsMMA(const Scalar *rhs, Packet &rhsV)
+EIGEN_STRONG_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
 {
-  rhsV = *((Packet *)rhs);
+  rhsV = ploadRhs<Scalar, Packet>((const Scalar*)(rhs));
 } 
 
 template<>
-EIGEN_STRONG_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double *rhs, PacketBlock<Packet2d, 2> &rhsV)
+EIGEN_STRONG_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double* rhs, PacketBlock<Packet2d, 2>& rhsV)
 {
-  rhsV.packet[0] = *((Packet2d *)rhs      );
-  rhsV.packet[1] = *(((Packet2d *)rhs) + 1);
+  rhsV.packet[0] = ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs      ));
+  rhsV.packet[1] = ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1));
 }
 
 template<>
-EIGEN_STRONG_INLINE void ploadRhsMMA<double, __vector_pair>(const double *rhs, __vector_pair &rhsV)
+EIGEN_STRONG_INLINE void ploadRhsMMA<double, __vector_pair>(const double* rhs, __vector_pair& rhsV)
 {
-  __builtin_vsx_assemble_pair(&rhsV, (__vector unsigned char)(*(((Packet2d *)rhs) + 1)), (__vector unsigned char)(*((Packet2d *)rhs)));
+#if EIGEN_COMP_LLVM
+  __builtin_vsx_assemble_pair(&rhsV,
+    (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1))),
+    (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs      ))));
+#else
+  __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs));
+#endif
 }
 
-#define MICRO_MMA_DST \
-  __vector_quad *accZero0, __vector_quad *accZero1, __vector_quad *accZero2, \
-  __vector_quad *accZero3, __vector_quad *accZero4, __vector_quad *accZero5, \
-  __vector_quad *accZero6, __vector_quad *accZero7
+template<>
+EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV)
+{
+  // Just for compilation
+  EIGEN_UNUSED_VARIABLE(rhs);
+  EIGEN_UNUSED_VARIABLE(rhsV);
+}
 
-#define MICRO_MMA_SRC \
-  const Scalar **lhs_ptr0, const Scalar **lhs_ptr1, const Scalar **lhs_ptr2, \
-  const Scalar **lhs_ptr3, const Scalar **lhs_ptr4, const Scalar **lhs_ptr5, \
-  const Scalar **lhs_ptr6, const Scalar **lhs_ptr7
-
-#define MICRO_MMA_ONE \
-  if (sizeof(Scalar) == sizeof(float)) { \
-    MICRO_MMA<unroll_factor, Scalar, Packet, RhsPacket, accRows, accCols>(\
-      &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \
-      rhs_ptr, \
-      &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7); \
-  } else { \
-    MICRO_MMA<unroll_factor, Scalar, Packet, __vector_pair, accRows, accCols>(\
-      &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \
-      rhs_ptr, \
-      &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7); \
-  }
-
-#define MICRO_MMA_WORK_ONE(iter) \
-  if (N > iter) { \
-    Packet lhsV = ploadLhsMMA<Scalar, Packet>(*lhs_ptr##iter); \
-    pgerMMA<Packet, RhsPacket, false>(accZero##iter, rhsV, lhsV); \
-    *lhs_ptr##iter += accCols; \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(accZero##iter); \
-    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
-  }
+// PEEL_MMA loop factor.
+#define PEEL_MMA 7
 
 #define MICRO_MMA_UNROLL(func) \
   func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
 
-#define MICRO_MMA_WORK MICRO_MMA_UNROLL(MICRO_MMA_WORK_ONE)
+#define MICRO_MMA_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
+    lhs_ptr##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+  }
+
+#define MICRO_MMA_WORK_ONE(iter, type, peel) \
+  if (unroll_factor > iter) { \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV##iter); \
+  }
+
+#define MICRO_MMA_TYPE_PEEL(func, func2, type, peel) \
+  if (PEEL_MMA > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    ploadRhsMMA<Scalar, type>(rhs_ptr + (accRows * peel), rhsV##peel); \
+    MICRO_MMA_UNROLL(func2); \
+    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
+    func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
+  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9);
+
+#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \
+  type rhsV0; \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,0);
+
+#define MICRO_MMA_ONE_PEEL \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr += (accRows * PEEL_MMA);
+
+#define MICRO_MMA_ONE \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr += accRows;
 
 #define MICRO_MMA_DST_PTR_ONE(iter) \
-  if (unroll_factor > iter){ \
+  if (unroll_factor > iter) { \
     bsetzeroMMA<Scalar, Packet>(&accZero##iter); \
   } else { \
     EIGEN_UNUSED_VARIABLE(accZero##iter); \
@@ -303,39 +234,24 @@
 #define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE)
 
 #define MICRO_MMA_PREFETCH_ONE(iter) \
-  if (unroll_factor > iter){ \
-    prefetch(lhs_ptr##iter); \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
   }
 
 #define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE)
 
 #define MICRO_MMA_STORE_ONE(iter) \
-  if (unroll_factor > iter){ \
-    storeAccumulator<DataMapper, Index, Packet>(row + iter*accCols, col, res, pAlpha, &accZero##iter); \
+  if (unroll_factor > iter) { \
+    storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, col, res, pAlpha, &accZero##iter); \
   }
 
 #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
 
-// PEEL_MMA loop factor.
-#define PEEL_MMA 10
-
-template<int N, typename Scalar, typename Packet, typename RhsPacket, const Index accRows, const Index accCols>
-EIGEN_STRONG_INLINE void MICRO_MMA(
-  MICRO_MMA_SRC,
-  const Scalar* &rhs_ptr,
-  MICRO_MMA_DST)
-  {
-    RhsPacket rhsV;
-    ploadRhsMMA<Scalar, RhsPacket>(rhs_ptr, rhsV);
-    MICRO_MMA_WORK
-    rhs_ptr += accRows;
-  }
-
 template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
 EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration(
   const DataMapper& res,
-  const Scalar *lhs_base,
-  const Scalar *rhs_base,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
   Index depth,
   Index strideA,
   Index offsetA,
@@ -343,22 +259,20 @@
   Index col,
   const Packet& pAlpha)
 {
-  const Scalar *rhs_ptr = rhs_base;
-  const Scalar *lhs_ptr0, *lhs_ptr1, *lhs_ptr2, *lhs_ptr3, *lhs_ptr4, *lhs_ptr5, *lhs_ptr6, *lhs_ptr7;
+asm("#gemm_MMA begin");
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7;
   __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
 
-  asm("#unrolled MMA start");
   MICRO_MMA_SRC_PTR
   MICRO_MMA_DST_PTR
 
   Index k = 0;
   for(; k + PEEL_MMA <= depth; k+= PEEL_MMA)
   {
-    prefetch(rhs_ptr);
+    EIGEN_POWER_PREFETCH(rhs_ptr);
     MICRO_MMA_PREFETCH
-    for (int l = 0; l < PEEL_MMA; l++) {
-      MICRO_MMA_ONE
-    }
+    MICRO_MMA_ONE_PEEL
   }
   for(; k < depth; k++)
   {
@@ -367,7 +281,7 @@
   MICRO_MMA_STORE
 
   row += unroll_factor*accCols;
-  asm("#unrolled MMA end");
+asm("#gemm_MMA end");
 }
 
 template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
@@ -385,15 +299,15 @@
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        const Scalar *rhs_base = blockB + col*strideB + accRows*offsetB;
-        const Scalar *lhs_base = blockA;
+        const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
+        const Scalar* lhs_base = blockA;
 
         Index row = 0;
 #define MAX_MMA_UNROLL 7
-        while(row + MAX_MMA_UNROLL*accCols <= rows){
+        while(row + MAX_MMA_UNROLL*accCols <= rows) {
           gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
         }
-        switch( (rows-row)/accCols ){
+        switch( (rows-row)/accCols ) {
 #if MAX_MMA_UNROLL > 7
           case 7:
             gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
@@ -436,334 +350,288 @@
 
         if(remaining_rows > 0)
         {
-          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, cols, remaining_rows, pAlpha, pMask);
+          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
         }
-    }
-
-    if(remaining_cols > 0)
-    {
-      const Scalar *rhs_base = blockB + col*strideB + remaining_cols*offsetB;
-      const Scalar *lhs_base = blockA;
-
-      for(; col < cols; col++)
-      {
-        Index row = 0;
-
-        gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
-
-        if (remaining_rows > 0)
-        {
-          gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
-        }
-        rhs_base++;
       }
-    }
+
+      if(remaining_cols > 0)
+      {
+        const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
+        const Scalar* lhs_base = blockA;
+
+        for(; col < cols; col++)
+        {
+          Index row = 0;
+
+          gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
+
+          if (remaining_rows > 0)
+          {
+            gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
+          }
+          rhs_base++;
+        }
+      }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const int accRows, const int accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
-          Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
-      const int remaining_rows = rows % accCols;
-      const int remaining_cols = cols % accRows;
-      const int accColsC = accCols / 2;
-      int advanceCols = 2;
-      int advanceRows = 2;
+#define accColsC (accCols / 2)
+#define advanceRows ((LhsIsReal) ? 1 : 2)
+#define advanceCols ((RhsIsReal) ? 1 : 2)
 
-      if(LhsIsReal) advanceRows = 1;
-      if(RhsIsReal) advanceCols = 1;
+// PEEL_COMPLEX_MMA loop factor.
+#define PEEL_COMPLEX_MMA 7
+
+#define MICRO_COMPLEX_MMA_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4)
+
+#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
+    lhs_ptr_real##iter += accCols; \
+    if(!LhsIsReal) { \
+      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
+      lhs_ptr_imag##iter += accCols; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+    } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
+  if (unroll_factor > iter) { \
+    pgercMMA<Scalar, Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \
+  if (PEEL_COMPLEX_MMA > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
+    ploadRhsMMA<Scalar, type>(rhs_ptr_real + (accRows * peel), rhsV##peel); \
+    if(!RhsIsReal) { \
+      ploadRhsMMA<Scalar, type>(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+    } \
+    MICRO_COMPLEX_MMA_UNROLL(func2); \
+    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
+  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \
+  type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9);
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \
+  type rhsV0, rhsVi0; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0);
+
+#define MICRO_COMPLEX_MMA_ONE_PEEL \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr_real += (accRows * PEEL_COMPLEX_MMA); \
+  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_MMA);
+
+#define MICRO_COMPLEX_MMA_ONE \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr_real += accRows; \
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+
+#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzeroMMA<Scalar, Packet>(&accReal##iter); \
+    bsetzeroMMA<Scalar, Packet>(&accImag##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accReal##iter); \
+    EIGEN_UNUSED_VARIABLE(accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
+
+#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
+    if(!LhsIsReal) { \
+      lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
+    } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE)
+
+#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
+    if(!LhsIsReal) { \
+      EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
+    } \
+  }
+
+#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE)
+
+#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC, 0>(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index col,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+asm("#gemm_complex_MMA begin");
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) {
+    rhs_ptr_imag = rhs_base + accRows*strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  }
+  const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1;
+  const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3;
+  const Scalar* lhs_ptr_real4, * lhs_ptr_imag4;
+  __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4;
+
+  MICRO_COMPLEX_MMA_SRC_PTR
+  MICRO_COMPLEX_MMA_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_COMPLEX_MMA <= depth; k+= PEEL_COMPLEX_MMA)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_MMA_PREFETCH
+    MICRO_COMPLEX_MMA_ONE_PEEL
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COMPLEX_MMA_ONE
+  }
+  MICRO_COMPLEX_MMA_STORE
+
+  row += unroll_factor*accCols;
+asm("#gemm_complex_MMA end");
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+      const Index remaining_cols = cols % accRows;
 
       if( strideA == -1 ) strideA = depth;
       if( strideB == -1 ) strideB = depth;
 
       const Packet pAlphaReal = pset1<Packet>(alpha.real());
       const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+      const Packet pMask = bmask<Packet>((const int)(remaining_rows));
 
-      const Scalar *blockA = (Scalar *) blockAc;
-      const Scalar *blockB = (Scalar *) blockBc;
-
-      Packet conj = pset1<Packet>((Scalar)-1.0f);
+      const Scalar* blockA = (Scalar *) blockAc;
+      const Scalar* blockB = (Scalar *) blockBc;
 
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows     )*strideB*accRows;
-        const Scalar *lhs_base = blockA;
-
+        const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
+        const Scalar* lhs_base = blockA;
         Index row = 0;
 
-        for(; row + accCols <= rows; row += accCols)
-        {
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
-          const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
-          const Scalar *lhs_ptr_imag = lhs_ptr + accCols*strideA;
-
-          __vector_quad accReal, accImag;
-          __builtin_mma_xxsetaccz(&accReal);
-          __builtin_mma_xxsetaccz(&accImag);
-
-          lhs_ptr += accCols*offsetA;
-          if(!LhsIsReal)
-            lhs_ptr_imag += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-          if(!RhsIsReal)
-            rhs_ptr_imag += accRows*offsetB;
-          for(Index k = 0; k < depth; k++)
-          {
-            Packet lhsV = ploadLhsMMA<Scalar, Packet>(lhs_ptr);
-            RhsPacket rhsV = ploadRhs<Scalar, RhsPacket>(rhs_ptr);
-
-            Packet lhsVi = ploadLhsMMA<Scalar, Packet>(lhs_ptr_imag);
-            RhsPacket rhsVi = ploadRhs<Scalar, RhsPacket>(rhs_ptr_imag);
-
-            if(ConjugateLhs && !LhsIsReal) lhsVi = pmul<Packet>(lhsVi, conj);
-            if(ConjugateRhs && !RhsIsReal) rhsVi = pmul<Packet>(rhsVi, conj);
-
-            if(LhsIsReal)
-            {
-              pgerMMA<Packet, RhsPacket, false>(&accReal,  rhsV,  lhsV);
-              pgerMMA<Packet, RhsPacket, false>(&accImag, rhsVi,  lhsV);
-            } else if(RhsIsReal) {
-              pgerMMA<Packet, RhsPacket, false>(&accReal,  rhsV,  lhsV);
-              pgerMMA<Packet, RhsPacket, false>(&accImag,  rhsV, lhsVi);
-            } else {
-              pgerMMA<Packet, RhsPacket, false>(&accReal,  rhsV,  lhsV);
-              pgerMMA<Packet, RhsPacket,  true>(&accReal, rhsVi, lhsVi);
-              pgerMMA<Packet, RhsPacket, false>(&accImag, rhsVi,  lhsV);
-              pgerMMA<Packet, RhsPacket, false>(&accImag,  rhsV, lhsVi);
-            }
-
-            lhs_ptr += accCols;
-            rhs_ptr += accRows;
-            if(!LhsIsReal)
-              lhs_ptr_imag += accCols;
-            if(!RhsIsReal)
-              rhs_ptr_imag += accRows;
-          }
-
-          storeComplexAccumulator<DataMapper, Index, Packet, Packetc, 0>(row, col, res, pAlphaReal, pAlphaImag, &accReal, &accImag, accColsC);
+#define MAX_COMPLEX_MMA_UNROLL 4
+        while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
+          gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
         }
+        switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_MMA_UNROLL > 4
+          case 4:
+            gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 3
+          case 3:
+            gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 2
+          case 2:
+            gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 1
+          case 1:
+            gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+          default:
+            break;
+        }
+#undef MAX_COMPLEX_MMA_UNROLL
 
-          if(remaining_rows > 0)
-          {
-            const Scalar *rhs_ptr  = rhs_base;
-            const Scalar *rhs_ptr_imag = rhs_ptr + accRows*strideB;
-            const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
-            const Scalar *lhs_ptr_imag = lhs_ptr + remaining_rows*strideA;
-
-            lhs_ptr += remaining_rows*offsetA;
-            if(!LhsIsReal)
-              lhs_ptr_imag += remaining_rows*offsetA;
-            rhs_ptr += accRows*offsetB;
-            if(!RhsIsReal)
-              rhs_ptr_imag += accRows*offsetB;
-            for(Index k = 0; k < depth; k++)
-            {
-              for(Index arow = 0; arow < remaining_rows; arow++)
-              {
-                Scalar lhs_real = lhs_ptr[arow];
-                Scalar lhs_imag;
-                if(!LhsIsReal) lhs_imag = lhs_ptr_imag[arow];
-
-                Scalarc lhsc;
-
-                lhsc.real(lhs_real);
-                if(!LhsIsReal)
-                {
-                  if(ConjugateLhs) 
-                    lhsc.imag(-lhs_imag);
-                  else
-                    lhsc.imag(lhs_imag);
-                } else {
-                  //Lazy approach for now
-                  lhsc.imag((Scalar)0);
-                }
-
-                for(int acol = 0; acol < accRows; acol++ )
-                {
-                  Scalar rhs_real = rhs_ptr[acol];
-                  Scalar rhs_imag;
-                  if(!RhsIsReal) rhs_imag = rhs_ptr_imag[acol];
-                  Scalarc rhsc;
-
-                  rhsc.real(rhs_real);
-                  if(!RhsIsReal)
-                  {
-                    if(ConjugateRhs)
-                      rhsc.imag(-rhs_imag);
-                    else
-                      rhsc.imag(rhs_imag);
-                  } else {
-                    //Lazy approach for now
-                    rhsc.imag((Scalar)0);
-                  }
-                  res(row + arow, col + acol) += alpha*lhsc*rhsc;
-                }
-              }
-              rhs_ptr += accRows;
-              lhs_ptr += remaining_rows;
-              if(!LhsIsReal)
-                lhs_ptr_imag += remaining_rows;
-              if(!RhsIsReal)
-                rhs_ptr_imag += accRows;
-            }
-          }
+        if(remaining_rows > 0)
+        {
+          gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+        }
       }
 
       if(remaining_cols > 0)
       {
-        const Scalar *rhs_base = blockB + ( (advanceCols*col)/accRows     )*strideB*accRows;
-        const Scalar *lhs_base = blockA;
-        Index row = 0;
+        const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
+        const Scalar* lhs_base = blockA;
 
-        for(; row + accCols <= rows; row += accCols)
+        for(; col < cols; col++)
         {
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *rhs_ptr_imag = rhs_ptr + remaining_cols*strideB;
-          const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
-          const Scalar *lhs_ptr_imag = lhs_ptr + accCols*strideA;
+          Index row = 0;
 
-          lhs_ptr += accCols*offsetA;
-          if(!LhsIsReal)
-            lhs_ptr_imag += accCols*offsetA;
-          rhs_ptr += remaining_cols*offsetB;
-          if(!RhsIsReal)
-            rhs_ptr_imag += remaining_cols*offsetB;
-          Scalarc scalarAcc[4][4];
-          for(Index arow = 0; arow < 4; arow++ )
+          gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
+
+          if (remaining_rows > 0)
           {
-            for(Index acol = 0; acol < 4; acol++ )
-            {
-              scalarAcc[arow][acol].real((Scalar)0.0f);
-              scalarAcc[arow][acol].imag((Scalar)0.0f);
-            }
+            gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
           }
-          for(Index k = 0; k < depth; k++)
-          {
-            for(Index arow = 0; arow < accCols; arow++)
-            {
-              Scalar lhs_real = lhs_ptr[arow];
-              Scalar lhs_imag;
-              if(!LhsIsReal) 
-              {
-                lhs_imag = lhs_ptr_imag[arow];
-
-                if(ConjugateLhs)
-                  lhs_imag *= -1;
-              } else {
-                lhs_imag = (Scalar)0;
-              }
-
-              for(int acol = 0; acol < remaining_cols; acol++ )
-              {
-                Scalar rhs_real = rhs_ptr[acol];
-                Scalar rhs_imag;
-                if(!RhsIsReal)
-                {
-                  rhs_imag = rhs_ptr_imag[acol];
-
-                  if(ConjugateRhs)
-                    rhs_imag *= -1;
-                } else {
-                  rhs_imag = (Scalar)0;
-                }
-
-                scalarAcc[arow][acol].real(scalarAcc[arow][acol].real() + lhs_real*rhs_real - lhs_imag*rhs_imag);
-                scalarAcc[arow][acol].imag(scalarAcc[arow][acol].imag() + lhs_imag*rhs_real + lhs_real*rhs_imag);
-              }
-            }
-            rhs_ptr += remaining_cols;
-            lhs_ptr += accCols;
-            if(!RhsIsReal)
-              rhs_ptr_imag += remaining_cols;
-            if(!LhsIsReal)
-              lhs_ptr_imag += accCols;
-          }
-          for(int arow = 0; arow < accCols; arow++ )
-          {
-            for(int acol = 0; acol < remaining_cols; acol++ )
-            {
-              Scalar accR = scalarAcc[arow][acol].real();
-              Scalar accI = scalarAcc[arow][acol].imag();
-              Scalar aR = alpha.real();
-              Scalar aI = alpha.imag();
-              Scalar resR = res(row + arow, col + acol).real();
-              Scalar resI = res(row + arow, col + acol).imag();
-
-              res(row + arow, col + acol).real(resR + accR*aR - accI*aI);
-              res(row + arow, col + acol).imag(resI + accR*aI + accI*aR);
-            }
-          }
-        }
-
-        if(remaining_rows > 0)
-        {
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *rhs_ptr_imag = rhs_ptr + remaining_cols*strideB;
-          const Scalar *lhs_ptr = lhs_base + ((advanceRows*row)/accCols)*strideA*accCols;
-          const Scalar *lhs_ptr_imag = lhs_ptr + remaining_rows*strideA;
-
-          lhs_ptr += remaining_rows*offsetA;
-          if(!LhsIsReal)
-            lhs_ptr_imag += remaining_rows*offsetA;
-          rhs_ptr += remaining_cols*offsetB;
-          if(!RhsIsReal)
-            rhs_ptr_imag += remaining_cols*offsetB;
-          for(Index k = 0; k < depth; k++)
-          {
-            for(Index arow = 0; arow < remaining_rows; arow++)
-            {
-              Scalar lhs_real = lhs_ptr[arow];
-              Scalar lhs_imag;
-              if(!LhsIsReal) lhs_imag = lhs_ptr_imag[arow];
-              Scalarc lhsc;
-
-              lhsc.real(lhs_real);
-              if(!LhsIsReal)
-              {
-                if(ConjugateLhs) 
-                  lhsc.imag(-lhs_imag);
-                else
-                  lhsc.imag(lhs_imag);
-              } else {
-                lhsc.imag((Scalar)0);
-              }
-
-              for(Index acol = 0; acol < remaining_cols; acol++ )
-              {
-                Scalar rhs_real = rhs_ptr[acol];
-                Scalar rhs_imag;
-                if(!RhsIsReal) rhs_imag = rhs_ptr_imag[acol];
-                Scalarc rhsc;
-
-                rhsc.real(rhs_real);
-                if(!RhsIsReal)
-                {
-                  if(ConjugateRhs)
-                    rhsc.imag(-rhs_imag);
-                  else
-                    rhsc.imag(rhs_imag);
-                } else {
-                  rhsc.imag((Scalar)0);
-                }
-                res(row + arow, col + acol) += alpha*lhsc*rhsc;
-              }
-            }
-            rhs_ptr += remaining_cols;
-            lhs_ptr += remaining_rows;
-            if(!LhsIsReal)
-              lhs_ptr_imag += remaining_rows;
-            if(!RhsIsReal)
-              rhs_ptr_imag += remaining_cols;
-          }
+          rhs_base++;
         }
       }
 }
 
+#undef accColsC
+#undef advanceRows
+#undef advanceCols
+
 #pragma GCC reset_options
 } // end namespace internal
 
 } // end namespace Eigen
+
 #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
 
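The row loop in the complex MMA kernel above follows a block-then-dispatch pattern: full blocks of MAX_COMPLEX_MMA_UNROLL*accCols rows go through the widest unrolled iteration, the remaining full accCols-sized blocks are dispatched via the switch on (rows-row)/accCols, and anything smaller than accCols falls through to the extra-row path. A minimal standalone sketch of that control flow (names and the printf body are hypothetical stand-ins for the gemm_complex_unrolled_MMA_iteration calls):

#include <cstdio>

template<int unroll>
static void process_block(long& row, long accCols) {
  // stand-in for one gemm_complex_unrolled_MMA_iteration<unroll, ...>() call
  std::printf("block of %d x accCols rows starting at %ld\n", unroll, row);
  row += unroll * accCols;
}

static void process_rows(long rows, long accCols) {
  long row = 0;
  // widest unroll factor while at least 4*accCols rows remain
  while (row + 4 * accCols <= rows) process_block<4>(row, accCols);
  // dispatch the remaining full accCols-sized blocks
  switch ((rows - row) / accCols) {
    case 3: process_block<3>(row, accCols); break;
    case 2: process_block<2>(row, accCols); break;
    case 1: process_block<1>(row, accCols); break;
    default: break;
  }
  // rows % accCols rows are left for the extra-row path
}

int main() { process_rows(23, 2); return 0; }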
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index 3c0cd39..aac60f1 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -562,11 +562,14 @@
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
   return bfloat16(::floorf(float(a)));
 }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
+  return bfloat16(::ceilf(float(a)));
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
   return bfloat16(::rintf(float(a)));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
-  return bfloat16(::ceilf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
+  return bfloat16(::roundf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
   return bfloat16(::fmodf(float(a), float(b)));
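Each of these bfloat16 rounding functions goes through float: the argument is widened, the corresponding ::ceilf/::rintf/::roundf is applied, and the result is narrowed back to bfloat16. A small usage sketch (the printed values assume the default round-to-nearest-even mode for rint):

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::bfloat16 x(2.5f);                              // exactly representable
  std::cout << float(Eigen::numext::ceil(x))  << " "    // 3
            << float(Eigen::numext::rint(x))  << " "    // 2 (ties to even)
            << float(Eigen::numext::round(x)) << "\n";  // 3 (ties away from zero)
  return 0;
}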
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 411640e..87e8c27 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -135,7 +135,7 @@
 // Explicitly multiplies 
 //    a * (2^e)
 // clamping e to the range
-// [numeric_limits<Scalar>::min_exponent-2, numeric_limits<Scalar>::max_exponent]
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
 //
 // This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
 // if 2^e doesn't fit into a normal floating-point Scalar.
@@ -1480,8 +1480,8 @@
   const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
   const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
   EIGEN_CONSTEXPR Scalar huge_exponent =
-      (std::numeric_limits<Scalar>::max_exponent * Scalar(EIGEN_LN2)) /
-      std::numeric_limits<Scalar>::epsilon();
+      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) /
+       NumTraits<Scalar>::epsilon();
   const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
 
   // Predicates for whether y is integer and/or even.
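Using NumTraits instead of std::numeric_limits keeps this generic packet code usable with Eigen's custom scalar types (half, bfloat16), which do not always specialize std::numeric_limits. For built-in scalars the two agree; a minimal check, assuming NumTraits<float>::max_exponent() as used above:

#include <Eigen/Core>
#include <limits>
#include <iostream>

int main() {
  std::cout << Eigen::NumTraits<float>::epsilon() << " == "
            << std::numeric_limits<float>::epsilon() << "\n";
  std::cout << Eigen::NumTraits<float>::max_exponent() << " == "
            << std::numeric_limits<float>::max_exponent << "\n";  // both 128
  return 0;
}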
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 56ff7ce..9f8e8cc 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -718,9 +718,6 @@
   return half(::floorf(float(a)));
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) {
-  return half(::rintf(float(a)));
-}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -729,6 +726,12 @@
   return half(::ceilf(float(a)));
 #endif
 }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) {
+  return half(::rintf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) {
+  return half(::roundf(float(a)));
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
   return half(::fmodf(float(a), float(b)));
 }
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 7d69de6..9cf4e07 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -4053,7 +4053,7 @@
     HasRsqrt = 1,
     HasErf = EIGEN_FAST_MATH,
     HasBessel = 0,  // Issues with accuracy.
-    HasNdtri = 0,
+    HasNdtri = 0
   };
 };
 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index d7b8bc8..db102c7 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -149,11 +149,10 @@
     HasBlend = 1,
     HasCeil = 1,
     HasFloor = 1,
-    HasRint = 1,
-
 #ifdef EIGEN_VECTORIZE_SSE4_1
     HasRound = 1,
 #endif
+    HasRint = 1
   };
 };
 template <>
@@ -175,10 +174,10 @@
     HasBlend = 1,
     HasFloor = 1,
     HasCeil = 1,
-    HasRint = 1,
-  #ifdef EIGEN_VECTORIZE_SSE4_1
+#ifdef EIGEN_VECTORIZE_SSE4_1
     HasRound = 1,
 #endif
+    HasRint = 1
   };
 };
 #endif
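Keeping HasRint after the conditionally compiled HasRound leaves the enumerator list without a trailing comma whether or not EIGEN_VECTORIZE_SSE4_1 is defined (strict C++03/-pedantic builds reject a trailing comma in an enum). The same pattern in isolation, with a hypothetical feature macro:

struct example_packet_traits {
  enum {
    HasCeil  = 1,
    HasFloor = 1,
#ifdef EXAMPLE_HAS_SSE4_1   // hypothetical stand-in for EIGEN_VECTORIZE_SSE4_1
    HasRound = 1,
#endif
    HasRint  = 1            // unconditional entry closes the list in both cases
  };
};

int main() { return example_packet_traits::HasRint - 1; }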
diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h
index 1cda850..f85de30 100644
--- a/Eigen/src/Core/util/IndexedViewHelper.h
+++ b/Eigen/src/Core/util/IndexedViewHelper.h
@@ -78,7 +78,7 @@
 
 // Analogue of std::get<0>(x), but tailored for our needs.
 template<typename T>
-Index first(const T& x) { return x.first(); }
+EIGEN_CONSTEXPR Index first(const T& x) EIGEN_NOEXCEPT { return x.first(); }
 
 // IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by MatrixSlice
 // The generic implementation is a no-op
@@ -100,8 +100,8 @@
   };
   SingleRange(Index val) : m_value(val) {}
   Index operator[](Index) const { return m_value; }
-  Index size() const { return 1; }
-  Index first() const { return m_value; }
+  static EIGEN_CONSTEXPR Index size() EIGEN_NOEXCEPT { return 1; }
+  Index first() const EIGEN_NOEXCEPT { return m_value; }
   Index m_value;
 };
 
@@ -141,9 +141,9 @@
 struct AllRange {
   enum { SizeAtCompileTime = XprSize };
   AllRange(Index size = XprSize) : m_size(size) {}
-  Index operator[](Index i) const { return i; }
-  Index size() const { return m_size.value(); }
-  Index first() const { return 0; }
+  EIGEN_CONSTEXPR Index operator[](Index i) const EIGEN_NOEXCEPT { return i; }
+  EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_size.value(); }
+  EIGEN_CONSTEXPR Index first() const EIGEN_NOEXCEPT { return 0; }
   variable_if_dynamic<Index,XprSize> m_size;
 };
 
diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index caeea23..ef3fdfb 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -52,7 +52,7 @@
 {
 public:
   static const int value = N;
-  operator int() const { return value; }
+  EIGEN_CONSTEXPR operator int() const { return value; }
   FixedInt() {}
   FixedInt( VariableAndFixedInt<N> other) {
     #ifndef EIGEN_INTERNAL_DEBUGGING
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index cad57c3..f66325f 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -275,7 +275,7 @@
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
 
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
 #if !defined(__FLT_EPSILON__)
 #define __FLT_EPSILON__ FLT_EPSILON
 #define __DBL_EPSILON__ DBL_EPSILON
@@ -286,7 +286,7 @@
 template<typename T> struct numeric_limits
 {
   EIGEN_DEVICE_FUNC
-  static T epsilon() { return 0; }
+  static EIGEN_CONSTEXPR T epsilon() { return 0; }
   static T (max)() { assert(false && "Highest not supported for this type"); }
   static T (min)() { assert(false && "Lowest not supported for this type"); }
   static T infinity() { assert(false && "Infinity not supported for this type"); }
@@ -294,7 +294,7 @@
 };
 template<> struct numeric_limits<float>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static float epsilon() { return __FLT_EPSILON__; }
   EIGEN_DEVICE_FUNC
   static float (max)() {
@@ -304,7 +304,7 @@
     return HIPRT_MAX_NORMAL_F;
   #endif
   }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static float (min)() { return FLT_MIN; }
   EIGEN_DEVICE_FUNC
   static float infinity() {
@@ -325,11 +325,11 @@
 };
 template<> struct numeric_limits<double>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static double epsilon() { return __DBL_EPSILON__; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static double (max)() { return DBL_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static double (min)() { return DBL_MIN; }
   EIGEN_DEVICE_FUNC
   static double infinity() {
@@ -350,71 +350,71 @@
 };
 template<> struct numeric_limits<int>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int (max)() { return INT_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int (min)() { return INT_MIN; }
 };
 template<> struct numeric_limits<unsigned int>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned int epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned int (max)() { return UINT_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned int (min)() { return 0; }
 };
 template<> struct numeric_limits<long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long (max)() { return LONG_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long (min)() { return LONG_MIN; }
 };
 template<> struct numeric_limits<unsigned long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long (max)() { return ULONG_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long (min)() { return 0; }
 };
 template<> struct numeric_limits<long long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long long (max)() { return LLONG_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long long (min)() { return LLONG_MIN; }
 };
 template<> struct numeric_limits<unsigned long long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long long (max)() { return ULLONG_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long long (min)() { return 0; }
 };
 template<> struct numeric_limits<bool>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static bool epsilon() { return false; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static bool (max)() { return true; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR 
   static bool (min)() { return false; }
 };
 
 }
 
-#endif
+#endif // defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
 
 /** \internal
   * A base class to disable default copy ctor and copy assignment operator.
@@ -476,10 +476,10 @@
   *
   */
 template<typename T>
-Index size(const T& x) { return x.size(); }
+EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); }
 
 template<typename T,std::size_t N>
-Index size(const T (&) [N]) { return N; }
+EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; }
 
 /** \internal
   * Convenient struct to get the result type of a nullary, unary, binary, or
@@ -761,7 +761,7 @@
 template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
 #endif
 
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
 using internal::device::numeric_limits;
 #else
 using std::numeric_limits;

diff --git a/Eigen/src/Core/util/SymbolicIndex.h b/Eigen/src/Core/util/SymbolicIndex.h
index 17cf46f..354dd9a 100644
--- a/Eigen/src/Core/util/SymbolicIndex.h
+++ b/Eigen/src/Core/util/SymbolicIndex.h
@@ -65,7 +65,7 @@
 public:
   ValueExpr() {}
   template<typename T>
-  Index eval_impl(const T&) const { return N; }
+  EIGEN_CONSTEXPR Index eval_impl(const T&) const { return N; }
 };
 
 
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index f83bca1..9c77717 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -131,9 +131,12 @@
   public:
     EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic)
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
-    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return T(Value); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    operator T() const { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void setValue(T) const {}
 };
 
 template<typename T> class variable_if_dynamic<T, Dynamic>
@@ -154,8 +157,10 @@
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
-    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
@@ -428,7 +433,7 @@
     T const&,
     const T
   >::type type;
-  
+
   typedef typename conditional<
     bool(traits<T>::Flags & NestByRefBit),
     T &,
@@ -614,7 +619,7 @@
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixRowType,
-    ArrayRowType 
+    ArrayRowType
   >::type type;
 };
 
@@ -629,7 +634,7 @@
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixColType,
-    ArrayColType 
+    ArrayColType
   >::type type;
 };
 
@@ -645,7 +650,7 @@
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixDiagType,
-    ArrayDiagType 
+    ArrayDiagType
   >::type type;
 };
 
@@ -761,7 +766,7 @@
   if(f&DirectAccessBit)             res += " | Direct";
   if(f&NestByRefBit)                res += " | NestByRef";
   if(f&NoPreferredStorageOrderBit)  res += " | NoPreferredStorageOrderBit";
-  
+
   return res;
 }
 #endif
@@ -858,7 +863,7 @@
 #define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
   EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType<ScalarBinaryOpTraits<LHS, RHS,BINOP> >::value), \
     YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    
+
 } // end namespace Eigen
 
 #endif // EIGEN_XPRHELPER_H
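variable_if_dynamic is the usual Eigen device behind these constexpr accessors: when the extent is known at compile time nothing is stored and value() can be a constant expression, while the Dynamic case falls back to a stored member. A simplified sketch of the idea (not Eigen's actual code; -1 stands in for Eigen::Dynamic and the name is hypothetical):

template<typename T, int Value>
struct value_if_dynamic {                        // compile-time case: empty object
  explicit value_if_dynamic(T = T(Value)) {}
  constexpr T value() const { return T(Value); } // no storage needed
  void setValue(T) const {}
};

template<typename T>
struct value_if_dynamic<T, -1> {                 // -1 plays the role of Dynamic
  explicit value_if_dynamic(T v) : m_value(v) {}
  T value() const { return m_value; }
  void setValue(T v) { m_value = v; }
  T m_value;
};

int main() {
  value_if_dynamic<long, 3> fixed;               // compile-time size, empty object
  value_if_dynamic<long, -1> dyn(7);             // runtime size, stores 7
  return int(fixed.value() + dyn.value()) - 10;  // 0
}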
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 73b7041..59e5964 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -44,10 +44,14 @@
   * \f$ v \f$ such that \f$ Av = \lambda v \f$.  The eigenvalues of a
   * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with
   * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the
-  * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$ (for selfadjoint
-  * matrices, the matrix \f$ V \f$ is always invertible). This is called the
+  * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$. This is called the
   * eigendecomposition.
   *
+  * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal
+  * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then
+  * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is
+  * equal to its transpose, \f$ V^{-1} = V^T \f$.
+  *
   * The algorithm exploits the fact that the matrix is selfadjoint, making it
   * faster and more accurate than the general purpose eigenvalue algorithms
   * implemented in EigenSolver and ComplexEigenSolver.
@@ -256,6 +260,11 @@
       * matrix \f$ A \f$, then the matrix returned by this function is the
       * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$.
       *
+      * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal
+      * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then
+      * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is
+      * equal to its transpose, \f$ V^{-1} = V^T \f$.
+      *
       * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out
       *
@@ -489,8 +498,6 @@
 EIGEN_DEVICE_FUNC
 ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
 {
-  EIGEN_USING_STD(abs);
-
   ComputationInfo info;
   typedef typename MatrixType::Scalar Scalar;
 
@@ -501,15 +508,23 @@
   
   typedef typename DiagType::RealScalar RealScalar;
   const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
-  
+  const RealScalar precision_inv = RealScalar(1)/NumTraits<RealScalar>::epsilon();
   while (end>0)
   {
-    for (Index i = start; i<end; ++i)
-      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1])),precision) || abs(subdiag[i]) <= considerAsZero)
-        subdiag[i] = 0;
+    for (Index i = start; i<end; ++i) {
+      if (numext::abs(subdiag[i]) < considerAsZero) {
+        subdiag[i] = RealScalar(0);
+      } else {
+        // abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
+        // Scaled to prevent underflows.
+        const RealScalar scaled_subdiag = precision_inv * subdiag[i];
+        if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i])+numext::abs(diag[i+1]))) {
+          subdiag[i] = RealScalar(0);
+        }
+      }
+    }
 
-    // find the largest unreduced block
+    // find the largest unreduced block at the end of the matrix.
     while (end>0 && subdiag[end-1]==RealScalar(0))
     {
       end--;
@@ -812,32 +827,38 @@
 }
 
 namespace internal {
+
+// Francis implicit QR step.
 template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
 EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
 {
-  EIGEN_USING_STD(abs);
+  // Wilkinson Shift.
   RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
   RealScalar e = subdiag[end-1];
   // Note that thanks to scaling, e^2 or td^2 cannot overflow; however, they can still
   // underflow, leading to inf/NaN values when using the following commented code:
-//   RealScalar e2 = numext::abs2(subdiag[end-1]);
-//   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
+  //   RealScalar e2 = numext::abs2(subdiag[end-1]);
+  //   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
   // This explains the following, somewhat more complicated, version:
   RealScalar mu = diag[end];
-  if(td==RealScalar(0))
-    mu -= abs(e);
-  else
-  {
-    RealScalar e2 = numext::abs2(subdiag[end-1]);
-    RealScalar h = numext::hypot(td,e);
-    if(e2==RealScalar(0)) mu -= (e / (td + (td>RealScalar(0) ? RealScalar(1) : RealScalar(-1)))) * (e / h);
-    else                  mu -= e2 / (td + (td>RealScalar(0) ? h : -h));
+  if(td==RealScalar(0)) {
+    mu -= numext::abs(e);
+  } else if (e != RealScalar(0)) {
+    const RealScalar e2 = numext::abs2(e);
+    const RealScalar h = numext::hypot(td,e);
+    if(e2 == RealScalar(0)) {
+      mu -= e / ((td + (td>RealScalar(0) ? h : -h)) / e);
+    } else {
+      mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); 
+    }
   }
-  
+
   RealScalar x = diag[start] - mu;
   RealScalar z = subdiag[start];
-  for (Index k = start; k < end; ++k)
+  // If z ever becomes zero, the Givens rotation will be the identity and
+  // z will stay zero for all future iterations.
+  for (Index k = start; k < end && z != RealScalar(0); ++k)
   {
     JacobiRotation<RealScalar> rot;
     rot.makeGivens(x, z);
@@ -850,12 +871,11 @@
     diag[k+1] = rot.s() * sdk + rot.c() * dkp1;
     subdiag[k] = rot.c() * sdk - rot.s() * dkp1;
     
-
     if (k > start)
       subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z;
 
+    // "Chasing the bulge" to return to triangular form.
     x = subdiag[k];
-
     if (k < end - 1)
     {
       z = -rot.s() * subdiag[k+1];
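The property documented in the SelfAdjointEigenSolver comments above can be checked directly: for a real symmetric A the eigenvector matrix is orthogonal, so A is recovered as V * D * V^T. A small usage example:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix3d A;
  A << 2, 1, 0,
       1, 3, 1,
       0, 1, 4;                                  // real symmetric, hence selfadjoint
  Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> es(A);
  const Eigen::Matrix3d V = es.eigenvectors();   // orthogonal: V^{-1} == V^T
  const Eigen::Vector3d d = es.eigenvalues();
  std::cout << (V * d.asDiagonal() * V.transpose() - A).norm() << "\n";           // ~0
  std::cout << (V.transpose() * V - Eigen::Matrix3d::Identity()).norm() << "\n";  // ~0
  return 0;
}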
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index c5c1acf..6c8084f 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -11,10 +11,10 @@
 #ifndef EIGEN_TRIDIAGONALIZATION_H
 #define EIGEN_TRIDIAGONALIZATION_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-  
+
 template<typename MatrixType> struct TridiagonalizationMatrixTReturnType;
 template<typename MatrixType>
 struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
@@ -354,7 +354,7 @@
   Index n = matA.rows();
   eigen_assert(n==matA.cols());
   eigen_assert(n==hCoeffs.size()+1 || n==1);
-  
+
   for (Index i = 0; i<n-1; ++i)
   {
     Index remainingSize = n-i-1;
@@ -547,8 +547,8 @@
       result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
     }
 
-    Index rows() const { return m_matrix.rows(); }
-    Index cols() const { return m_matrix.cols(); }
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
   protected:
     typename MatrixType::Nested m_matrix;
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 5f0da1a..94083ac 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_HOMOGENEOUS_H
 #define EIGEN_HOMOGENEOUS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
   *
@@ -72,9 +72,11 @@
       : m_matrix(matrix)
     {}
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
-    
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
+
     EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
 
     template<typename Rhs>
@@ -262,8 +264,10 @@
       m_rhs(rhs)
   {}
 
-  EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   template<typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const
   {
@@ -300,8 +304,8 @@
     : m_lhs(lhs), m_rhs(rhs)
   {}
 
-  EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   template<typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const
   {
@@ -322,7 +326,7 @@
 struct evaluator_traits<Homogeneous<ArgType,Direction> >
 {
   typedef typename storage_kind_to_evaluator_kind<typename ArgType::StorageKind>::Kind Kind;
-  typedef HomogeneousShape Shape;  
+  typedef HomogeneousShape Shape;
 };
 
 template<> struct AssignmentKind<DenseShape,HomogeneousShape> { typedef Dense2Dense Kind; };
@@ -414,7 +418,7 @@
   typedef typename helper::ConstantBlock ConstantBlock;
   typedef typename helper::Xpr RefactoredXpr;
   typedef evaluator<RefactoredXpr> Base;
-  
+
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(  xpr.lhs().nestedExpression() .lazyProduct(  xpr.rhs().template topRows<helper::Dim>(xpr.lhs().nestedExpression().cols()) )
             + ConstantBlock(xpr.rhs().row(xpr.rhs().rows()-1),xpr.lhs().rows(), 1) )
@@ -467,7 +471,7 @@
   typedef typename helper::ConstantBlock ConstantBlock;
   typedef typename helper::Xpr RefactoredXpr;
   typedef evaluator<RefactoredXpr> Base;
-  
+
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(   xpr.lhs().template leftCols<helper::Dim>(xpr.rhs().nestedExpression().rows()) .lazyProduct( xpr.rhs().nestedExpression() )
             + ConstantBlock(xpr.lhs().col(xpr.lhs().cols()-1),1,xpr.rhs().cols()) )
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 1cb1d2f..52b8c2a 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -12,7 +12,7 @@
 #ifndef EIGEN_TRANSFORM_H
 #define EIGEN_TRANSFORM_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -47,7 +47,7 @@
 
 template< typename Lhs,
           typename Rhs,
-          bool AnyProjective = 
+          bool AnyProjective =
             transform_traits<Lhs>::IsProjective ||
             transform_traits<Rhs>::IsProjective>
 struct transform_transform_product_impl;
@@ -242,7 +242,7 @@
   typedef const Block<ConstMatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> ConstTranslationPart;
   /** corresponding translation type */
   typedef Translation<Scalar,Dim> TranslationType;
-  
+
   // this intermediate enum is needed to avoid an ICE with gcc 3.4 and 4.0
   enum { TransformTimeDiagonalMode = ((Mode==int(Isometry))?Affine:int(Mode)) };
   /** The return type of the product between a diagonal matrix and a transform */
@@ -302,7 +302,7 @@
     internal::transform_construct_from_matrix<OtherDerived,Mode,Options,Dim,HDim>::run(this, other.derived());
     return *this;
   }
-  
+
   template<int OtherOptions>
   EIGEN_DEVICE_FUNC inline Transform(const Transform<Scalar,Dim,Mode,OtherOptions>& other)
   {
@@ -374,9 +374,9 @@
   inline Transform& operator=(const QTransform& other);
   inline QTransform toQTransform(void) const;
   #endif
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_matrix.cols(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
   /** shortcut for m_matrix(row,col);
     * \sa MatrixBase::operator(Index,Index) const */
@@ -450,7 +450,7 @@
   /** \returns The product expression of a transform \a a times a diagonal matrix \a b
     *
     * The rhs diagonal matrix is interpreted as an affine scaling transformation. The
-    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
+    * product results in a Transform of the same type (mode) as the lhs only if the lhs
     * mode is not an isometry; if it is, the returned transform is an affinity.
     */
   template<typename DiagonalDerived>
@@ -465,7 +465,7 @@
   /** \returns The product expression of a diagonal matrix \a a times a transform \a b
     *
     * The lhs diagonal matrix is interpreted as an affine scaling transformation. The
-    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
+    * product results in a Transform of the same type (mode) as the lhs only if the lhs
     * mode is no isometry. In that case, the returned transform is an affinity.
     */
   template<typename DiagonalDerived>
@@ -488,7 +488,7 @@
   {
     return internal::transform_transform_product_impl<Transform,Transform>::run(*this,other);
   }
-  
+
   #if EIGEN_COMP_ICC
 private:
   // this intermediate structure permits to workaround a bug in ICC 11:
@@ -497,13 +497,13 @@
   //  (the meaning of a name may have changed since the template declaration -- the type of the template is:
   // "Eigen::internal::transform_transform_product_impl<Eigen::Transform<double, 3, 32, 0>,
   //     Eigen::Transform<double, 3, Mode, Options>, <expression>>::ResultType (const Eigen::Transform<double, 3, Mode, Options> &) const")
-  // 
+  //
   template<int OtherMode,int OtherOptions> struct icc_11_workaround
   {
     typedef internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> > ProductType;
     typedef typename ProductType::ResultType ResultType;
   };
-  
+
 public:
   /** Concatenates two different transformations */
   template<int OtherMode,int OtherOptions>
@@ -536,7 +536,7 @@
   }
 
   template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC 
+  EIGEN_DEVICE_FUNC
   inline Transform& scale(const MatrixBase<OtherDerived> &other);
 
   template<typename OtherDerived>
@@ -566,18 +566,18 @@
   EIGEN_DEVICE_FUNC Transform& preshear(const Scalar& sx, const Scalar& sy);
 
   EIGEN_DEVICE_FUNC inline Transform& operator=(const TranslationType& t);
-  
+
   EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
-  
+
   EIGEN_DEVICE_FUNC inline Transform operator*(const TranslationType& t) const;
 
-  EIGEN_DEVICE_FUNC 
+  EIGEN_DEVICE_FUNC
   inline Transform& operator=(const UniformScaling<Scalar>& t);
-  
+
   EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const UniformScaling<Scalar>& s) { return scale(s.factor()); }
-  
+
   EIGEN_DEVICE_FUNC
   inline TransformTimeDiagonalReturnType operator*(const UniformScaling<Scalar>& s) const
   {
@@ -680,7 +680,7 @@
   #ifdef EIGEN_TRANSFORM_PLUGIN
   #include EIGEN_TRANSFORM_PLUGIN
   #endif
-  
+
 protected:
   #ifndef EIGEN_PARSED_BY_DOXYGEN
     EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void check_template_params()
@@ -1048,7 +1048,7 @@
   EIGEN_DEVICE_FUNC static inline
   const typename TransformType::LinearMatrixType run(const TransformType& t)
   {
-    typedef typename TransformType::LinearMatrixType LinearMatrixType; 
+    typedef typename TransformType::LinearMatrixType LinearMatrixType;
     LinearMatrixType result;
     t.computeRotationScaling(&result, (LinearMatrixType*)0);
     return result;
@@ -1177,7 +1177,7 @@
 {
   template<typename MatrixType> EIGEN_DEVICE_FUNC static void run(MatrixType &) { }
 };
-    
+
 // selector needed to avoid taking the inverse of a 3x4 matrix
 template<typename TransformType, int Mode=TransformType::Mode>
 struct projective_transform_inverse
@@ -1318,8 +1318,8 @@
 template<int LhsMode,int RhsMode>
 struct transform_product_result
 {
-  enum 
-  { 
+  enum
+  {
     Mode =
       (LhsMode == (int)Projective    || RhsMode == (int)Projective    ) ? Projective :
       (LhsMode == (int)Affine        || RhsMode == (int)Affine        ) ? Affine :
@@ -1342,8 +1342,8 @@
 template< typename TransformType, typename MatrixType, int RhsCols>
 struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols>
 {
-  enum { 
-    Dim = TransformType::Dim, 
+  enum {
+    Dim = TransformType::Dim,
     HDim = TransformType::HDim,
     OtherRows = MatrixType::RowsAtCompileTime,
     OtherCols = MatrixType::ColsAtCompileTime
@@ -1360,7 +1360,7 @@
     ResultType res(other.rows(),other.cols());
     TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() = T.affine() * other;
     res.row(OtherRows-1) = other.row(OtherRows-1);
-    
+
     return res;
   }
 };
@@ -1368,8 +1368,8 @@
 template< typename TransformType, typename MatrixType, int RhsCols>
 struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols>
 {
-  enum { 
-    Dim = TransformType::Dim, 
+  enum {
+    Dim = TransformType::Dim,
     HDim = TransformType::HDim,
     OtherRows = MatrixType::RowsAtCompileTime,
     OtherCols = MatrixType::ColsAtCompileTime
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index 9318c28..022f6c3 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_HOUSEHOLDER_SEQUENCE_H
 #define EIGEN_HOUSEHOLDER_SEQUENCE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Householder_Module
   * \householder_module
@@ -34,8 +34,8 @@
   * form \f$ H = \prod_{i=0}^{n-1} H_i \f$ where the i-th Householder reflection is \f$ H_i = I - h_i v_i
   * v_i^* \f$. The i-th Householder coefficient \f$ h_i \f$ is a scalar and the i-th Householder vector \f$
   * v_i \f$ is a vector of the form
-  * \f[ 
-  * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. 
+  * \f[
+  * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ].
   * \f]
   * The last \f$ n-i \f$ entries of \f$ v_i \f$ are called the essential part of the Householder vector.
   *
@@ -120,7 +120,7 @@
   : public EigenBase<HouseholderSequence<VectorsType,CoeffsType,Side> >
 {
     typedef typename internal::hseq_side_dependent_impl<VectorsType,CoeffsType,Side>::EssentialVectorType EssentialVectorType;
-  
+
   public:
     enum {
       RowsAtCompileTime = internal::traits<HouseholderSequence>::RowsAtCompileTime,
@@ -198,18 +198,18 @@
     }
 
     /** \brief Number of rows of transformation viewed as a matrix.
-      * \returns Number of rows 
+      * \returns Number of rows
       * \details This equals the dimension of the space that the transformation acts on.
       */
-    EIGEN_DEVICE_FUNC
-    Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
 
     /** \brief Number of columns of transformation viewed as a matrix.
       * \returns Number of columns
       * \details This equals the dimension of the space that the transformation acts on.
       */
-    EIGEN_DEVICE_FUNC
-    Index cols() const { return rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return rows(); }
 
     /** \brief Essential part of a Householder vector.
       * \param[in]  k  Index of Householder reflection
@@ -217,8 +217,8 @@
       *
       * This function returns the essential part of the Householder vector \f$ v_i \f$. This is a vector of
       * length \f$ n-i \f$ containing the last \f$ n-i \f$ entries of the vector
-      * \f[ 
-      * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. 
+      * \f[
+      * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ].
       * \f]
       * The index \f$ i \f$ equals \p k + shift(), corresponding to the k-th column of the matrix \p v
       * passed to the constructor.
@@ -381,7 +381,7 @@
           Index k = m_reverse ? i : (std::max)(Index(0),end-blockSize);
           Index bs = end-k;
           Index start = k + m_shift;
-          
+
           typedef Block<typename internal::remove_all<VectorsType>::type,Dynamic,Dynamic> SubVectorsType;
           SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side==OnTheRight ? k : start,
                                                                    Side==OnTheRight ? start : k,
@@ -519,7 +519,7 @@
 }
 
 /** \ingroup Householder_Module \householder_module
-  * \brief Convenience function for constructing a Householder sequence. 
+  * \brief Convenience function for constructing a Householder sequence.
   * \returns A HouseholderSequence constructed from the specified arguments.
   */
 template<typename VectorsType, typename CoeffsType>
@@ -529,7 +529,7 @@
 }
 
 /** \ingroup Householder_Module \householder_module
-  * \brief Convenience function for constructing a Householder sequence. 
+  * \brief Convenience function for constructing a Householder sequence.
   * \returns A HouseholderSequence constructed from the specified arguments.
   * \details This function differs from householderSequence() in that the template argument \p OnTheSide of
   * the constructed HouseholderSequence is set to OnTheRight, instead of the default OnTheLeft.
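A HouseholderSequence is most often obtained from a decomposition rather than built by hand; rows() and cols() then both report the dimension of the space the product of reflections acts on. A short example, assuming the standard HouseholderQR interface:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 3);
  Eigen::HouseholderQR<Eigen::MatrixXd> qr(A);
  auto Q = qr.householderQ();                            // a HouseholderSequence, applied lazily
  std::cout << Q.rows() << " x " << Q.cols() << "\n";    // 5 x 5
  Eigen::MatrixXd R = qr.matrixQR().triangularView<Eigen::Upper>();
  std::cout << (Q * R - A).norm() << "\n";               // ~0: A == Q * R
  return 0;
}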
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index f66c846..a117fc1 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_BASIC_PRECONDITIONERS_H
 #define EIGEN_BASIC_PRECONDITIONERS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup IterativeLinearSolvers_Module
   * \brief A preconditioner based on the diagonal entries
@@ -52,15 +52,15 @@
       compute(mat);
     }
 
-    Index rows() const { return m_invdiag.size(); }
-    Index cols() const { return m_invdiag.size(); }
-    
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+
     template<typename MatType>
     DiagonalPreconditioner& analyzePattern(const MatType& )
     {
       return *this;
     }
-    
+
     template<typename MatType>
     DiagonalPreconditioner& factorize(const MatType& mat)
     {
@@ -77,7 +77,7 @@
       m_isInitialized = true;
       return *this;
     }
-    
+
     template<typename MatType>
     DiagonalPreconditioner& compute(const MatType& mat)
     {
@@ -99,7 +99,7 @@
                 && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
       return Solve<DiagonalPreconditioner, Rhs>(*this, b.derived());
     }
-    
+
     ComputationInfo info() { return Success; }
 
   protected:
@@ -121,7 +121,7 @@
   * \implsparsesolverconcept
   *
   * The diagonal entries are pre-inverted and stored into a dense vector.
-  * 
+  *
   * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner
   */
 template <typename _Scalar>
@@ -146,7 +146,7 @@
     {
       return *this;
     }
-    
+
     template<typename MatType>
     LeastSquareDiagonalPreconditioner& factorize(const MatType& mat)
     {
@@ -178,13 +178,13 @@
       Base::m_isInitialized = true;
       return *this;
     }
-    
+
     template<typename MatType>
     LeastSquareDiagonalPreconditioner& compute(const MatType& mat)
     {
       return factorize(mat);
     }
-    
+
     ComputationInfo info() { return Success; }
 
   protected:
@@ -205,19 +205,19 @@
 
     template<typename MatrixType>
     explicit IdentityPreconditioner(const MatrixType& ) {}
-    
+
     template<typename MatrixType>
     IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; }
-    
+
     template<typename MatrixType>
     IdentityPreconditioner& factorize(const MatrixType& ) { return *this; }
 
     template<typename MatrixType>
     IdentityPreconditioner& compute(const MatrixType& ) { return *this; }
-    
+
     template<typename Rhs>
     inline const Rhs& solve(const Rhs& b) const { return b; }
-    
+
     ComputationInfo info() { return Success; }
 };
 
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index e5d0308..7803fd8 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -14,8 +14,8 @@
 #include <vector>
 #include <list>
 
-namespace Eigen {  
-/** 
+namespace Eigen {
+/**
   * \brief Modified Incomplete Cholesky with dual threshold
   *
   * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
@@ -48,15 +48,15 @@
     typedef SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> > Base;
     using Base::m_isInitialized;
   public:
-    typedef typename NumTraits<Scalar>::Real RealScalar; 
+    typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef _OrderingType OrderingType;
     typedef typename OrderingType::PermutationType PermutationType;
-    typedef typename PermutationType::StorageIndex StorageIndex; 
+    typedef typename PermutationType::StorageIndex StorageIndex;
     typedef SparseMatrix<Scalar,ColMajor,StorageIndex> FactorType;
     typedef Matrix<Scalar,Dynamic,1> VectorSx;
     typedef Matrix<RealScalar,Dynamic,1> VectorRx;
     typedef Matrix<StorageIndex,Dynamic, 1> VectorIx;
-    typedef std::vector<std::list<StorageIndex> > VectorList; 
+    typedef std::vector<std::list<StorageIndex> > VectorList;
     enum { UpLo = _UpLo };
     enum {
       ColsAtCompileTime = Dynamic,
@@ -71,7 +71,7 @@
       * \sa IncompleteCholesky(const MatrixType&)
       */
     IncompleteCholesky() : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) {}
-    
+
     /** Constructor computing the incomplete factorization for the given matrix \a matrix.
       */
     template<typename MatrixType>
@@ -79,13 +79,13 @@
     {
       compute(matrix);
     }
-    
+
     /** \returns number of rows of the factored matrix */
-    Index rows() const { return m_L.rows(); }
-    
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); }
+
     /** \returns number of columns of the factored matrix */
-    Index cols() const { return m_L.cols(); }
-    
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); }
+
 
     /** \brief Reports whether previous computation was successful.
       *
@@ -100,19 +100,19 @@
       eigen_assert(m_isInitialized && "IncompleteCholesky is not initialized.");
       return m_info;
     }
-    
+
     /** \brief Set the initial shift parameter \f$ \sigma \f$.
       */
     void setInitialShift(RealScalar shift) { m_initialShift = shift; }
-    
+
     /** \brief Computes the fill reducing permutation vector using the sparsity pattern of \a mat
       */
     template<typename MatrixType>
     void analyzePattern(const MatrixType& mat)
     {
-      OrderingType ord; 
+      OrderingType ord;
       PermutationType pinv;
-      ord(mat.template selfadjointView<UpLo>(), pinv); 
+      ord(mat.template selfadjointView<UpLo>(), pinv);
       if(pinv.size()>0) m_perm = pinv.inverse();
       else              m_perm.resize(0);
       m_L.resize(mat.rows(), mat.cols());
@@ -120,7 +120,7 @@
       m_isInitialized = true;
       m_info = Success;
     }
-    
+
     /** \brief Performs the numerical factorization of the input matrix \a mat
       *
       * The method analyzePattern() or compute() must have been called beforehand
@@ -130,7 +130,7 @@
       */
     template<typename MatrixType>
     void factorize(const MatrixType& mat);
-    
+
     /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat
       *
       * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods.
@@ -143,7 +143,7 @@
       analyzePattern(mat);
       factorize(mat);
     }
-    
+
     // internal
     template<typename Rhs, typename Dest>
     void _solve_impl(const Rhs& b, Dest& x) const
@@ -170,16 +170,16 @@
 
   protected:
     FactorType m_L;              // The lower part stored in CSC
-    VectorRx m_scale;            // The vector for scaling the matrix 
+    VectorRx m_scale;            // The vector for scaling the matrix
     RealScalar m_initialShift;   // The initial shift parameter
-    bool m_analysisIsOk; 
-    bool m_factorizationIsOk; 
+    bool m_analysisIsOk;
+    bool m_factorizationIsOk;
     ComputationInfo m_info;
-    PermutationType m_perm; 
+    PermutationType m_perm;
 
   private:
-    inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); 
-}; 
+    inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol);
+};
 
 // Based on the following paper:
 //   C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
@@ -190,10 +190,10 @@
 void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
 {
   using std::sqrt;
-  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
-    
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
+
   // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added
-  
+
   // Apply the fill-reducing permutation computed in analyzePattern()
   if (m_perm.rows() == mat.rows() ) // To detect the null permutation
   {
@@ -206,8 +206,8 @@
   {
     m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>();
   }
-  
-  Index n = m_L.cols(); 
+
+  Index n = m_L.cols();
   Index nnz = m_L.nonZeros();
   Map<VectorSx> vals(m_L.valuePtr(), nnz);         //values
   Map<VectorIx> rowIdx(m_L.innerIndexPtr(), nnz);  //Row indices
@@ -219,9 +219,9 @@
   VectorIx col_pattern(n);
   col_pattern.fill(-1);
   StorageIndex col_nnz;
-  
-  
-  // Computes the scaling factors 
+
+
+  // Computes the scaling factors
   m_scale.resize(n);
   m_scale.setZero();
   for (Index j = 0; j < n; j++)
@@ -231,7 +231,7 @@
       if(rowIdx[k]!=j)
         m_scale(rowIdx[k]) += numext::abs2(vals(k));
     }
-  
+
   m_scale = m_scale.cwiseSqrt().cwiseSqrt();
 
   for (Index j = 0; j < n; ++j)
@@ -241,8 +241,8 @@
       m_scale(j) = 1;
 
   // TODO disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
-  
-  // Scale and compute the shift for the matrix 
+
+  // Scale and compute the shift for the matrix
   RealScalar mindiag = NumTraits<RealScalar>::highest();
   for (Index j = 0; j < n; j++)
   {
@@ -253,7 +253,7 @@
   }
 
   FactorType L_save = m_L;
-  
+
   RealScalar shift = 0;
   if(mindiag <= RealScalar(0.))
     shift = m_initialShift - mindiag;
@@ -375,7 +375,7 @@
   if (jk < colPtr(col+1) )
   {
     Index p = colPtr(col+1) - jk;
-    Index minpos; 
+    Index minpos;
     rowIdx.segment(jk,p).minCoeff(&minpos);
     minpos += jk;
     if (rowIdx(minpos) != rowIdx(jk))
@@ -389,6 +389,6 @@
   }
 }
 
-} // end namespace Eigen 
+} // end namespace Eigen
 
 #endif
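
A minimal usage sketch for the class above (not part of this patch; it assumes an SPD sparse matrix A and right-hand side b are already defined). IncompleteCholesky is typically plugged into ConjugateGradient as its preconditioner:

Eigen::ConjugateGradient<Eigen::SparseMatrix<double>,
                         Eigen::Lower | Eigen::Upper,
                         Eigen::IncompleteCholesky<double> > solver;
solver.preconditioner().setInitialShift(1e-3);  // sigma used when the scaled matrix is not positive definite
solver.compute(A);                              // analyzePattern() + factorize() on the preconditioner
Eigen::VectorXd x = solver.solve(b);
if (solver.info() != Eigen::Success) { /* factorization or iterations failed */ }
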
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 09436cb..cdcf709 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -12,19 +12,19 @@
 #define EIGEN_INCOMPLETE_LUT_H
 
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-    
+
 /** \internal
-  * Compute a quick-sort split of a vector 
+  * Compute a quick-sort split of a vector
   * On output, the vector row is permuted such that its elements satisfy
   * abs(row(i)) >= abs(row(ncut)) if i<ncut
-  * abs(row(i)) <= abs(row(ncut)) if i>ncut 
+  * abs(row(i)) <= abs(row(ncut)) if i>ncut
   * \param row The vector of values
   * \param ind The array of index for the elements in @p row
   * \param ncut  The number of largest elements to keep
-  **/ 
+  **/
 template <typename VectorV, typename VectorI>
 Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
 {
@@ -34,15 +34,15 @@
   Index mid;
   Index n = row.size(); /* length of the vector */
   Index first, last ;
-  
+
   ncut--; /* to fit the zero-based indices */
-  first = 0; 
-  last = n-1; 
+  first = 0;
+  last = n-1;
   if (ncut < first || ncut > last ) return 0;
-  
+
   do {
-    mid = first; 
-    RealScalar abskey = abs(row(mid)); 
+    mid = first;
+    RealScalar abskey = abs(row(mid));
     for (Index j = first + 1; j <= last; j++) {
       if ( abs(row(j)) > abskey) {
         ++mid;
@@ -53,12 +53,12 @@
     /* Interchange for the pivot element */
     swap(row(mid), row(first));
     swap(ind(mid), ind(first));
-    
+
     if (mid > ncut) last = mid - 1;
-    else if (mid < ncut ) first = mid + 1; 
+    else if (mid < ncut ) first = mid + 1;
   } while (mid != ncut );
-  
-  return 0; /* mid is equal to ncut */ 
+
+  return 0; /* mid is equal to ncut */
 }
 
 }// end namespace internal
@@ -71,23 +71,23 @@
   *
   * During the numerical factorization, two dropping rules are used :
   *  1) any element whose magnitude is less than some tolerance is dropped.
-  *    This tolerance is obtained by multiplying the input tolerance @p droptol 
+  *    This tolerance is obtained by multiplying the input tolerance @p droptol
   *    by the average magnitude of all the original elements in the current row.
-  *  2) After the elimination of the row, only the @p fill largest elements in 
-  *    the L part and the @p fill largest elements in the U part are kept 
-  *    (in addition to the diagonal element ). Note that @p fill is computed from 
-  *    the input parameter @p fillfactor which is used the ratio to control the fill_in 
+  *  2) After the elimination of the row, only the @p fill largest elements in
+  *    the L part and the @p fill largest elements in the U part are kept
+  *    (in addition to the diagonal element). Note that @p fill is computed from
+  *    the input parameter @p fillfactor, which is used as a ratio to control the fill_in
   *    relatively to the initial number of nonzero elements.
-  * 
+  *
   * The two extreme cases are when @p droptol=0 (to keep all the @p fill*2 largest elements)
-  * and when @p fill=n/2 with @p droptol being different to zero. 
-  * 
-  * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, 
+  * and when @p fill=n/2 with @p droptol being different to zero.
+  *
+  * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization,
   *              Numerical Linear Algebra with Applications, 1(4), pp 387-402, 1994.
-  * 
+  *
   * NOTE : The following implementation is derived from the ILUT implementation
-  * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota 
-  *  released under the terms of the GNU LGPL: 
+  * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota
+  *  released under the terms of the GNU LGPL:
   *    http://www-users.cs.umn.edu/~saad/software/SPARSKIT/README
   * However, Yousef Saad gave us permission to relicense his ILUT code to MPL2.
   * See the Eigen mailing list archive, thread: ILUT, date: July 8, 2012:
@@ -115,24 +115,24 @@
     };
 
   public:
-    
+
     IncompleteLUT()
       : m_droptol(NumTraits<Scalar>::dummy_precision()), m_fillfactor(10),
         m_analysisIsOk(false), m_factorizationIsOk(false)
     {}
-    
+
     template<typename MatrixType>
     explicit IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
       : m_droptol(droptol),m_fillfactor(fillfactor),
         m_analysisIsOk(false),m_factorizationIsOk(false)
     {
       eigen_assert(fillfactor != 0);
-      compute(mat); 
+      compute(mat);
     }
-    
-    Index rows() const { return m_lu.rows(); }
-    
-    Index cols() const { return m_lu.cols(); }
+
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
@@ -144,36 +144,36 @@
       eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
       return m_info;
     }
-    
+
     template<typename MatrixType>
     void analyzePattern(const MatrixType& amat);
-    
+
     template<typename MatrixType>
     void factorize(const MatrixType& amat);
-    
+
     /**
       * Compute an incomplete LU factorization with dual threshold on the matrix mat
       * No pivoting is done in this version
-      * 
+      *
       **/
     template<typename MatrixType>
     IncompleteLUT& compute(const MatrixType& amat)
     {
-      analyzePattern(amat); 
+      analyzePattern(amat);
       factorize(amat);
       return *this;
     }
 
-    void setDroptol(const RealScalar& droptol); 
-    void setFillfactor(int fillfactor); 
-    
+    void setDroptol(const RealScalar& droptol);
+    void setFillfactor(int fillfactor);
+
     template<typename Rhs, typename Dest>
     void _solve_impl(const Rhs& b, Dest& x) const
     {
       x = m_Pinv * b;
       x = m_lu.template triangularView<UnitLower>().solve(x);
       x = m_lu.template triangularView<Upper>().solve(x);
-      x = m_P * x; 
+      x = m_P * x;
     }
 
 protected:
@@ -200,22 +200,22 @@
 
 /**
  * Set control parameter droptol
- *  \param droptol   Drop any element whose magnitude is less than this tolerance 
- **/ 
+ *  \param droptol   Drop any element whose magnitude is less than this tolerance
+ **/
 template<typename Scalar, typename StorageIndex>
 void IncompleteLUT<Scalar,StorageIndex>::setDroptol(const RealScalar& droptol)
 {
-  this->m_droptol = droptol;   
+  this->m_droptol = droptol;
 }
 
 /**
  * Set control parameter fillfactor
- * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row. 
- **/ 
+ * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row.
+ **/
 template<typename Scalar, typename StorageIndex>
 void IncompleteLUT<Scalar,StorageIndex>::setFillfactor(int fillfactor)
 {
-  this->m_fillfactor = fillfactor;   
+  this->m_fillfactor = fillfactor;
 }
 
 template <typename Scalar, typename StorageIndex>
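
For reference, a minimal usage sketch of the two control parameters documented above (not part of this patch; it assumes a square sparse matrix A and right-hand side b). IncompleteLUT is usually combined with BiCGSTAB or GMRES:

Eigen::BiCGSTAB<Eigen::SparseMatrix<double>, Eigen::IncompleteLUT<double> > solver;
solver.preconditioner().setDroptol(1e-4);   // drop entries smaller than droptol times the row's mean magnitude
solver.preconditioner().setFillfactor(20);  // keep at most fillfactor times the original nonzeros per row
solver.compute(A);
Eigen::VectorXd x = solver.solve(b);
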
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 13ba9a5..28a0c51 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_ITERATIVE_SOLVER_BASE_H
 #define EIGEN_ITERATIVE_SOLVER_BASE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -145,7 +145,7 @@
 protected:
   typedef SparseSolverBase<Derived> Base;
   using Base::m_isInitialized;
-  
+
 public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename internal::traits<Derived>::Preconditioner Preconditioner;
@@ -169,10 +169,10 @@
   }
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
+    *
     * This constructor is a shortcut for the default constructor followed
     * by a call to compute().
-    * 
+    *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
@@ -187,7 +187,7 @@
   }
 
   ~IterativeSolverBase() {}
-  
+
   /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems.
     *
     * Currently, this function mostly calls analyzePattern on the preconditioner. In the future
@@ -203,7 +203,7 @@
     m_info = m_preconditioner.info();
     return derived();
   }
-  
+
   /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems.
     *
     * Currently, this function mostly calls factorize on the preconditioner.
@@ -216,7 +216,7 @@
   template<typename MatrixDerived>
   Derived& factorize(const EigenBase<MatrixDerived>& A)
   {
-    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); 
+    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
     grab(A.derived());
     m_preconditioner.factorize(matrix());
     m_factorizationIsOk = true;
@@ -247,16 +247,16 @@
   }
 
   /** \internal */
-  Index rows() const { return matrix().rows(); }
+  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); }
 
   /** \internal */
-  Index cols() const { return matrix().cols(); }
+  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); }
 
   /** \returns the tolerance threshold used by the stopping criteria.
     * \sa setTolerance()
     */
   RealScalar tolerance() const { return m_tolerance; }
-  
+
   /** Sets the tolerance threshold used by the stopping criteria.
     *
     * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|.
@@ -270,7 +270,7 @@
 
   /** \returns a read-write reference to the preconditioner for custom configuration. */
   Preconditioner& preconditioner() { return m_preconditioner; }
-  
+
   /** \returns a read-only reference to the preconditioner. */
   const Preconditioner& preconditioner() const { return m_preconditioner; }
 
@@ -282,7 +282,7 @@
   {
     return (m_maxIterations<0) ? 2*matrix().cols() : m_maxIterations;
   }
-  
+
   /** Sets the max number of iterations.
     * Default is twice the number of columns of the matrix.
     */
@@ -328,13 +328,13 @@
     eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
     return m_info;
   }
-  
+
   /** \internal */
   template<typename Rhs, typename DestDerived>
   void _solve_with_guess_impl(const Rhs& b, SparseMatrixBase<DestDerived> &aDest) const
   {
     eigen_assert(rows()==b.rows());
-    
+
     Index rhsCols = b.cols();
     Index size = b.rows();
     DestDerived& dest(aDest.derived());
@@ -368,7 +368,7 @@
   _solve_with_guess_impl(const Rhs& b, MatrixBase<DestDerived> &aDest) const
   {
     eigen_assert(rows()==b.rows());
-    
+
     Index rhsCols = b.cols();
     DestDerived& dest(aDest.derived());
     ComputationInfo global_info = Success;
@@ -420,19 +420,19 @@
   {
     return m_matrixWrapper.matrix();
   }
-  
+
   template<typename InputType>
   void grab(const InputType &A)
   {
     m_matrixWrapper.grab(A);
   }
-  
+
   MatrixWrapper m_matrixWrapper;
   Preconditioner m_preconditioner;
 
   Index m_maxIterations;
   RealScalar m_tolerance;
-  
+
   mutable RealScalar m_error;
   mutable Index m_iterations;
   mutable ComputationInfo m_info;
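
The analyzePattern()/factorize() split documented above lets the symbolic work be reused when only the numerical values of the matrix change. A minimal sketch, not part of this patch, assuming A1 and A2 share the same sparsity pattern and b is a right-hand side:

Eigen::BiCGSTAB<Eigen::SparseMatrix<double> > solver;
solver.setTolerance(1e-8);      // upper bound on the relative residual |Ax-b|/|b|
solver.setMaxIterations(500);   // default would be 2 * matrix().cols()
solver.analyzePattern(A1);      // symbolic phase, reused below
solver.factorize(A1);
Eigen::VectorXd x1 = solver.solve(b);
solver.factorize(A2);           // same pattern, new values
Eigen::VectorXd x2 = solver.solve(b);
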
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 79e1e48..7b89657 100644
--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -13,7 +13,7 @@
 namespace Eigen {
 
 template<typename Decomposition, typename RhsType, typename GuessType> class SolveWithGuess;
-  
+
 /** \class SolveWithGuess
   * \ingroup IterativeLinearSolvers_Module
   *
@@ -45,13 +45,15 @@
   typedef typename internal::traits<SolveWithGuess>::PlainObject PlainObject;
   typedef typename internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type Base;
   typedef typename internal::ref_selector<SolveWithGuess>::type Nested;
-  
+
   SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
     : m_dec(dec), m_rhs(rhs), m_guess(guess)
   {}
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC const Decomposition& dec()   const { return m_dec; }
   EIGEN_DEVICE_FUNC const RhsType&       rhs()   const { return m_rhs; }
@@ -61,7 +63,7 @@
   const Decomposition &m_dec;
   const RhsType       &m_rhs;
   const GuessType     &m_guess;
-  
+
 private:
   Scalar coeff(Index row, Index col) const;
   Scalar coeff(Index i) const;
@@ -85,8 +87,8 @@
     m_result = solve.guess();
     solve.dec()._solve_with_guess_impl(solve.rhs(), m_result);
   }
-  
-protected:  
+
+protected:
   PlainObject m_result;
 };
 
diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h
index d2633a9..215db35 100644
--- a/Eigen/src/KLUSupport/KLUSupport.h
+++ b/Eigen/src/KLUSupport/KLUSupport.h
@@ -101,8 +101,8 @@
       if(m_numeric)  klu_free_numeric(&m_numeric,&m_common);
     }
 
-    inline Index rows() const { return mp_matrix.rows(); }
-    inline Index cols() const { return mp_matrix.cols(); }
+    EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); }
+    EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
@@ -253,7 +253,7 @@
 
       m_numeric = klu_factor(const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()), const_cast<Scalar*>(mp_matrix.valuePtr()),
                                     m_symbolic, &m_common, Scalar());
-                                         
+
 
       m_info = m_numeric ? Success : NumericalIssue;
       m_factorizationIsOk = m_numeric ? 1 : 0;
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index ef93ec5..ba1749f 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -404,8 +404,10 @@
 
     MatrixType reconstructedMatrix() const;
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index b893801..46ffdd3 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -70,7 +70,7 @@
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
   *
   * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
   * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
   */
 template<typename _MatrixType> class PartialPivLU
@@ -216,8 +216,8 @@
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+    EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
index df0fe0e..5a8d0c1 100644
--- a/Eigen/src/LU/arch/InverseSize4.h
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -141,8 +141,8 @@
     iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
     iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
 
-    const int bits[4] = {0, -2147483648, -2147483648, 0};
-    const Packet4f p4f_sign_PNNP = preinterpret<Packet4f, Packet4i>(pgather<int, Packet4i>(bits, static_cast<Eigen::Index>(1)));
+    const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};
+    const Packet4f p4f_sign_PNNP = pset<Packet4f>(sign_mask);
     rd = pxor(rd, p4f_sign_PNNP);
     iA = pmul(iA, rd);
     iB = pmul(iB, rd);
@@ -323,12 +323,12 @@
     iC1 = psub(pmul(B1, dC), iC1);
     iC2 = psub(pmul(B2, dC), iC2);
 
-    const int bits1[4] = {0, -2147483648, 0, 0};
-    const int bits2[4] = {0, 0, 0, -2147483648};
-    const Packet2d _Sign_NP = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits1, static_cast<Eigen::Index>(1)));
-    const Packet2d _Sign_PN = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits2, static_cast<Eigen::Index>(1)));
-    d1 = pxor(rd, _Sign_PN);
-    d2 = pxor(rd, _Sign_NP);
+    const double sign_mask1[2] = {0.0, -0.0};
+    const double sign_mask2[2] = {-0.0, 0.0};
+    const Packet2d sign_PN = pset<Packet2d>(sign_mask1);
+    const Packet2d sign_NP = pset<Packet2d>(sign_mask2);
+    d1 = pxor(rd, sign_PN);
+    d2 = pxor(rd, sign_NP);
 
     Index res_stride = result.outerStride();
     double *res = result.data();
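
The change above replaces integer bit-pattern masks (built with pgather and reinterpreted) by arrays of signed zeros passed to pset: the bit pattern of -0.0f or -0.0 has only the sign bit set, so XOR-ing a lane with it flips that lane's sign while 0.0 leaves it untouched. A scalar illustration of that identity (a sketch, not Eigen code):

#include <cassert>
#include <cstdint>
#include <cstring>

float flip_sign(float x, float mask) {
  std::uint32_t xi, mi;
  std::memcpy(&xi, &x, sizeof(xi));
  std::memcpy(&mi, &mask, sizeof(mi));
  xi ^= mi;                        // same operation pxor performs per lane
  std::memcpy(&x, &xi, sizeof(x));
  return x;
}

int main() {
  assert(flip_sign(3.5f, -0.0f) == -3.5f);  // sign bit toggled
  assert(flip_sign(3.5f,  0.0f) ==  3.5f);  // unchanged
  return 0;
}
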
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index e0c4456..17f8e44 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -208,6 +208,7 @@
   using Base::m_computeThinV;
   using Base::m_matrixU;
   using Base::m_matrixV;
+  using Base::m_info;
   using Base::m_isInitialized;
   using Base::m_nonzeroSingularValues;
 
@@ -256,16 +257,25 @@
   {
     // FIXME this line involves temporaries
     JacobiSVD<MatrixType> jsvd(matrix,computationOptions);
-    if(computeU()) m_matrixU = jsvd.matrixU();
-    if(computeV()) m_matrixV = jsvd.matrixV();
-    m_singularValues = jsvd.singularValues();
-    m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
     m_isInitialized = true;
+    m_info = jsvd.info();
+    if (m_info == Success || m_info == NoConvergence) {
+      if(computeU()) m_matrixU = jsvd.matrixU();
+      if(computeV()) m_matrixV = jsvd.matrixV();
+      m_singularValues = jsvd.singularValues();
+      m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
+    }
     return *this;
   }
   
   //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
-  RealScalar scale = matrix.cwiseAbs().maxCoeff();
+  RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
+    return *this;
+  }
+
   if(scale==Literal(0)) scale = Literal(1);
   MatrixX copy;
   if (m_isTranspose) copy = matrix.adjoint()/scale;
@@ -282,7 +292,11 @@
   m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose();
   m_computed.template bottomRows<1>().setZero();
   divide(0, m_diagSize - 1, 0, 0, 0);
-
+  if (m_info != Success && m_info != NoConvergence) {
+    m_isInitialized = true;
+    return *this;
+  }
+    
   //**** step 3 - Copy singular values and vectors
   for (int i=0; i<m_diagSize; i++)
   {
@@ -394,7 +408,7 @@
 //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
 // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
 template<typename MatrixType>
-void BDCSVD<MatrixType>::divide (Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
+void BDCSVD<MatrixType>::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
 {
   // requires rows = cols + 1;
   using std::pow;
@@ -414,6 +428,8 @@
   {
     // FIXME this line involves temporaries
     JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0));
+    m_info = b.info();
+    if (m_info != Success && m_info != NoConvergence) return;
     if (m_compU)
       m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
     else 
@@ -433,7 +449,9 @@
   // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the 
   // right submatrix before the left one. 
   divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
+  if (m_info != Success && m_info != NoConvergence) return;
   divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
+  if (m_info != Success && m_info != NoConvergence) return;
 
   if (m_compU)
   {
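
With the changes above, BDCSVD records a ComputationInfo instead of silently producing undefined results on non-finite input. A minimal sketch of checking it (assuming this patch is applied; InvalidInput is the status set for inf/NaN coefficients):

#include <Eigen/Dense>
#include <limits>

int main() {
  Eigen::MatrixXd M = Eigen::MatrixXd::Random(10, 6);
  M(3, 2) = std::numeric_limits<double>::quiet_NaN();
  Eigen::BDCSVD<Eigen::MatrixXd> svd(M, Eigen::ComputeThinU | Eigen::ComputeThinV);
  // Non-finite coefficients are now reported through info() instead of
  // leaving the decomposition undefined.
  return svd.info() == Eigen::InvalidInput ? 0 : 1;
}
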
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 2b68911..a22a2e5 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -585,6 +585,7 @@
     using Base::m_matrixU;
     using Base::m_matrixV;
     using Base::m_singularValues;
+    using Base::m_info;
     using Base::m_isInitialized;
     using Base::m_isAllocated;
     using Base::m_usePrescribedThreshold;
@@ -625,6 +626,7 @@
 
   m_rows = rows;
   m_cols = cols;
+  m_info = Success;
   m_isInitialized = false;
   m_isAllocated = true;
   m_computationOptions = computationOptions;
@@ -674,7 +676,12 @@
   const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
-  RealScalar scale = matrix.cwiseAbs().maxCoeff();
+  RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
+    return *this;
+  }
   if(scale==RealScalar(0)) scale = RealScalar(1);
   
   /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
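
The scale computation now uses maxCoeff<PropagateNaN>() so that a single reduction detects non-finite entries: any NaN in the input propagates into the scale, and the subsequent isfinite check also catches infinities. A scalar-level illustration of the idiom (a sketch, not the Eigen internals themselves):

#include <Eigen/Core>
#include <cmath>
#include <limits>

int main() {
  Eigen::ArrayXd a(4);
  a << 1.0, -2.0, std::numeric_limits<double>::quiet_NaN(), 0.5;
  double scale = a.abs().maxCoeff<Eigen::PropagateNaN>();  // NaN survives the reduction
  bool has_non_finite = !std::isfinite(scale);
  return has_non_finite ? 0 : 1;  // returns 0 for this input: the NaN was detected
}
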
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index 34d5c9d..bc7ab88 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -51,8 +51,11 @@
  * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
  * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
  * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
+ * 
+ * The status of the computation can be retrieved using the \a info() method. Unless \a info() returns \a Success, the results should not be
+ * considered well defined.
  *  
- * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
+ * If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will return \a InvalidInput, but the computation is guaranteed to
  * terminate in finite (and reasonable) time.
  * \sa class BDCSVD, class JacobiSVD
  */
@@ -97,7 +100,7 @@
    */
   const MatrixUType& matrixU() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
     return m_matrixU;
   }
@@ -113,7 +116,7 @@
    */
   const MatrixVType& matrixV() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     eigen_assert(computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
     return m_matrixV;
   }
@@ -125,14 +128,14 @@
    */
   const SingularValuesType& singularValues() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     return m_singularValues;
   }
 
   /** \returns the number of singular values that are not exactly 0 */
   Index nonzeroSingularValues() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     return m_nonzeroSingularValues;
   }
   
@@ -145,7 +148,7 @@
   inline Index rank() const
   {
     using std::abs;
-    eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+    _check_compute_assertions();
     if(m_singularValues.size()==0) return 0;
     RealScalar premultiplied_threshold = numext::maxi<RealScalar>(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
     Index i = m_nonzeroSingularValues-1;
@@ -224,6 +227,18 @@
   solve(const MatrixBase<Rhs>& b) const;
   #endif
 
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful.
+   */
+  EIGEN_DEVICE_FUNC
+  ComputationInfo info() const
+  {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    return m_info;
+  }
+
   #ifndef EIGEN_PARSED_BY_DOXYGEN
   template<typename RhsType, typename DstType>
   void _solve_impl(const RhsType &rhs, DstType &dst) const;
@@ -233,26 +248,31 @@
   #endif
 
 protected:
-  
+
   static void check_template_parameters()
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
 
+  void _check_compute_assertions() const {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+  }
+
   template<bool Transpose_, typename Rhs>
   void _check_solve_assertion(const Rhs& b) const {
       EIGEN_ONLY_USED_FOR_DEBUG(b);
-      eigen_assert(m_isInitialized && "SVD is not initialized.");
+      _check_compute_assertions();
       eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice).");
       eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b");
   }
-  
+
   // return true if already allocated
   bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
 
   MatrixUType m_matrixU;
   MatrixVType m_matrixV;
   SingularValuesType m_singularValues;
+  ComputationInfo m_info;
   bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
   bool m_computeFullU, m_computeThinU;
   bool m_computeFullV, m_computeThinV;
@@ -265,7 +285,8 @@
    * Default constructor of SVDBase
    */
   SVDBase()
-    : m_isInitialized(false),
+    : m_info(Success),
+      m_isInitialized(false),
       m_isAllocated(false),
       m_usePrescribedThreshold(false),
       m_computeFullU(false),
@@ -327,6 +348,7 @@
 
   m_rows = rows;
   m_cols = cols;
+  m_info = Success;
   m_isInitialized = false;
   m_isAllocated = true;
   m_computationOptions = computationOptions;
diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h
index 935a604..63a52a6 100644
--- a/Eigen/src/plugins/BlockMethods.h
+++ b/Eigen/src/plugins/BlockMethods.h
@@ -1437,7 +1437,6 @@
   * \sa subVector(Index)
   */
 template<DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
 Index subVectors() const
 { return (Direction==Vertical)?cols():rows(); }
-
diff --git a/bench/bench_norm.cpp b/bench/bench_norm.cpp
index a861539..592f25d 100644
--- a/bench/bench_norm.cpp
+++ b/bench/bench_norm.cpp
@@ -111,12 +111,12 @@
     int nbig, ibeta, it, iemin, iemax, iexp;
     Scalar abig, eps;
 
-    nbig  = std::numeric_limits<int>::max();            // largest integer
-    ibeta = std::numeric_limits<Scalar>::radix; //NumTraits<Scalar>::Base;                    // base for floating-point numbers
-    it    = std::numeric_limits<Scalar>::digits; //NumTraits<Scalar>::Mantissa;                // number of base-beta digits in mantissa
-    iemin = std::numeric_limits<Scalar>::min_exponent;  // minimum exponent
-    iemax = std::numeric_limits<Scalar>::max_exponent;  // maximum exponent
-    rbig  = std::numeric_limits<Scalar>::max();         // largest floating-point number
+    nbig  = NumTraits<int>::highest();          // largest integer
+    ibeta = std::numeric_limits<Scalar>::radix; // NumTraits<Scalar>::Base;                    // base for floating-point numbers
+    it    = NumTraits<Scalar>::digits();        // NumTraits<Scalar>::Mantissa;                // number of base-beta digits in mantissa
+    iemin = NumTraits<Scalar>::min_exponent();  // minimum exponent
+    iemax = NumTraits<Scalar>::max_exponent();  // maximum exponent
+    rbig  = NumTraits<Scalar>::highest();       // largest floating-point number
 
     // Check the basic machine-dependent constants.
     if(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5)
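
The substitutions above use Eigen's NumTraits, which exposes the same quantities as std::numeric_limits but also works for Eigen-specific scalar types such as half and bfloat16. A short correspondence sketch (illustrative only, not part of the patch):

double hi   = Eigen::NumTraits<double>::highest();       // std::numeric_limits<double>::max()
double lo   = Eigen::NumTraits<double>::lowest();        // std::numeric_limits<double>::lowest()
double eps  = Eigen::NumTraits<double>::epsilon();       // std::numeric_limits<double>::epsilon()
int    dig  = Eigen::NumTraits<double>::digits();        // std::numeric_limits<double>::digits
int    emin = Eigen::NumTraits<double>::min_exponent();  // std::numeric_limits<double>::min_exponent
int    emax = Eigen::NumTraits<double>::max_exponent();  // std::numeric_limits<double>::max_exponent
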
diff --git a/cmake/EigenSmokeTestList.cmake b/cmake/EigenSmokeTestList.cmake
new file mode 100644
index 0000000..6f0f724
--- /dev/null
+++ b/cmake/EigenSmokeTestList.cmake
@@ -0,0 +1,131 @@
+# List of tests that will be built and run during Eigen's smoke testing. If one
+# of these tests doesn't exist or cannot be built with the current configuration
+# it will just be skipped.
+set(ei_smoke_test_list
+  adjoint_1
+  alignedvector3
+  array_cwise_7
+  array_cwise_8
+  array_for_matrix_1
+  array_of_string
+  array_replicate_1
+  array_reverse_1
+  autodiff_1
+  autodiff_scalar_1
+  bandmatrix
+  bdcsvd_9
+  bessel_functions_1
+  bfloat16_float
+  blasutil_1
+  block_5
+  BVH
+  cholesky_1
+  cholmod_support_23
+  cholmod_support_24
+  conservative_resize_1
+  constructor_1
+  corners_1
+  ctorleakmiscmatrices_4
+  dense_storage
+  determinant_1
+  diagonal_1
+  diagonal_2
+  diagonalmatrices_1
+  dynalloc
+  eigensolver_complex_1
+  eigensolver_selfadjoint_8
+  EulerAngles_1
+  exceptions
+  fastmath
+  first_aligned
+  geo_alignedbox_2
+  geo_eulerangles_1
+  geo_homogeneous_1
+  geo_hyperplane_1
+  geo_orthomethods_1
+  geo_parametrizedline_1
+  geo_transformations_7
+  half_float
+  hessenberg_1
+  hessenberg_6qr_10
+  householder_8
+  indexed_view_1
+  inplace_decomposition_1
+  integer_types_1
+  inverse_1
+  is_same_dense
+  jacobi_1
+  jacobisvd_1
+  kronecker_product
+  linearstructure_1
+  mapped_matrix_1
+  mapstaticmethods_1
+  mapstride_1
+  matrix_square_root_1
+  meta
+  minres_2
+  miscmatrices_1
+  mixingtypes_7
+  nestbyvalue
+  nesting_ops_1
+  nomalloc_1
+  nullary_1
+  num_dimensions
+  NumericalDiff
+  numext
+  packetmath
+  permutationmatrices_1
+  polynomialsolver_1
+  prec_inverse_4x4_1
+  product_extra_5
+  product_selfadjoint_1
+  product_small_7
+  product_symm_1
+  product_syrk_1
+  product_trmm_1
+  product_trmv_1
+  product_trsolve_5
+  qr_1
+  qr_colpivoting_7
+  qr_fullpivoting_4
+  rand
+  real_qz_1
+  redux_1
+  ref_1
+  resize
+  rvalue_types_1
+  schur_complex_1
+  schur_real_1
+  selfadjoint_1
+  sizeof
+  sizeoverflow
+  smallvectors
+  sparse_basic_3
+  sparse_block_1
+  sparse_extra_4
+  sparse_permutations_2
+  sparse_product_4
+  sparse_ref_1
+  sparse_solvers_1
+  sparse_vector_1
+  special_functions_1
+  special_numbers_1
+  special_packetmath_1
+  spqr_support_2
+  stable_norm_1
+  stddeque_1
+  stddeque_overload_1
+  stdlist_1
+  stdlist_overload_1
+  stdvector_1
+  stdvector_overload_1
+  stl_iterators_1
+  swap_1
+  symbolic_index_1
+  triangular_1
+  type_aliaslu_9
+  umeyama_3
+  unalignedassert
+  unalignedcount
+  vectorwiseop_1
+  visitor_1)
\ No newline at end of file
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index fae5bc1..0808446 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -18,6 +18,11 @@
     set(filename ${testname}.cpp)
   endif()
 
+  # Add the current target to the list of subtest targets
+  get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST)
+  set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n")
+  set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+
   if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
     if(EIGEN_TEST_HIP)
       hip_reset_flags()
@@ -413,11 +418,13 @@
   define_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS BRIEF_DOCS " " FULL_DOCS " ")
   define_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY BRIEF_DOCS " " FULL_DOCS " ")
   define_property(GLOBAL PROPERTY EIGEN_TESTS_LIST BRIEF_DOCS " " FULL_DOCS " ")
+  define_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST BRIEF_DOCS " " FULL_DOCS " ")
 
   set_property(GLOBAL PROPERTY EIGEN_TESTED_BACKENDS "")
   set_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS "")
   set_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY "")
   set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "")
+  set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "")
 
   define_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT BRIEF_DOCS " " FULL_DOCS " ")
   define_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT BRIEF_DOCS " " FULL_DOCS " ")
@@ -708,3 +715,56 @@
     add_dependencies("${current_target}" "${curr_test}")
   endforeach()
 endmacro(ei_split_testsuite num_splits)
+
+# Defines the custom target buildsmoketests to build a number of tests
+# specified in smoke_test_list.
+#
+# Tests in smoke_test_list can be either test targets (e.g. packetmath) or
+# subtest targets (e.g. packetmath_2). If any of the tests are not available
+# in the current configuration they are just skipped.
+#
+# All tests added via this macro are labeled with the smoketest label. This
+# allows running only the smoke tests via ctest.
+#
+# Smoke tests are intended to be run before the whole test suite is invoked,
+# e.g., to smoke test patches.
+macro(ei_add_smoke_tests smoke_test_list)
+  # Set the build target to build smoketests
+  set(buildtarget "buildsmoketests")
+  add_custom_target("${buildtarget}")
+
+  # Get list of all tests and translate it into a CMake list
+  get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
+  string(REGEX REPLACE "\n" " " EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  separate_arguments(EIGEN_TESTS_LIST)
+
+  # Check if the test in smoke_test_list is a currently valid test target
+  foreach(test IN ITEMS ${smoke_test_list})
+    # Add tests in smoke_test_list to our smoke test target, but only if the test
+    # is currently available, i.e., is in EIGEN_TESTS_LIST
+    if ("${test}" IN_LIST EIGEN_TESTS_LIST)
+      add_dependencies("${buildtarget}" "${test}")
+      # In the case of a test we match all subtests
+      set(ctest_regex "${ctest_regex}^${test}_[0-9]+$$|")
+    endif()
+  endforeach()
+
+  # Get list of all subtests and translate it into a CMake list
+  get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST)
+  string(REGEX REPLACE "\n" " " EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+  set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+  separate_arguments(EIGEN_SUBTESTS_LIST)
+
+  # Check if the test in smoke_test_list is a currently valid subtest target
+  foreach(test IN ITEMS ${smoke_test_list})
+    # Add tests in smoke_test_list to our smoke test target but only if the test
+    # is currently available, i.e., is in EIGEN_SUBTESTS_LIST
+    if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST)
+      add_dependencies("${buildtarget}" "${test}")
+      # Add label smoketest to be able to run smoketests using ctest
+      get_property(test_labels TEST ${test} PROPERTY LABELS)
+      set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest")
+    endif()
+  endforeach()
+endmacro(ei_add_smoke_tests)
\ No newline at end of file
diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h
index 889e6cc..0f8e70d 100644
--- a/test/AnnoyingScalar.h
+++ b/test/AnnoyingScalar.h
@@ -76,20 +76,20 @@
 
     AnnoyingScalar operator/(const AnnoyingScalar& other) const
     { return AnnoyingScalar((*v)/(*other.v)); }
-    
+
     AnnoyingScalar& operator+=(const AnnoyingScalar& other) { *v += *other.v; return *this; }
     AnnoyingScalar& operator-=(const AnnoyingScalar& other) { *v -= *other.v; return *this; }
     AnnoyingScalar& operator*=(const AnnoyingScalar& other) { *v *= *other.v; return *this; }
     AnnoyingScalar& operator/=(const AnnoyingScalar& other) { *v /= *other.v; return *this; }
     AnnoyingScalar& operator= (const AnnoyingScalar& other) { *v  = *other.v; return *this; }
-  
+
     bool operator==(const AnnoyingScalar& other) const { return *v == *other.v; }
     bool operator!=(const AnnoyingScalar& other) const { return *v != *other.v; }
     bool operator<=(const AnnoyingScalar& other) const { return *v <= *other.v; }
     bool operator< (const AnnoyingScalar& other) const { return *v <  *other.v; }
     bool operator>=(const AnnoyingScalar& other) const { return *v >= *other.v; }
     bool operator> (const AnnoyingScalar& other) const { return *v >  *other.v; }
-    
+  
     float* v;
     float data;
     static int instances;
@@ -136,12 +136,23 @@
 
 template<> inline AnnoyingScalar test_precision<AnnoyingScalar>() { return test_precision<float>(); }
 
-namespace internal {
-  template<> double cast(const AnnoyingScalar& x) { return double(*x.v); }
-  template<> float  cast(const AnnoyingScalar& x) { return *x.v; }
+namespace numext {
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const AnnoyingScalar& x) {
+  return (numext::isfinite)(*x.v);
+}
 }
 
+namespace internal {
+  template<> EIGEN_STRONG_INLINE AnnoyingScalar pcmp_eq(const AnnoyingScalar& a, const AnnoyingScalar& b)
+  { return AnnoyingScalar(pcmp_eq(*a.v, *b.v)); }
+  template<> EIGEN_STRONG_INLINE AnnoyingScalar pselect(const AnnoyingScalar& mask, const AnnoyingScalar& a, const AnnoyingScalar& b)
+  { return numext::equal_strict(*mask.v, 0.f) ? b : a; }
+  template<> EIGEN_STRONG_INLINE double cast(const AnnoyingScalar& x) { return double(*x.v); }
+  template<> EIGEN_STRONG_INLINE float  cast(const AnnoyingScalar& x) { return *x.v; }
 }
+}  // namespace Eigen
 
 AnnoyingScalar get_test_precision(const AnnoyingScalar&)
 { return Eigen::test_precision<AnnoyingScalar>(); }
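
The new specializations above exist because generic Eigen code now calls numext::isfinite (for the InvalidInput checks) and the scalar fall-backs of pcmp_eq/pselect even when the "packet" is a single custom scalar. For reference, the scalar pselect convention the AnnoyingScalar specialization mirrors (illustrative only; these are internal helpers):

#include <Eigen/Core>

// A mask comparing equal to 0 selects the third argument, anything else the second.
float r1 = Eigen::internal::pselect(1.0f, 10.0f, 20.0f);  // mask != 0  ->  10.0f
float r2 = Eigen::internal::pselect(0.0f, 10.0f, 20.0f);  // mask == 0  ->  20.0f
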
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f6390e8..56664e7 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -460,3 +460,7 @@
 if(EIGEN_TEST_BUILD_DOCUMENTATION)
   add_dependencies(buildtests doc)
 endif()
+
+# Register all smoke tests
+include("EigenSmokeTestList")
+ei_add_smoke_tests("${ei_smoke_test_list}")
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 6a88e0e..1bc8e19 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -14,18 +14,18 @@
 template<typename Scalar>
 void pow_test() {
   const Scalar zero = Scalar(0);
-  const Scalar eps = std::numeric_limits<Scalar>::epsilon();
+  const Scalar eps = Eigen::NumTraits<Scalar>::epsilon();
   const Scalar one = Scalar(1);
   const Scalar two = Scalar(2);
   const Scalar three = Scalar(3);
   const Scalar sqrt_half = Scalar(std::sqrt(0.5));
   const Scalar sqrt2 = Scalar(std::sqrt(2));
-  const Scalar inf = std::numeric_limits<Scalar>::infinity();
-  const Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  const Scalar inf = Eigen::NumTraits<Scalar>::infinity();
+  const Scalar nan = Eigen::NumTraits<Scalar>::quiet_NaN();
   const Scalar denorm_min = std::numeric_limits<Scalar>::denorm_min();
   const Scalar min = (std::numeric_limits<Scalar>::min)();
   const Scalar max = (std::numeric_limits<Scalar>::max)();
-  const Scalar max_exp = (static_cast<Scalar>(int(std::numeric_limits<Scalar>::max_exponent)) * Scalar(EIGEN_LN2)) / eps;
+  const Scalar max_exp = (static_cast<Scalar>(int(Eigen::NumTraits<Scalar>::max_exponent())) * Scalar(EIGEN_LN2)) / eps;
 
   const static Scalar abs_vals[] = {zero,
                                     denorm_min,
@@ -613,7 +613,7 @@
 
   // min/max with various NaN propagation options.
   if (m1.size() > 1 && !NumTraits<Scalar>::IsInteger) {
-    m1(0,0) = std::numeric_limits<Scalar>::quiet_NaN();
+    m1(0,0) = NumTraits<Scalar>::quiet_NaN();
     maxM1 = m1.template maxCoeff<PropagateNaN>();
     minM1 = m1.template minCoeff<PropagateNaN>();
     VERIFY((numext::isnan)(maxM1));
diff --git a/test/commainitializer.cpp b/test/commainitializer.cpp
index f535117..eb275be 100644
--- a/test/commainitializer.cpp
+++ b/test/commainitializer.cpp
@@ -73,8 +73,7 @@
   }
 };
 
-EIGEN_DECLARE_TEST(commainitializer)
-{
+void test_basics() {
   Matrix3d m3;
   Matrix4d m4;
 
@@ -107,8 +106,13 @@
         4, 5, 6,
         vec[2].transpose();
   VERIFY_IS_APPROX(m3, ref);
+}
 
+EIGEN_DECLARE_TEST(commainitializer)
+{
+
+  CALL_SUBTEST_1(test_basics());
 
   // recursively test all block-sizes from 0 to 3:
-  test_block_recursion<8>::run();
+  CALL_SUBTEST_2(test_block_recursion<8>::run());
 }
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index 6cfdb6f..c4a3162 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -332,7 +332,9 @@
     CALL_SUBTEST_2(( quaternionAlignment<double>() ));
     CALL_SUBTEST_2( mapQuaternion<double>() );
 
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
     AnnoyingScalar::dont_throw = true;
+#endif
     CALL_SUBTEST_3(( quaternion<AnnoyingScalar,AutoAlign>() ));
   }
 }
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index b2e657e..bf8dcac 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -343,6 +343,21 @@
   }
 };
 
+template<typename T>
+struct numeric_limits_test {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    EIGEN_UNUSED_VARIABLE(in)
+    int out_idx = i * 5;
+    out[out_idx++] = numext::numeric_limits<float>::epsilon();
+    out[out_idx++] = (numext::numeric_limits<float>::max)();
+    out[out_idx++] = (numext::numeric_limits<float>::min)();
+    out[out_idx++] = numext::numeric_limits<float>::infinity();
+    out[out_idx++] = numext::numeric_limits<float>::quiet_NaN();
+  }
+};
+
 template<typename Type1, typename Type2>
 bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
 {
@@ -434,6 +449,9 @@
   CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
   CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
 
+  // numeric_limits
+  CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
+
 #if defined(__NVCC__)
   // FIXME
   // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
diff --git a/test/main.h b/test/main.h
index 3e80c9f..07f3794 100644
--- a/test/main.h
+++ b/test/main.h
@@ -177,20 +177,22 @@
     EigenTest(const char* a_name, void (*func)(void))
       : m_name(a_name), m_func(func)
     {
-      ms_registered_tests.push_back(this);
+      get_registered_tests().push_back(this);
     }
     const std::string& name() const { return m_name; }
     void operator()() const { m_func(); }
 
-    static const std::vector<EigenTest*>& all() { return ms_registered_tests; }
+    static const std::vector<EigenTest*>& all() { return get_registered_tests(); }
   protected:
+    static std::vector<EigenTest*>& get_registered_tests()
+    {
+      static std::vector<EigenTest*>* ms_registered_tests = new std::vector<EigenTest*>();
+      return *ms_registered_tests;
+    }
     std::string m_name;
     void (*m_func)(void);
-    static std::vector<EigenTest*> ms_registered_tests;
   };
 
-  std::vector<EigenTest*> EigenTest::ms_registered_tests;
-
   // Declare and register a test, e.g.:
   //    EIGEN_DECLARE_TEST(mytest) { ... }
   // will create a function:
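
The registry change above applies the construct-on-first-use idiom: the vector of registered tests is created the first time a static EigenTest object registers itself, so registration from other translation units can never observe an uninitialized container, and the deliberate heap allocation sidesteps destruction-order issues at exit. A generic sketch of the pattern (not the test harness itself):

#include <string>
#include <vector>

struct Registrar {
  explicit Registrar(const std::string& name) { registry().push_back(name); }
  static std::vector<std::string>& registry() {
    // Heap-allocated and intentionally leaked: initialized on first use,
    // never destroyed while late registrations or at-exit users may run.
    static std::vector<std::string>* items = new std::vector<std::string>();
    return *items;
  }
};

static Registrar reg_foo("foo");  // safe even though it runs before main()
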
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index aad63ec..d450dbf 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -311,6 +311,7 @@
 
 EIGEN_DECLARE_TEST(mixingtypes)
 {
+  g_called = false; // Silence -Wunneeded-internal-declaration.
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(mixingtypes<3>());
     CALL_SUBTEST_2(mixingtypes<4>());
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index f5cce64..67d329a 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -273,7 +273,7 @@
 
   //Test NaN
   for (int i = 0; i < PacketSize; ++i) {
-    data1[i] = std::numeric_limits<Scalar>::quiet_NaN();
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
     data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
   }
   CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
@@ -634,7 +634,7 @@
   if (PacketTraits::HasExp) {
     // Check denormals:
     for (int j=0; j<3; ++j) {
-      data1[0] = Scalar(std::ldexp(1, std::numeric_limits<Scalar>::min_exponent-j));
+      data1[0] = Scalar(std::ldexp(1, NumTraits<Scalar>::min_exponent()-j));
       CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
       data1[0] = -data1[0];
       CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
@@ -671,10 +671,10 @@
   if (PacketTraits::HasExp) {
     data1[0] = Scalar(-1);
     // underflow to zero
-    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-55);
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::min_exponent()-55);
     CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
     // overflow to inf
-    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::max_exponent+10);
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::max_exponent()+10);
     CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
     // NaN stays NaN
     data1[0] = NumTraits<Scalar>::quiet_NaN();
@@ -682,21 +682,21 @@
     VERIFY((numext::isnan)(data2[0]));
     // inf stays inf
     data1[0] = NumTraits<Scalar>::infinity();
-    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-10);
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::min_exponent()-10);
     CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
     // zero stays zero
     data1[0] = Scalar(0);
-    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::max_exponent+10);
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::max_exponent()+10);
     CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
     // Small number big exponent.
-    data1[0] = Scalar(std::ldexp(Scalar(1.0), std::numeric_limits<Scalar>::min_exponent-1));
-    data1[PacketSize] = Scalar(-std::numeric_limits<Scalar>::min_exponent
-                               +std::numeric_limits<Scalar>::max_exponent);
+    data1[0] = Scalar(std::ldexp(Scalar(1.0), NumTraits<Scalar>::min_exponent()-1));
+    data1[PacketSize] = Scalar(-NumTraits<Scalar>::min_exponent()
+                               +NumTraits<Scalar>::max_exponent());
     CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
     // Big number small exponent.
-    data1[0] = Scalar(std::ldexp(Scalar(1.0), std::numeric_limits<Scalar>::max_exponent-1));
-    data1[PacketSize] = Scalar(+std::numeric_limits<Scalar>::min_exponent
-                               -std::numeric_limits<Scalar>::max_exponent);
+    data1[0] = Scalar(std::ldexp(Scalar(1.0), NumTraits<Scalar>::max_exponent()-1));
+    data1[PacketSize] = Scalar(+NumTraits<Scalar>::min_exponent()
+                               -NumTraits<Scalar>::max_exponent());
     CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
   }
 
@@ -707,8 +707,8 @@
   data1[0] = Scalar(1e-20);
   CHECK_CWISE1_IF(PacketTraits::HasTanh, std::tanh, internal::ptanh);
   if (PacketTraits::HasExp && PacketSize >= 2) {
-    const Scalar small = std::numeric_limits<Scalar>::epsilon();
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    const Scalar small = NumTraits<Scalar>::epsilon();
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
     data1[1] = small;
     test::packet_helper<PacketTraits::HasExp, Packet> h;
     h.store(data2, internal::pexp(h.load(data1)));
@@ -742,7 +742,7 @@
 
   if (PacketTraits::HasTanh) {
     // NOTE this test migh fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
     test::packet_helper<internal::packet_traits<Scalar>::HasTanh, Packet> h;
     h.store(data2, internal::ptanh(h.load(data1)));
     VERIFY((numext::isnan)(data2[0]));
@@ -762,17 +762,17 @@
   }
 
 #if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11)
-  data1[0] = std::numeric_limits<Scalar>::infinity();
+  data1[0] = NumTraits<Scalar>::infinity();
   data1[1] = Scalar(-1);
   CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p);
-  data1[0] = std::numeric_limits<Scalar>::infinity();
-  data1[1] = -std::numeric_limits<Scalar>::infinity();
+  data1[0] = NumTraits<Scalar>::infinity();
+  data1[1] = -NumTraits<Scalar>::infinity();
   CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1);
 #endif
 
   if (PacketSize >= 2) {
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    data1[1] = std::numeric_limits<Scalar>::epsilon();
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    data1[1] = NumTraits<Scalar>::epsilon();
     if (PacketTraits::HasLog) {
       test::packet_helper<PacketTraits::HasLog, Packet> h;
       h.store(data2, internal::plog(h.load(data1)));
@@ -782,7 +782,7 @@
         VERIFY_IS_APPROX(std::log(data1[1]), data2[1]);
       }
 
-      data1[0] = -std::numeric_limits<Scalar>::epsilon();
+      data1[0] = -NumTraits<Scalar>::epsilon();
       data1[1] = Scalar(0);
       h.store(data2, internal::plog(h.load(data1)));
       VERIFY((numext::isnan)(data2[0]));
@@ -813,14 +813,14 @@
       h.store(data2, internal::plog(h.load(data1)));
       VERIFY((numext::isnan)(data2[0]));
 
-      data1[0] = std::numeric_limits<Scalar>::infinity();
+      data1[0] = NumTraits<Scalar>::infinity();
       h.store(data2, internal::plog(h.load(data1)));
       VERIFY((numext::isinf)(data2[0]));
     }
     if (PacketTraits::HasLog1p) {
       test::packet_helper<PacketTraits::HasLog1p, Packet> h;
       data1[0] = Scalar(-2);
-      data1[1] = -std::numeric_limits<Scalar>::infinity();
+      data1[1] = -NumTraits<Scalar>::infinity();
       h.store(data2, internal::plog1p(h.load(data1)));
       VERIFY((numext::isnan)(data2[0]));
       VERIFY((numext::isnan)(data2[1]));
@@ -831,7 +831,7 @@
       if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
         data1[1] = -std::numeric_limits<Scalar>::denorm_min();
       } else {
-        data1[1] = -std::numeric_limits<Scalar>::epsilon();
+        data1[1] = -NumTraits<Scalar>::epsilon();
       }
       h.store(data2, internal::psqrt(h.load(data1)));
       VERIFY((numext::isnan)(data2[0]));
@@ -842,7 +842,7 @@
         && !internal::is_same<Scalar, half>::value
         && !internal::is_same<Scalar, bfloat16>::value) {
       test::packet_helper<PacketTraits::HasCos, Packet> h;
-      for (Scalar k = Scalar(1); k < Scalar(10000) / std::numeric_limits<Scalar>::epsilon(); k *= Scalar(2)) {
+      for (Scalar k = Scalar(1); k < Scalar(10000) / NumTraits<Scalar>::epsilon(); k *= Scalar(2)) {
         for (int k1 = 0; k1 <= 1; ++k1) {
           data1[0] = Scalar((2 * double(k) + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
           data1[1] = Scalar((2 * double(k) + 2 + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
@@ -863,8 +863,8 @@
         }
       }
 
-      data1[0] = std::numeric_limits<Scalar>::infinity();
-      data1[1] = -std::numeric_limits<Scalar>::infinity();
+      data1[0] = NumTraits<Scalar>::infinity();
+      data1[1] = -NumTraits<Scalar>::infinity();
       h.store(data2, internal::psin(h.load(data1)));
       VERIFY((numext::isnan)(data2[0]));
       VERIFY((numext::isnan)(data2[1]));
@@ -873,7 +873,7 @@
       VERIFY((numext::isnan)(data2[0]));
       VERIFY((numext::isnan)(data2[1]));
 
-      data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+      data1[0] = NumTraits<Scalar>::quiet_NaN();
       h.store(data2, internal::psin(h.load(data1)));
       VERIFY((numext::isnan)(data2[0]));
       h.store(data2, internal::pcos(h.load(data1)));
@@ -997,13 +997,13 @@
     VERIFY(internal::isApprox(ref[0], internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))) && "internal::predux_max<PropagateNumbers>");
     // A single NaN.
     const size_t index = std::numeric_limits<size_t>::quiet_NaN() % PacketSize;
-    data1[index] = std::numeric_limits<Scalar>::quiet_NaN();
+    data1[index] = NumTraits<Scalar>::quiet_NaN();
     VERIFY(PacketSize==1 || !(numext::isnan)(internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))));
     VERIFY((numext::isnan)(internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))));
     VERIFY(PacketSize==1 || !(numext::isnan)(internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))));
     VERIFY((numext::isnan)(internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))));
     // All NaNs.
-    for (int i = 0; i < 4 * PacketSize; ++i) data1[i] = std::numeric_limits<Scalar>::quiet_NaN();
+    for (int i = 0; i < 4 * PacketSize; ++i) data1[i] = NumTraits<Scalar>::quiet_NaN();
     VERIFY((numext::isnan)(internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))));
     VERIFY((numext::isnan)(internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))));
     VERIFY((numext::isnan)(internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))));
@@ -1011,8 +1011,8 @@
 
     // Test NaN propagation for coefficient-wise min and max.
     for (int i = 0; i < PacketSize; ++i) {
-      data1[i] = internal::random<bool>() ? std::numeric_limits<Scalar>::quiet_NaN() : Scalar(0);
-      data1[i + PacketSize] = internal::random<bool>() ? std::numeric_limits<Scalar>::quiet_NaN() : Scalar(0);
+      data1[i] = internal::random<bool>() ? NumTraits<Scalar>::quiet_NaN() : Scalar(0);
+      data1[i + PacketSize] = internal::random<bool>() ? NumTraits<Scalar>::quiet_NaN() : Scalar(0);
     }
     // Note: NaN propagation is implementation defined for pmin/pmax, so we do not test it here.
     CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, (internal::pmin<PropagateNumbers>));
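
The hunks above swap std::numeric_limits<Scalar> for NumTraits<Scalar> when producing NaN, infinity and epsilon values. NumTraits yields the same special values for built-in scalars but is also specialized for Eigen's extra scalar types (half, bfloat16), which these packet tests are instantiated with. A minimal sketch of the equivalence, assuming only Eigen/Core (the sketch is not part of the patch):

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      using Eigen::NumTraits;
      // For built-in scalars these match std::numeric_limits exactly.
      std::cout << NumTraits<float>::epsilon()   << "\n";
      std::cout << NumTraits<float>::infinity()  << "\n";
      std::cout << NumTraits<float>::quiet_NaN() << "\n";
      // NumTraits is also specialized for Eigen::half, for which a
      // std::numeric_limits specialization may not exist on every setup.
      std::cout << static_cast<float>(NumTraits<Eigen::half>::epsilon()) << "\n";
      return 0;
    }
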
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 578dfb5..3d0204b 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -112,6 +112,11 @@
     CALL_SUBTEST_1( test_aliasing<float>() );
 
     CALL_SUBTEST_6( bug_1622<1>() );
+
+    CALL_SUBTEST_7( product(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_8( product(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_9( product(Matrix<std::complex<float>,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_10( product(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
 
   CALL_SUBTEST_6( product_large_regressions<0>() );
diff --git a/test/svd_common.h b/test/svd_common.h
index 5c0f2a0..bd62edc 100644
--- a/test/svd_common.h
+++ b/test/svd_common.h
@@ -298,7 +298,8 @@
 // workaround aggressive optimization in ICC
 template<typename T> EIGEN_DONT_INLINE  T sub(T a, T b) { return a - b; }
 
-// all this function does is verify we don't iterate infinitely on nan/inf values
+// This function verifies we don't iterate infinitely on nan/inf values,
+// and that info() returns InvalidInput.
 template<typename SvdType, typename MatrixType>
 void svd_inf_nan()
 {
@@ -307,18 +308,22 @@
   Scalar some_inf = Scalar(1) / zero<Scalar>();
   VERIFY(sub(some_inf, some_inf) != sub(some_inf, some_inf));
   svd.compute(MatrixType::Constant(10,10,some_inf), ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
 
   Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
   VERIFY(nan != nan);
   svd.compute(MatrixType::Constant(10,10,nan), ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
 
   MatrixType m = MatrixType::Zero(10,10);
   m(internal::random<int>(0,9), internal::random<int>(0,9)) = some_inf;
   svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
 
   m = MatrixType::Zero(10,10);
   m(internal::random<int>(0,9), internal::random<int>(0,9)) = nan;
   svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
   
   // regression test for bug 791
   m.resize(3,3);
@@ -326,6 +331,7 @@
        0,   -0.5,                             0,
        nan,  0,                               0;
   svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
   
   m.resize(4,4);
   m <<  1, 0, 0, 0,
@@ -333,6 +339,7 @@
         1, 0, 1, nan,
         0, nan, nan, 0;
   svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
 }
 
 // Regression test for bug 286: JacobiSVD loops indefinitely with some
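
The VERIFY(svd.info() == InvalidInput) checks added above rely on the decomposition flagging unusable input rather than merely terminating. A minimal usage sketch of that pattern, assuming this Eigen revision where SVDBase::info() reports InvalidInput for NaN/Inf input as the tests expect (the sketch is not part of the patch):

    #include <Eigen/SVD>
    #include <iostream>

    int main() {
      Eigen::MatrixXd m = Eigen::MatrixXd::Zero(3, 3);
      m(1, 2) = Eigen::NumTraits<double>::quiet_NaN();  // poison one entry
      Eigen::JacobiSVD<Eigen::MatrixXd> svd(m, Eigen::ComputeFullU | Eigen::ComputeFullV);
      if (svd.info() == Eigen::InvalidInput) {
        std::cout << "input contained NaN/Inf; do not use U, S, V\n";
      }
      return 0;
    }
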
diff --git a/test/visitor.cpp b/test/visitor.cpp
index de938fc..20fb2c3 100644
--- a/test/visitor.cpp
+++ b/test/visitor.cpp
@@ -56,9 +56,44 @@
   VERIFY_IS_APPROX(maxc, m.maxCoeff());
 
   eigen_maxc = (m.adjoint()*m).maxCoeff(&eigen_maxrow,&eigen_maxcol);
-  eigen_maxc = (m.adjoint()*m).eval().maxCoeff(&maxrow,&maxcol);
-  VERIFY(maxrow == eigen_maxrow);
-  VERIFY(maxcol == eigen_maxcol);
+  Index maxrow2=0,maxcol2=0;
+  eigen_maxc = (m.adjoint()*m).eval().maxCoeff(&maxrow2,&maxcol2);
+  VERIFY(maxrow2 == eigen_maxrow);
+  VERIFY(maxcol2 == eigen_maxcol);
+
+  if (!NumTraits<Scalar>::IsInteger && m.size() > 2) {
+    // Test NaN propagation by replacing an element with NaN.
+    bool stop = false;
+    for (Index j = 0; j < cols && !stop; ++j) {
+      for (Index i = 0; i < rows && !stop; ++i) {
+        if (!(j == mincol && i == minrow) &&
+            !(j == maxcol && i == maxrow)) {
+          m(i,j) = NumTraits<Scalar>::quiet_NaN();
+          stop = true;
+          break;
+        }
+      }
+    }
+
+    eigen_minc = m.template minCoeff<PropagateNumbers>(&eigen_minrow, &eigen_mincol);
+    eigen_maxc = m.template maxCoeff<PropagateNumbers>(&eigen_maxrow, &eigen_maxcol);
+    VERIFY(minrow == eigen_minrow);
+    VERIFY(maxrow == eigen_maxrow);
+    VERIFY(mincol == eigen_mincol);
+    VERIFY(maxcol == eigen_maxcol);
+    VERIFY_IS_APPROX(minc, eigen_minc);
+    VERIFY_IS_APPROX(maxc, eigen_maxc);
+    VERIFY_IS_APPROX(minc, m.template minCoeff<PropagateNumbers>());
+    VERIFY_IS_APPROX(maxc, m.template maxCoeff<PropagateNumbers>());
+
+    eigen_minc = m.template minCoeff<PropagateNaN>(&eigen_minrow, &eigen_mincol);
+    eigen_maxc = m.template maxCoeff<PropagateNaN>(&eigen_maxrow, &eigen_maxcol);
+    VERIFY(minrow != eigen_minrow || mincol != eigen_mincol);
+    VERIFY(maxrow != eigen_maxrow || maxcol != eigen_maxcol);
+    VERIFY((numext::isnan)(eigen_minc));
+    VERIFY((numext::isnan)(eigen_maxc));
+  }
+
 }
 
 template<typename VectorType> void vectorVisitor(const VectorType& w)
@@ -111,6 +146,31 @@
   v2.maxCoeff(&eigen_maxidx);
   VERIFY(eigen_minidx == (std::min)(idx0,idx1));
   VERIFY(eigen_maxidx == (std::min)(idx0,idx2));
+
+  if (!NumTraits<Scalar>::IsInteger && size > 2) {
+    // Test NaN propagation by replacing an element with NaN.
+    for (Index i = 0; i < size; ++i) {
+      if (i != minidx && i != maxidx) {
+        v(i) = NumTraits<Scalar>::quiet_NaN();
+        break;
+      }
+    }
+    eigen_minc = v.template minCoeff<PropagateNumbers>(&eigen_minidx);
+    eigen_maxc = v.template maxCoeff<PropagateNumbers>(&eigen_maxidx);
+    VERIFY(minidx == eigen_minidx);
+    VERIFY(maxidx == eigen_maxidx);
+    VERIFY_IS_APPROX(minc, eigen_minc);
+    VERIFY_IS_APPROX(maxc, eigen_maxc);
+    VERIFY_IS_APPROX(minc, v.template minCoeff<PropagateNumbers>());
+    VERIFY_IS_APPROX(maxc, v.template maxCoeff<PropagateNumbers>());
+
+    eigen_minc = v.template minCoeff<PropagateNaN>(&eigen_minidx);
+    eigen_maxc = v.template maxCoeff<PropagateNaN>(&eigen_maxidx);
+    VERIFY(minidx != eigen_minidx);
+    VERIFY(maxidx != eigen_maxidx);
+    VERIFY((numext::isnan)(eigen_minc));
+    VERIFY((numext::isnan)(eigen_maxc));
+  }
 }
 
 EIGEN_DECLARE_TEST(visitor)
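
The new visitor checks exercise the NaN-propagation variants of minCoeff/maxCoeff, with and without index output. A short sketch of the user-facing behaviour they verify, assuming the PropagateNumbers/PropagateNaN template arguments used in the tests (the sketch is not part of the patch):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Vector4f v(3.f, -1.f, 7.f, 2.f);
      v(2) = Eigen::NumTraits<float>::quiet_NaN();
      Eigen::Index i;
      // PropagateNumbers skips NaNs: returns -1 and sets i to 1.
      float a = v.minCoeff<Eigen::PropagateNumbers>(&i);
      // PropagateNaN lets any NaN win: returns NaN.
      float b = v.minCoeff<Eigen::PropagateNaN>(&i);
      std::cout << a << " " << b << "\n";
      return 0;
    }
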
diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h b/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
index 753c1b3..b0d5e10 100644
--- a/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
@@ -12,7 +12,7 @@
 
 #include "SkylineUtil.h"
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Skyline_Module
  *
@@ -102,18 +102,18 @@
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
     /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-    inline Index rows() const {
+    inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
         return derived().rows();
     }
 
     /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-    inline Index cols() const {
+    inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
         return derived().cols();
     }
 
     /** \returns the number of coefficients, which is \a rows()*cols().
      * \sa rows(), cols(), SizeAtCompileTime. */
-    inline Index size() const {
+    inline EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT {
         return rows() * cols();
     }
 
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 833edd4..dd786d5 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -101,7 +101,7 @@
   template<typename Scalar>
   inline void putVectorElt(std::complex<Scalar> value, std::ofstream& out)
   {
-    out << value.real << " " << value.imag()<< "\n"; 
+    out << value.real() << " " << value.imag() << "\n";
   }
 
 } // end namespace internal
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index cbb799a..bc681e3 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -161,6 +161,17 @@
   VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2));
 }
 
+template<typename VectorType>
+void check_marketio_vector()
+{
+  Index size = internal::random<Index>(1,100);
+  VectorType v1, v2;
+  v1 = VectorType::Random(size);
+  saveMarketVector(v1, "vector_extra.mtx");
+  loadMarketVector(v2, "vector_extra.mtx");
+  VERIFY_IS_EQUAL(v1,v2);
+}
+
 EIGEN_DECLARE_TEST(sparse_extra)
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -184,6 +195,17 @@
     CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) );
     CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) );
     CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) );
+
+
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,Dynamic,1> >()) );
+
     TEST_SET_BUT_UNUSED_VARIABLE(s);
   }
 }
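
The added subtests round-trip dense row and column vectors through the Matrix Market reader/writer, complementing the real()/imag() fix in MarketIO.h above. A minimal sketch of that usage outside the test harness, relying only on the saveMarketVector/loadMarketVector helpers already called above (the sketch is not part of the patch):

    #include <Eigen/Dense>
    #include <unsupported/Eigen/SparseExtra>
    #include <cassert>

    int main() {
      Eigen::VectorXd v1 = Eigen::VectorXd::Random(10), v2;
      Eigen::saveMarketVector(v1, "vector_extra.mtx");  // write Matrix Market file
      Eigen::loadMarketVector(v2, "vector_extra.mtx");  // read it back, resizing v2
      assert(v1.isApprox(v2));                          // contents survive the round trip
      return 0;
    }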