BEGIN_PUBLIC
Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/9b51dc7972c9f64727e9c8e8db0c60aaf9aae532
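
This picks up the move of the AVX and AVX512 pfrexp/pldexp specializations onto the shared generic implementations (the old AVX512 path also keyed off EIGEN_TEST_AVX512DQ where EIGEN_VECTORIZE_AVX512DQ was intended), plus a rework of the AltiVec MatrixProduct GEMM kernels: packing-loop bounds tightened from < to <= so full vector blocks no longer fall into the scalar remainder paths, fused multiply-add based pger variants, and template-driven unrolling of the micro-kernels.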
END_PUBLIC

PiperOrigin-RevId: 359385295
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 34bc242..dd3f243 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -734,12 +734,13 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
-  return pfrexp_float(a,exponent);
+  return pfrexp_generic(a,exponent);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
-  const Packet4d cst_1022d = pset1<Packet4d>(1022.0);
-  const Packet4d cst_half = pset1<Packet4d>(0.5);
+// Extract the exponent when no Packet4l (64-bit integer packet) exists.
+template<>
+EIGEN_STRONG_INLINE
+Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
   const Packet4d cst_exp_mask  = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
   __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
 #ifdef EIGEN_VECTORIZE_AVX2
@@ -754,15 +755,18 @@
 #endif
   Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
   Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
-  exponent = _mm256_insertf128_pd(exponent, exponent_lo, 0);
+  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
   exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
-  exponent = psub(exponent, cst_1022d);
-  const Packet4d cst_mant_mask  = pset1frombits<Packet4d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
-  return por(pand(a, cst_mant_mask), cst_half);
+  return exponent;
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
+  return pfrexp_generic(a, exponent);
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
-  return pldexp_float(a,exponent);
+  return pldexp_generic(a, exponent);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
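
For reference, a minimal scalar sketch of the biased-exponent extraction that pfrexp_generic_get_biased_exponent performs per lane (illustrative only, assuming the standard IEEE-754 double layout; not part of the patch):

  #include <cstdint>
  #include <cstring>

  // Return the 11-bit biased exponent field of an IEEE-754 double.
  inline int biased_exponent(double x) {
    std::uint64_t bits;
    std::memcpy(&bits, &x, sizeof bits);   // bit-exact reinterpretation
    return static_cast<int>((bits >> 52) & 0x7ff);
  }

pfrexp_generic then subtracts 1022 (as the deleted cst_1022d code did) so the returned mantissa lands in [0.5, 1); e.g. for a = 1.0 the field is 1023 and the result is mantissa 0.5 with exponent 1.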
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index b9e9fdb..f874137 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -895,25 +895,28 @@
 
 template<>
 EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
-  return pfrexp_float(a, exponent);
+  return pfrexp_generic(a, exponent);
+}
+
+// Extract the exponent when no Packet8l (64-bit integer packet) exists.
+template<>
+EIGEN_STRONG_INLINE
+Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
+  const Packet8d cst_exp_mask  = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  #ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
+  #else
+  return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
+  #endif
 }
 
 template<>
 EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
-  const Packet8d cst_1022d = pset1<Packet8d>(1022.0);
-#ifdef EIGEN_TEST_AVX512DQ
-  exponent = psub(_mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), 52)), cst_1022d);
-#else
-  exponent = psub(_mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(a), 52))),
-                                     cst_1022d);
-#endif
-  const Packet8d cst_half = pset1<Packet8d>(0.5);
-  const Packet8d cst_inv_mant_mask  = pset1frombits<Packet8d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
-  return por(pand(a, cst_inv_mant_mask), cst_half);
+  return pfrexp_generic(a, exponent);
 }
 
 template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
-  return pldexp_float(a,exponent);
+  return pldexp_generic(a, exponent);
 }
 
 template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
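
A note on the two paths above: after the logical shift by 52 each 64-bit lane holds the biased exponent, a value in [0, 2047] that always fits in 32 bits. With AVX512DQ, _mm512_cvtepi64_pd converts the eight int64 lanes to double directly; otherwise the lanes are first narrowed with _mm512_cvtepi64_epi32 and converted with _mm512_cvtepi32_pd. Note also that the deleted code guarded this choice with EIGEN_TEST_AVX512DQ rather than EIGEN_VECTORIZE_AVX512DQ; the new code uses the vectorization macro.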
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 53116ad..9d9bbeb 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -70,7 +70,7 @@
 // MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector; this turned out
 // to be faster than Eigen's usual approach of keeping real/imaginary pairs on a single vector. These constants
 // are responsible for converting between Eigen's layout and the MatrixProduct layout.
-const static Packet4f p4f_CONJUGATE = {-1.0f, -1.0f, -1.0f, -1.0f};
+const static Packet4f p4f_CONJUGATE = {float(-1.0), float(-1.0), float(-1.0), float(-1.0)};
 
 const static Packet2d p2d_CONJUGATE = {-1.0, -1.0};
 
@@ -122,7 +122,7 @@
     v.imag(dt(i,j).imag());
   } else {
     v.real(dt(i,j).real());
-    v.imag((Scalar)0.0f);
+    v.imag((Scalar)0.0);
   }
   return v;
 }
@@ -136,7 +136,7 @@
   Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);
 
   Index ri = 0, j = 0;
-  for(; j + vectorSize < cols; j+=vectorSize)
+  for(; j + vectorSize <= cols; j+=vectorSize)
   {
       Index i = k2;
       for(; i < depth; i++)
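
The recurring bound change from < to <= in these packing loops matters because with <, a trailing block that fits exactly is skipped. A minimal sketch (illustrative values, not Eigen code):

  #include <cstdio>

  int main() {
    const int cols = 8, vectorSize = 4;
    int j;
    for (j = 0; j + vectorSize <  cols; j += vectorSize) {}  // old bound
    std::printf("old: remainder starts at %d\n", j);         // prints 4
    for (j = 0; j + vectorSize <= cols; j += vectorSize) {}  // new bound
    std::printf("new: remainder starts at %d\n", j);         // prints 8
    return 0;
  }

With the old bound, the final full vector block (columns 4..7 here) would be handled by the scalar remainder path; with <=, only genuinely partial blocks remain.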
@@ -192,7 +192,7 @@
   Index ri = 0, j = 0;
   Scalar *blockAf = (Scalar *)(blockA);
 
-  for(; j + vectorSize < rows; j+=vectorSize)
+  for(; j + vectorSize <= rows; j+=vectorSize)
   {
       Index i = 0;
 
@@ -247,7 +247,7 @@
   const int vectorSize = quad_traits<Scalar>::vectorsize;
 
   Index ri = 0, j = 0;
-  for(; j + N*vectorSize < cols; j+=N*vectorSize)
+  for(; j + N*vectorSize <= cols; j+=N*vectorSize)
   {
       Index i = k2;
       for(; i < depth; i++)
@@ -284,7 +284,7 @@
   const int vectorSize = quad_traits<Scalar>::vectorsize;
   Index ri = 0, j = 0;
 
-  for(j = 0; j + vectorSize < rows; j+=vectorSize)
+  for(j = 0; j + vectorSize <= rows; j+=vectorSize)
   {
       Index i = 0;
 
@@ -410,15 +410,15 @@
     const int vectorSize = quad_traits<Scalar>::vectorsize;
     Index ri = 0, j = 0;
     Scalar *blockAt  = reinterpret_cast<Scalar *>(blockA);
-    Packet conj = pset1<Packet>((Scalar)-1.0f);
+    Packet conj = pset1<Packet>((Scalar)-1.0);
 
-    for(j = 0; j + vectorSize < rows; j+=vectorSize)
+    for(j = 0; j + vectorSize <= rows; j+=vectorSize)
     {
       Index i = 0;
 
       if(PanelMode) ri += vectorSize*offset;
 
-      for(; i + vectorSize < depth; i+=vectorSize)
+      for(; i + vectorSize <= depth; i+=vectorSize)
       {
         PacketBlock<Packet, 4> block;
 
@@ -446,10 +446,10 @@
           cblock.packet[7] = lhs.template loadPacket<PacketC>(j + 3, i + 2);
         }
 
-        block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[4].v, p16uc_GETREAL32);
-        block.packet[1] = vec_perm(cblock.packet[1].v , cblock.packet[5].v, p16uc_GETREAL32);
-        block.packet[2] = vec_perm(cblock.packet[2].v , cblock.packet[6].v, p16uc_GETREAL32);
-        block.packet[3] = vec_perm(cblock.packet[3].v , cblock.packet[7].v, p16uc_GETREAL32);
+        block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
+        block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
+        block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
+        block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
 
         if(StorageOrder == RowMajor) ptranspose(block);
 
@@ -475,7 +475,7 @@
 
       if(PanelMode) ri += vectorSize*offset;
 
-      for(; i + vectorSize < depth; i+=vectorSize)
+      for(; i + vectorSize <= depth; i+=vectorSize)
       {
         PacketBlock<PacketC, 8> cblock;
         if(StorageOrder == ColMajor)
@@ -502,10 +502,10 @@
         }
 
         PacketBlock<Packet, 4> block;
-        block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[4].v, p16uc_GETIMAG32);
-        block.packet[1] = vec_perm(cblock.packet[1].v , cblock.packet[5].v, p16uc_GETIMAG32);
-        block.packet[2] = vec_perm(cblock.packet[2].v , cblock.packet[6].v, p16uc_GETIMAG32);
-        block.packet[3] = vec_perm(cblock.packet[3].v , cblock.packet[7].v, p16uc_GETIMAG32);
+        block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
+        block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
+        block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
+        block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
 
         if(Conjugate)
         {
@@ -585,13 +585,13 @@
     const int vectorSize = quad_traits<Scalar>::vectorsize;
     Index ri = 0, j = 0;
 
-    for(j = 0; j + vectorSize < rows; j+=vectorSize)
+    for(j = 0; j + vectorSize <= rows; j+=vectorSize)
     {
       Index i = 0;
 
       if(PanelMode) ri += vectorSize*offset;
 
-      for(; i + vectorSize < depth; i+=vectorSize)
+      for(; i + vectorSize <= depth; i+=vectorSize)
       {
         PacketBlock<Packet, 4> block;
 
@@ -637,13 +637,16 @@
 
     if(PanelMode) ri += offset*(rows - j);
 
-    for(Index i = 0; i < depth; i++)
+    if (j < rows)
     {
-      Index k = j;
-      for(; k < rows; k++)
+      for(Index i = 0; i < depth; i++)
       {
-        blockA[ri] = lhs(k, i);
-        ri += 1;
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          blockA[ri] = lhs(k, i);
+          ri += 1;
+        }
       }
     }
 
@@ -659,16 +662,16 @@
   {
     const int vectorSize = quad_traits<Scalar>::vectorsize;
     Scalar *blockBt = reinterpret_cast<Scalar *>(blockB);
-    Packet conj = pset1<Packet>((Scalar)-1.0f);
+    Packet conj = pset1<Packet>((Scalar)-1.0);
 
     Index ri = 0, j = 0;
-    for(; j + vectorSize < cols; j+=vectorSize)
+    for(; j + vectorSize <= cols; j+=vectorSize)
     {
         Index i = 0;
 
         if(PanelMode) ri += offset*vectorSize;
 
-        for(; i + vectorSize < depth; i+=vectorSize)
+        for(; i + vectorSize <= depth; i+=vectorSize)
         {
             PacketBlock<PacketC, 8> cblock;
             if(StorageOrder == ColMajor)
@@ -695,10 +698,10 @@
             }
 
             PacketBlock<Packet, 4> block;
-            block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[4].v, p16uc_GETREAL32);
-            block.packet[1] = vec_perm(cblock.packet[1].v , cblock.packet[5].v, p16uc_GETREAL32);
-            block.packet[2] = vec_perm(cblock.packet[2].v , cblock.packet[6].v, p16uc_GETREAL32);
-            block.packet[3] = vec_perm(cblock.packet[3].v , cblock.packet[7].v, p16uc_GETREAL32);
+            block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
+            block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
+            block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
+            block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
 
             if(StorageOrder == ColMajor) ptranspose(block);
 
@@ -724,7 +727,7 @@
 
         if(PanelMode) ri += offset*vectorSize;
 
-        for(; i + vectorSize < depth; i+=vectorSize)
+        for(; i + vectorSize <= depth; i+=vectorSize)
         {
           PacketBlock<PacketC, 8> cblock;
           if(StorageOrder == ColMajor)
@@ -752,10 +755,10 @@
           }
 
           PacketBlock<Packet, 4> block;
-          block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[4].v, p16uc_GETIMAG32);
-          block.packet[1] = vec_perm(cblock.packet[1].v , cblock.packet[5].v, p16uc_GETIMAG32);
-          block.packet[2] = vec_perm(cblock.packet[2].v , cblock.packet[6].v, p16uc_GETIMAG32);
-          block.packet[3] = vec_perm(cblock.packet[3].v , cblock.packet[7].v, p16uc_GETIMAG32);
+          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
+          block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
+          block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
+          block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
 
           if(Conjugate)
           {
@@ -832,13 +835,14 @@
   {
     const int vectorSize = quad_traits<Scalar>::vectorsize;
     Index ri = 0, j = 0;
-    for(; j + vectorSize < cols; j+=vectorSize)
+
+    for(; j + vectorSize <= cols; j+=vectorSize)
     {
       Index i = 0;
 
       if(PanelMode) ri += offset*vectorSize;
 
-      for(; i + vectorSize < depth; i+=vectorSize)
+      for(; i + vectorSize <= depth; i+=vectorSize)
       {
         PacketBlock<Packet, 4> block;
         if(StorageOrder == ColMajor)
@@ -883,13 +887,16 @@
 
     if(PanelMode) ri += offset*(cols - j);
 
-    for(Index i = 0; i < depth; i++)
+    if (j < cols)
     {
-      Index k = j;
-      for(; k < cols; k++)
+      for(Index i = 0; i < depth; i++)
       {
-        blockB[ri] = rhs(i, k);
-        ri += 1;
+        Index k = j;
+        for(; k < cols; k++)
+        {
+          blockB[ri] = rhs(i, k);
+          ri += 1;
+        }
       }
     }
     if(PanelMode) ri += (cols - j)*(stride - offset - depth);
@@ -905,13 +912,13 @@
     const int vectorSize = quad_traits<double>::vectorsize;
     Index ri = 0, j = 0;
 
-    for(j = 0; j + vectorSize < rows; j+=vectorSize)
+    for(j = 0; j + vectorSize <= rows; j+=vectorSize)
     {
       Index i = 0;
 
       if(PanelMode) ri += vectorSize*offset;
 
-      for(; i + vectorSize < depth; i+=vectorSize)
+      for(; i + vectorSize <= depth; i+=vectorSize)
       {
         PacketBlock<Packet2d, 2> block;
         if(StorageOrder == RowMajor)
@@ -970,12 +977,12 @@
   {
     const int vectorSize = quad_traits<double>::vectorsize;
     Index ri = 0, j = 0;
-    for(; j + 2*vectorSize < cols; j+=2*vectorSize)
+    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
     {
       Index i = 0;
 
       if(PanelMode) ri += offset*(2*vectorSize);
-      for(; i + vectorSize < depth; i+=vectorSize)
+      for(; i + vectorSize <= depth; i+=vectorSize)
       {
         PacketBlock<Packet2d, 4> block;
         if(StorageOrder == ColMajor)
@@ -1059,13 +1066,13 @@
     double *blockAt  = reinterpret_cast<double *>(blockA);
     Packet conj = pset1<Packet>(-1.0);
 
-    for(j = 0; j + vectorSize < rows; j+=vectorSize)
+    for(j = 0; j + vectorSize <= rows; j+=vectorSize)
     {
       Index i = 0;
 
       if(PanelMode) ri += vectorSize*offset;
 
-      for(; i + vectorSize < depth; i+=vectorSize)
+      for(; i + vectorSize <= depth; i+=vectorSize)
       {
         PacketBlock<Packet, 2> block;
 
@@ -1078,8 +1085,8 @@
           cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0); //[a2 a2i]
           cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
 
-          block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
-          block.packet[1] = vec_perm(cblock.packet[1].v , cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
+          block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
         } else {
           cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i); //[a1 a1i]
           cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i); //[a2 a2i]
@@ -1087,8 +1094,8 @@
           cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1); //[b1 b1i]
           cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
 
-          block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
-          block.packet[1] = vec_perm(cblock.packet[2].v , cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
+          block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
         }
 
         pstore<double>(blockAt + ri     , block.packet[0]);
@@ -1108,7 +1115,7 @@
 
       if(PanelMode) ri += vectorSize*offset;
 
-      for(; i + vectorSize < depth; i+=vectorSize)
+      for(; i + vectorSize <= depth; i+=vectorSize)
       {
         PacketBlock<Packet, 2> block;
 
@@ -1121,8 +1128,8 @@
           cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0);
           cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
 
-          block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[2].v, p16uc_GETIMAG64);
-          block.packet[1] = vec_perm(cblock.packet[1].v , cblock.packet[3].v, p16uc_GETIMAG64);
+          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
+          block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
         } else {
           cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
           cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
@@ -1130,8 +1137,8 @@
           cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1);
           cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
 
-          block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[1].v, p16uc_GETIMAG64);
-          block.packet[1] = vec_perm(cblock.packet[2].v , cblock.packet[3].v, p16uc_GETIMAG64);
+          block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+          block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
         }
 
         if(Conjugate)
@@ -1205,7 +1212,7 @@
     Packet conj = pset1<Packet>(-1.0);
 
     Index ri = 0, j = 0;
-    for(; j + 2*vectorSize < cols; j+=2*vectorSize)
+    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
     {
       Index i = 0;
 
@@ -1221,8 +1228,8 @@
         cblock.packet[2] = rhs.template loadPacket<PacketC>(i, j + 2);
         cblock.packet[3] = rhs.template loadPacket<PacketC>(i, j + 3);
 
-        block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[1].v, p16uc_GETREAL64);
-        block.packet[1] = vec_perm(cblock.packet[2].v , cblock.packet[3].v, p16uc_GETREAL64);
+        block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
+        block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
 
         pstore<double>(blockBt + ri    , block.packet[0]);
         pstore<double>(blockBt + ri + 2, block.packet[1]);
@@ -1246,8 +1253,8 @@
         cblock.packet[2] = rhs.template loadPacket<PacketC>(i, j + 2); //[c1 c1i]
         cblock.packet[3] = rhs.template loadPacket<PacketC>(i, j + 3); //[d1 d1i]
 
-        block.packet[0] = vec_perm(cblock.packet[0].v , cblock.packet[1].v, p16uc_GETIMAG64);
-        block.packet[1] = vec_perm(cblock.packet[2].v , cblock.packet[3].v, p16uc_GETIMAG64);
+        block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+        block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
 
         if(Conjugate)
         {
@@ -1300,25 +1307,84 @@
 
 // 512-bit rank-1 update of acc. It can accumulate either positively or negatively (useful for complex GEMM).
 template<typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_STRONG_INLINE void pger(PacketBlock<Packet, 4> *acc, const Scalar* lhs, const Scalar* rhs)
+EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,4>* acc, const Scalar* lhs, const Packet* rhsV)
 {
-  Packet lhsV = *((Packet *) lhs);
-  Packet rhsV1 = pset1<Packet>(rhs[0]);
-  Packet rhsV2 = pset1<Packet>(rhs[1]);
-  Packet rhsV3 = pset1<Packet>(rhs[2]);
-  Packet rhsV4 = pset1<Packet>(rhs[3]);
+  asm("#pger begin");
+  Packet lhsV = pload<Packet>(lhs);
 
   if(NegativeAccumulate)
   {
-    acc->packet[0] -= lhsV*rhsV1;
-    acc->packet[1] -= lhsV*rhsV2;
-    acc->packet[2] -= lhsV*rhsV3;
-    acc->packet[3] -= lhsV*rhsV4;
+    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+    acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
+    acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
+    acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
   } else {
-    acc->packet[0] += lhsV*rhsV1;
-    acc->packet[1] += lhsV*rhsV2;
-    acc->packet[2] += lhsV*rhsV3;
-    acc->packet[3] += lhsV*rhsV4;
+    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
+    acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
+    acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
+    acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
+  }
+  asm("#pger end");
+}
+
+template<typename Scalar, typename Packet, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,1>* acc, const Scalar* lhs, const Packet* rhsV)
+{
+  Packet lhsV = pload<Packet>(lhs);
+
+  if(NegativeAccumulate)
+  {
+    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+  } else {
+    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
+  }
+}
+
+template<typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,4>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+{
+#ifdef _ARCH_PWR9
+  Packet lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
+#else
+  Packet lhsV;
+  Index i = 0;
+  do {
+    lhsV[i] = lhs[i];
+  } while (++i < remaining_rows);
+#endif
+
+  if(NegativeAccumulate)
+  {
+    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+    acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
+    acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
+    acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
+  } else {
+    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
+    acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
+    acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
+    acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
+  }
+}
+
+template<typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
+EIGEN_STRONG_INLINE void pger(PacketBlock<Packet,1>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+{
+#ifdef _ARCH_PWR9
+  Packet lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
+#else
+  Packet lhsV;
+  Index i = 0;
+  do {
+    lhsV[i] = lhs[i];
+  } while (++i < remaining_rows);
+#endif
+
+  if(NegativeAccumulate)
+  {
+    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+  } else {
+    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
   }
 }
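
For reference, scalar analogues of the AltiVec intrinsics the new pger overloads use (illustrative names, not Eigen API):

  // vec_madd(a, b, c)  == a*b + c   (fused multiply-add)
  inline double madd(double a, double b, double c)  { return a * b + c; }
  // vec_nmsub(a, b, c) == c - a*b   (negated multiply-subtract)
  inline double nmsub(double a, double b, double c) { return c - a * b; }

So the NegativeAccumulate branch computes acc -= lhsV*rhsV[i] and the other acc += lhsV*rhsV[i], each as a single fused operation per packet instead of the previous separate multiply and add.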
 
@@ -1399,7 +1465,7 @@
 template<typename Scalar, typename Packet>
 EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar *lhs)
 {
-    return *((Packet *)lhs);
+  return *((Packet *)lhs);
 }
 
 // Zero the accumulator on PacketBlock.
@@ -1412,14 +1478,26 @@
   acc.packet[3] = pset1<Packet>((Scalar)0);
 }
 
+template<typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE void bsetzero(PacketBlock<Packet,1>& acc)
+{
+  acc.packet[0] = pset1<Packet>((Scalar)0);
+}
+
 // Scale the PacketBlock vectors by alpha.
 template<typename Packet>
 EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
 {
-  acc.packet[0] = pmadd(pAlpha,accZ.packet[0], acc.packet[0]);
-  acc.packet[1] = pmadd(pAlpha,accZ.packet[1], acc.packet[1]);
-  acc.packet[2] = pmadd(pAlpha,accZ.packet[2], acc.packet[2]);
-  acc.packet[3] = pmadd(pAlpha,accZ.packet[3], acc.packet[3]);
+  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
+  acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
+  acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
+  acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
+{
+  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
 }
 
 // Complex version of PacketBlock scaling.
@@ -1471,16 +1549,453 @@
   acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
 }
 
+const static Packet4i mask41 = { -1,  0,  0,  0 };
+const static Packet4i mask42 = { -1, -1,  0,  0 };
+const static Packet4i mask43 = { -1, -1, -1,  0 };
+
+const static Packet2l mask21 = { -1, 0 };
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows)
+{
+  if (remaining_rows == 0) {
+    return pset1<Packet>(float(0.0));
+  } else {
+    switch (remaining_rows) {
+      case 1:  return Packet(mask41);
+      case 2:  return Packet(mask42);
+      default: return Packet(mask43);
+    }
+  }
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
+{
+  if (remaining_rows == 0) {
+    return pset1<Packet2d>(double(0.0));
+  } else {
+    return Packet2d(mask21);
+  }
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha, const Packet& pMask)
+{
+  acc.packet[0] = pmadd(pAlpha, pand(accZ.packet[0], pMask), acc.packet[0]);
+  acc.packet[1] = pmadd(pAlpha, pand(accZ.packet[1], pMask), acc.packet[1]);
+  acc.packet[2] = pmadd(pAlpha, pand(accZ.packet[2], pMask), acc.packet[2]);
+  acc.packet[3] = pmadd(pAlpha, pand(accZ.packet[3], pMask), acc.packet[3]);
+}
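
How the masks act (a reading of the code above): -1 is all-ones in two's complement, so pand with the mask keeps exactly the first remaining_rows lanes and zeroes the rest. For example, with remaining_rows == 2 on a 4-lane packet:

  { x0, x1, x2, x3 } & { -1, -1, 0, 0 }  ==  { x0, x1, 0, 0 }

which lets the masked bscale update a full packet while only the valid rows receive the alpha * accZ contribution.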
 
 // PEEL loop factor.
 #define PEEL 10
 
+template<typename Scalar, typename Packet, typename Index>
+EIGEN_STRONG_INLINE void MICRO_EXTRA_COL(
+  const Scalar* &lhs_ptr,
+  const Scalar* &rhs_ptr,
+  PacketBlock<Packet,1> &accZero,
+  Index remaining_rows,
+  Index remaining_cols)
+{
+  Packet rhsV[1];
+  rhsV[0] = pset1<Packet>(rhs_ptr[0]);
+  pger<Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
+  lhs_ptr += remaining_rows;
+  rhs_ptr += remaining_cols;
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
+EIGEN_STRONG_INLINE void gemm_extra_col(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index remaining_rows,
+  Index remaining_cols,
+  const Packet& pAlpha)
+{
+  const Scalar *rhs_ptr = rhs_base;
+  const Scalar *lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
+  PacketBlock<Packet,1> accZero, acc;
+
+  bsetzero<Scalar, Packet>(accZero);
+
+  Index remaining_depth = (depth & -accRows);
+  Index k = 0;
+  for(; k + PEEL <= remaining_depth; k+= PEEL)
+  {
+    prefetch(rhs_ptr);
+    prefetch(lhs_ptr);
+    for (int l = 0; l < PEEL; l++) {
+      MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
+    }
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
+  }
+  for(; k < depth; k++)
+  {
+    Packet rhsV[1];
+    rhsV[0] = pset1<Packet>(rhs_ptr[0]);
+    pger<Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
+    lhs_ptr += remaining_rows;
+    rhs_ptr += remaining_cols;
+  }
+
+  acc.packet[0] = vec_mul(pAlpha, accZero.packet[0]);
+  for(Index i = 0; i < remaining_rows; i++){
+    res(row + i, col) += acc.packet[0][i];
+  }
+}
+
+template<typename Scalar, typename Packet, typename Index, const Index accRows>
+EIGEN_STRONG_INLINE void MICRO_EXTRA_ROW(
+  const Scalar* &lhs_ptr,
+  const Scalar* &rhs_ptr,
+  PacketBlock<Packet,4> &accZero,
+  Index remaining_rows)
+{
+  Packet rhsV[4];
+  pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+  pger<Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
+  lhs_ptr += remaining_rows;
+  rhs_ptr += accRows;
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
+EIGEN_STRONG_INLINE void gemm_extra_row(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  const Scalar *rhs_ptr = rhs_base;
+  const Scalar *lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
+  PacketBlock<Packet,4> accZero, acc;
+
+  bsetzero<Scalar, Packet>(accZero);
+
+  Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
+  Index k = 0;
+  for(; k + PEEL <= remaining_depth; k+= PEEL)
+  {
+    prefetch(rhs_ptr);
+    prefetch(lhs_ptr);
+    for (int l = 0; l < PEEL; l++) {
+      MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
+    }
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
+  }
+
+  if (remaining_depth == depth)
+  {
+    for(Index j = 0; j < 4; j++){
+      acc.packet[j] = res.template loadPacket<Packet>(row, col + j);
+    }
+    bscale(acc, accZero, pAlpha, pMask);
+    res.template storePacketBlock<Packet, 4>(row, col, acc);
+  } else {
+    for(; k < depth; k++)
+    {
+      Packet rhsV[4];
+      pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+      pger<Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
+      lhs_ptr += remaining_rows;
+      rhs_ptr += accRows;
+    }
+
+    for(Index j = 0; j < 4; j++){
+      acc.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
+    }
+    for(Index j = 0; j < 4; j++){
+      for(Index i = 0; i < remaining_rows; i++){
+        res(row + i, col + j) += acc.packet[j][i];
+      }
+    }
+  }
+}
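
A note on the tail handling above: when remaining_depth == depth, the whole depth was consumed by full-width pger calls, so the result is written back as masked packets (bscale with pMask); otherwise the final k-loop switches to the remaining_rows-aware pger overload and the surviving lanes are accumulated into res scalar by scalar.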
+
+#define MICRO_DST \
+  PacketBlock<Packet,4> *accZero0, PacketBlock<Packet,4> *accZero1, PacketBlock<Packet,4> *accZero2, \
+  PacketBlock<Packet,4> *accZero3, PacketBlock<Packet,4> *accZero4, PacketBlock<Packet,4> *accZero5, \
+  PacketBlock<Packet,4> *accZero6, PacketBlock<Packet,4> *accZero7
+
+#define MICRO_COL_DST \
+  PacketBlock<Packet,1> *accZero0, PacketBlock<Packet,1> *accZero1, PacketBlock<Packet,1> *accZero2, \
+  PacketBlock<Packet,1> *accZero3, PacketBlock<Packet,1> *accZero4, PacketBlock<Packet,1> *accZero5, \
+  PacketBlock<Packet,1> *accZero6, PacketBlock<Packet,1> *accZero7
+
+#define MICRO_SRC \
+  const Scalar **lhs_ptr0, const Scalar **lhs_ptr1, const Scalar **lhs_ptr2, \
+  const Scalar **lhs_ptr3, const Scalar **lhs_ptr4, const Scalar **lhs_ptr5, \
+  const Scalar **lhs_ptr6, const Scalar **lhs_ptr7
+
+#define MICRO_ONE \
+  MICRO<unroll_factor, Scalar, Packet, accRows, accCols>(\
+    &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \
+    rhs_ptr, \
+    &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7);
+
+#define MICRO_COL_ONE \
+  MICRO_COL<unroll_factor, Scalar, Packet, Index, accCols>(\
+    &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \
+    rhs_ptr, \
+    &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7, \
+    remaining_cols);
+
+#define MICRO_WORK_ONE(iter) \
+  if (N > iter) { \
+    pger<Scalar, Packet, false>(accZero##iter, *lhs_ptr##iter, rhsV); \
+    *lhs_ptr##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+  }
+
+#define MICRO_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_WORK MICRO_UNROLL(MICRO_WORK_ONE)
+
+#define MICRO_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter){ \
+    bsetzero<Scalar, Packet>(accZero##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+  }
+
+#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)
+
+#define MICRO_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+  }
+
+#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)
+
+#define MICRO_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter){ \
+    prefetch(lhs_ptr##iter); \
+  }
+
+#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
+
+#define MICRO_STORE_ONE(iter) \
+  if (unroll_factor > iter){ \
+    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
+    acc.packet[1] = res.template loadPacket<Packet>(row + iter*accCols, col + 1); \
+    acc.packet[2] = res.template loadPacket<Packet>(row + iter*accCols, col + 2); \
+    acc.packet[3] = res.template loadPacket<Packet>(row + iter*accCols, col + 3); \
+    bscale(acc, accZero##iter, pAlpha); \
+    res.template storePacketBlock<Packet, 4>(row + iter*accCols, col, acc); \
+  }
+
+#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
+
+#define MICRO_COL_STORE_ONE(iter) \
+  if (unroll_factor > iter){ \
+    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
+    bscale(acc, accZero##iter, pAlpha); \
+    res.template storePacketBlock<Packet, 1>(row + iter*accCols, col, acc); \
+  }
+
+#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE)
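
The MICRO_* machinery above relies on the compiler folding the constant guards. An illustrative expansion (assumption: unroll_factor == 2): MICRO_UNROLL(MICRO_DST_PTR_ONE) emits eight guarded statements, of which only

  bsetzero<Scalar, Packet>(accZero0);
  bsetzero<Scalar, Packet>(accZero1);

survive, since unroll_factor is a compile-time template argument; the remaining accumulators are marked with EIGEN_UNUSED_VARIABLE to silence warnings.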
+
+template<int N, typename Scalar, typename Packet, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void MICRO(
+  MICRO_SRC,
+  const Scalar* &rhs_ptr,
+  MICRO_DST)
+  {
+    Packet rhsV[4];
+    pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+    asm("#unrolled pger? begin");
+    MICRO_WORK
+    asm("#unrolled pger? end");
+    rhs_ptr += accRows;
+  }
+
+template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index col,
+  const Packet& pAlpha)
+{
+  const Scalar *rhs_ptr = rhs_base;
+  const Scalar *lhs_ptr0, *lhs_ptr1, *lhs_ptr2, *lhs_ptr3, *lhs_ptr4, *lhs_ptr5, *lhs_ptr6, *lhs_ptr7;
+  PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+  PacketBlock<Packet,4> acc;
+
+  asm("#unrolled start");
+  MICRO_SRC_PTR
+  asm("#unrolled zero?");
+  MICRO_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL <= depth; k+= PEEL)
+  {
+    prefetch(rhs_ptr);
+    MICRO_PREFETCH
+    asm("#unrolled inner loop?");
+    for (int l = 0; l < PEEL; l++) {
+      MICRO_ONE
+    }
+    asm("#unrolled inner loop end?");
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_ONE
+  }
+  MICRO_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<int N, typename Scalar, typename Packet, typename Index, const Index accCols>
+EIGEN_STRONG_INLINE void MICRO_COL(
+  MICRO_SRC,
+  const Scalar* &rhs_ptr,
+  MICRO_COL_DST,
+  Index remaining_rows)
+  {
+    Packet rhsV[1];
+    rhsV[0] = pset1<Packet>(rhs_ptr[0]);
+    asm("#unrolled pger? begin");
+    MICRO_WORK
+    asm("#unrolled pger? end");
+    rhs_ptr += remaining_rows;
+  }
+
+template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlpha)
+{
+  const Scalar *rhs_ptr = rhs_base;
+  const Scalar *lhs_ptr0, *lhs_ptr1, *lhs_ptr2, *lhs_ptr3, *lhs_ptr4, *lhs_ptr5, *lhs_ptr6, *lhs_ptr7;
+  PacketBlock<Packet,1> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+  PacketBlock<Packet,1> acc;
+
+  MICRO_SRC_PTR
+  MICRO_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL <= depth; k+= PEEL)
+  {
+    prefetch(rhs_ptr);
+    MICRO_PREFETCH
+    for (int l = 0; l < PEEL; l++) {
+      MICRO_COL_ONE
+    }
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COL_ONE
+  }
+  MICRO_COL_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_col(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index rows,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlpha)
+{
+#define MAX_UNROLL 6
+  while(row + MAX_UNROLL*accCols <= rows){
+    gemm_unrolled_col_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+  }
+  switch( (rows-row)/accCols ){
+#if MAX_UNROLL > 7
+    case 7:
+      gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 6
+    case 6:
+      gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 5
+    case 5:
+      gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 4
+    case 4:
+      gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 3
+    case 3:
+      gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 2
+    case 2:
+      gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 1
+    case 1:
+      gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_UNROLL
+}
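
An illustrative trace of the unroll dispatch (assumed values: accCols == 4, rows == 30): the while loop consumes one 6x-unrolled iteration covering rows 0..23; the switch then sees (30 - 24) / 4 == 1 and runs a single-block iteration for rows 24..27; the final two rows fall through to the caller's remaining_rows handling.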
+
 /****************
  * GEMM kernels *
  * **************/
-template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper>
-EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
-          Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, const int accRows, const int accCols)
+template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
       const Index remaining_rows = rows % accCols;
       const Index remaining_cols = cols % accRows;
@@ -1489,516 +2004,91 @@
       if( strideB == -1 ) strideB = depth;
 
       const Packet pAlpha = pset1<Packet>(alpha);
+      const Packet pMask  = bmask<Packet>((const int)(remaining_rows));
+
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        const Scalar *rhs_base = blockB + ( col/accRows     )*strideB*accRows;
+        const Scalar *rhs_base = blockB + col*strideB + accRows*offsetB;
         const Scalar *lhs_base = blockA;
-
         Index row = 0;
-        for(; row + 6*accCols <= rows; row += 6*accCols)
-        {
-#define MICRO() \
-            pger<Scalar, Packet, false>(&accZero1, lhs_ptr1, rhs_ptr); \
-            lhs_ptr1 += accCols; \
-            pger<Scalar, Packet, false>(&accZero2, lhs_ptr2, rhs_ptr); \
-            lhs_ptr2 += accCols; \
-            pger<Scalar, Packet, false>(&accZero3, lhs_ptr3, rhs_ptr); \
-            lhs_ptr3 += accCols; \
-            pger<Scalar, Packet, false>(&accZero4, lhs_ptr4, rhs_ptr); \
-            lhs_ptr4 += accCols; \
-            pger<Scalar, Packet, false>(&accZero5, lhs_ptr5, rhs_ptr); \
-            lhs_ptr5 += accCols; \
-            pger<Scalar, Packet, false>(&accZero6, lhs_ptr6, rhs_ptr); \
-            lhs_ptr6 += accCols; \
-            rhs_ptr += accRows;
 
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr1 = lhs_base + (row/accCols)*strideA*accCols;
-          const Scalar *lhs_ptr2 = lhs_base + ((row/accCols) + 1)*strideA*accCols;
-          const Scalar *lhs_ptr3 = lhs_base + ((row/accCols) + 2)*strideA*accCols;
-          const Scalar *lhs_ptr4 = lhs_base + ((row/accCols) + 3)*strideA*accCols;
-          const Scalar *lhs_ptr5 = lhs_base + ((row/accCols) + 4)*strideA*accCols;
-          const Scalar *lhs_ptr6 = lhs_base + ((row/accCols) + 5)*strideA*accCols;
-
-          PacketBlock<Packet,4> acc1, accZero1;
-          PacketBlock<Packet,4> acc2, accZero2;
-          PacketBlock<Packet,4> acc3, accZero3;
-          PacketBlock<Packet,4> acc4, accZero4;
-          PacketBlock<Packet,4> acc5, accZero5;
-          PacketBlock<Packet,4> acc6, accZero6;
-
-          bload<DataMapper, Packet, Index, 0>(acc1, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero1);
-          bload<DataMapper, Packet, Index, 1>(acc2, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero2);
-          bload<DataMapper, Packet, Index, 2>(acc3, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero3);
-          bload<DataMapper, Packet, Index, 3>(acc4, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero4);
-          bload<DataMapper, Packet, Index, 4>(acc5, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero5);
-          bload<DataMapper, Packet, Index, 5>(acc6, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero6);
-
-          lhs_ptr1 += accCols*offsetA;
-          lhs_ptr2 += accCols*offsetA;
-          lhs_ptr3 += accCols*offsetA;
-          lhs_ptr4 += accCols*offsetA;
-          lhs_ptr5 += accCols*offsetA;
-          lhs_ptr6 += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-
-          Index k = 0;
-          for(; k + PEEL < depth; k+= PEEL)
-          {
-            prefetch(rhs_ptr);
-            prefetch(lhs_ptr1);
-            prefetch(lhs_ptr2);
-            prefetch(lhs_ptr3);
-            prefetch(lhs_ptr4);
-            prefetch(lhs_ptr5);
-            prefetch(lhs_ptr6);
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-#if PEEL > 8
-            MICRO();
-            MICRO();
-#endif
-          }
-          for(; k < depth; k++)
-          {
-            MICRO();
-          }
-
-          bscale<Packet>(acc1,accZero1, pAlpha);
-          bscale<Packet>(acc2,accZero2, pAlpha);
-          bscale<Packet>(acc3,accZero3, pAlpha);
-          bscale<Packet>(acc4,accZero4, pAlpha);
-          bscale<Packet>(acc5,accZero5, pAlpha);
-          bscale<Packet>(acc6,accZero6, pAlpha);
-
-          res.template storePacketBlock<Packet, 4>(row + 0*accCols, col, acc1);
-          res.template storePacketBlock<Packet, 4>(row + 1*accCols, col, acc2);
-          res.template storePacketBlock<Packet, 4>(row + 2*accCols, col, acc3);
-          res.template storePacketBlock<Packet, 4>(row + 3*accCols, col, acc4);
-          res.template storePacketBlock<Packet, 4>(row + 4*accCols, col, acc5);
-          res.template storePacketBlock<Packet, 4>(row + 5*accCols, col, acc6);
-#undef MICRO
+        asm("#jump table");
+#define MAX_UNROLL 6
+        while(row + MAX_UNROLL*accCols <= rows){
+          gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
         }
-        for(; row + 5*accCols <= rows; row += 5*accCols)
-        {
-#define MICRO() \
-            pger<Scalar, Packet, false>(&accZero1, lhs_ptr1, rhs_ptr); \
-            lhs_ptr1 += accCols; \
-            pger<Scalar, Packet, false>(&accZero2, lhs_ptr2, rhs_ptr); \
-            lhs_ptr2 += accCols; \
-            pger<Scalar, Packet, false>(&accZero3, lhs_ptr3, rhs_ptr); \
-            lhs_ptr3 += accCols; \
-            pger<Scalar, Packet, false>(&accZero4, lhs_ptr4, rhs_ptr); \
-            lhs_ptr4 += accCols; \
-            pger<Scalar, Packet, false>(&accZero5, lhs_ptr5, rhs_ptr); \
-            lhs_ptr5 += accCols; \
-            rhs_ptr += accRows;
-
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr1 = lhs_base + (row/accCols      )*strideA*accCols;
-          const Scalar *lhs_ptr2 = lhs_base + ((row/accCols) + 1)*strideA*accCols;
-          const Scalar *lhs_ptr3 = lhs_base + ((row/accCols) + 2)*strideA*accCols;
-          const Scalar *lhs_ptr4 = lhs_base + ((row/accCols) + 3)*strideA*accCols;
-          const Scalar *lhs_ptr5 = lhs_base + ((row/accCols) + 4)*strideA*accCols;
-
-          PacketBlock<Packet,4> acc1, accZero1;
-          PacketBlock<Packet,4> acc2, accZero2;
-          PacketBlock<Packet,4> acc3, accZero3;
-          PacketBlock<Packet,4> acc4, accZero4;
-          PacketBlock<Packet,4> acc5, accZero5;
-
-          bload<DataMapper, Packet, Index, 0>(acc1, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero1);
-          bload<DataMapper, Packet, Index, 1>(acc2, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero2);
-          bload<DataMapper, Packet, Index, 2>(acc3, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero3);
-          bload<DataMapper, Packet, Index, 3>(acc4, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero4);
-          bload<DataMapper, Packet, Index, 4>(acc5, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero5);
-
-          lhs_ptr1 += accCols*offsetA;
-          lhs_ptr2 += accCols*offsetA;
-          lhs_ptr3 += accCols*offsetA;
-          lhs_ptr4 += accCols*offsetA;
-          lhs_ptr5 += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-          Index k = 0;
-
-          for(; k + PEEL < depth; k+= PEEL)
-          {
-            prefetch(rhs_ptr);
-            prefetch(lhs_ptr1);
-            prefetch(lhs_ptr2);
-            prefetch(lhs_ptr3);
-            prefetch(lhs_ptr4);
-            prefetch(lhs_ptr5);
-
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-#if PEEL > 8
-            MICRO();
-            MICRO();
+        switch( (rows-row)/accCols ){
+#if MAX_UNROLL > 7
+          case 7:
+            gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
 #endif
-          }
-          for(; k < depth; k++)
-          {
-            MICRO();
-          }
-
-          bscale<Packet>(acc1,accZero1, pAlpha);
-          bscale<Packet>(acc2,accZero2, pAlpha);
-          bscale<Packet>(acc3,accZero3, pAlpha);
-          bscale<Packet>(acc4,accZero4, pAlpha);
-          bscale<Packet>(acc5,accZero5, pAlpha);
-
-          res.template storePacketBlock<Packet, 4>(row + 0*accCols, col, acc1);
-          res.template storePacketBlock<Packet, 4>(row + 1*accCols, col, acc2);
-          res.template storePacketBlock<Packet, 4>(row + 2*accCols, col, acc3);
-          res.template storePacketBlock<Packet, 4>(row + 3*accCols, col, acc4);
-          res.template storePacketBlock<Packet, 4>(row + 4*accCols, col, acc5);
-#undef MICRO
-        }
-        for(; row + 4*accCols <= rows; row += 4*accCols)
-        {
-#define MICRO() \
-            pger<Scalar, Packet, false>(&accZero1, lhs_ptr1, rhs_ptr); \
-            lhs_ptr1 += accCols; \
-            pger<Scalar, Packet, false>(&accZero2, lhs_ptr2, rhs_ptr); \
-            lhs_ptr2 += accCols; \
-            pger<Scalar, Packet, false>(&accZero3, lhs_ptr3, rhs_ptr); \
-            lhs_ptr3 += accCols; \
-            pger<Scalar, Packet, false>(&accZero4, lhs_ptr4, rhs_ptr); \
-            lhs_ptr4 += accCols; \
-            rhs_ptr += accRows;
-
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr1 = lhs_base + (row/accCols      )*strideA*accCols;
-          const Scalar *lhs_ptr2 = lhs_base + ((row/accCols) + 1)*strideA*accCols;
-          const Scalar *lhs_ptr3 = lhs_base + ((row/accCols) + 2)*strideA*accCols;
-          const Scalar *lhs_ptr4 = lhs_base + ((row/accCols) + 3)*strideA*accCols;
-
-          PacketBlock<Packet,4> acc1, accZero1;
-          PacketBlock<Packet,4> acc2, accZero2;
-          PacketBlock<Packet,4> acc3, accZero3;
-          PacketBlock<Packet,4> acc4, accZero4;
-
-          bload<DataMapper, Packet, Index, 0>(acc1, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero1);
-          bload<DataMapper, Packet, Index, 1>(acc2, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero2);
-          bload<DataMapper, Packet, Index, 2>(acc3, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero3);
-          bload<DataMapper, Packet, Index, 3>(acc4, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero4);
-
-          lhs_ptr1 += accCols*offsetA;
-          lhs_ptr2 += accCols*offsetA;
-          lhs_ptr3 += accCols*offsetA;
-          lhs_ptr4 += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-          Index k = 0;
-
-          for(; k + PEEL < depth; k+= PEEL)
-          {
-            prefetch(rhs_ptr);
-            prefetch(lhs_ptr1);
-            prefetch(lhs_ptr2);
-            prefetch(lhs_ptr3);
-            prefetch(lhs_ptr4);
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-#if PEEL > 8
-            MICRO();
-            MICRO();
+#if MAX_UNROLL > 6
+          case 6:
+            gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
 #endif
-          }
-          for(; k < depth; k++)
-          {
-            MICRO();
-          }
-
-          bscale<Packet>(acc1,accZero1, pAlpha);
-          bscale<Packet>(acc2,accZero2, pAlpha);
-          bscale<Packet>(acc3,accZero3, pAlpha);
-          bscale<Packet>(acc4,accZero4, pAlpha);
-
-          res.template storePacketBlock<Packet, 4>(row + 0*accCols, col, acc1);
-          res.template storePacketBlock<Packet, 4>(row + 1*accCols, col, acc2);
-          res.template storePacketBlock<Packet, 4>(row + 2*accCols, col, acc3);
-          res.template storePacketBlock<Packet, 4>(row + 3*accCols, col, acc4);
-#undef MICRO
-        }
-        for(; row + 3*accCols <= rows; row += 3*accCols)
-        {
-#define MICRO() \
-            pger<Scalar, Packet, false>(&accZero1, lhs_ptr1, rhs_ptr); \
-            lhs_ptr1 += accCols; \
-            pger<Scalar, Packet, false>(&accZero2, lhs_ptr2, rhs_ptr); \
-            lhs_ptr2 += accCols; \
-            pger<Scalar, Packet, false>(&accZero3, lhs_ptr3, rhs_ptr); \
-            lhs_ptr3 += accCols; \
-            rhs_ptr += accRows;
-
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr1 = lhs_base + (row/accCols      )*strideA*accCols;
-          const Scalar *lhs_ptr2 = lhs_base + ((row/accCols) + 1)*strideA*accCols;
-          const Scalar *lhs_ptr3 = lhs_base + ((row/accCols) + 2)*strideA*accCols;
-
-          PacketBlock<Packet,4> acc1, accZero1;
-          PacketBlock<Packet,4> acc2, accZero2;
-          PacketBlock<Packet,4> acc3, accZero3;
-
-          bload<DataMapper, Packet, Index, 0>(acc1, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero1);
-          bload<DataMapper, Packet, Index, 1>(acc2, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero2);
-          bload<DataMapper, Packet, Index, 2>(acc3, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero3);
-
-          lhs_ptr1 += accCols*offsetA;
-          lhs_ptr2 += accCols*offsetA;
-          lhs_ptr3 += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-          Index k = 0;
-          for(; k + PEEL < depth; k+= PEEL)
-          {
-            prefetch(rhs_ptr);
-            prefetch(lhs_ptr1);
-            prefetch(lhs_ptr2);
-            prefetch(lhs_ptr3);
-
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-#if PEEL > 8
-            MICRO();
-            MICRO();
+#if MAX_UNROLL > 5
+          case 5:
+            gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
 #endif
-          }
-          for(; k < depth; k++)
-          {
-            MICRO();
-          }
-
-          bscale<Packet>(acc1,accZero1, pAlpha);
-          bscale<Packet>(acc2,accZero2, pAlpha);
-          bscale<Packet>(acc3,accZero3, pAlpha);
-
-          res.template storePacketBlock<Packet, 4>(row + 0*accCols, col, acc1);
-          res.template storePacketBlock<Packet, 4>(row + 1*accCols, col, acc2);
-          res.template storePacketBlock<Packet, 4>(row + 2*accCols, col, acc3);
-#undef MICRO
-        }
-        for(; row + 2*accCols <= rows; row += 2*accCols)
-        {
-#define MICRO() \
-            pger<Scalar, Packet, false>(&accZero1, lhs_ptr1, rhs_ptr); \
-            lhs_ptr1 += accCols; \
-            pger<Scalar, Packet, false>(&accZero2, lhs_ptr2, rhs_ptr); \
-            lhs_ptr2 += accCols; \
-            rhs_ptr += accRows;
-
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr1 = lhs_base + (row/accCols      )*strideA*accCols;
-          const Scalar *lhs_ptr2 = lhs_base + ((row/accCols) + 1)*strideA*accCols;
-          
-          PacketBlock<Packet,4> acc1, accZero1;
-          PacketBlock<Packet,4> acc2, accZero2;
-
-          bload<DataMapper, Packet, Index, 0>(acc1, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero1);
-          bload<DataMapper, Packet, Index, 1>(acc2, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero2);
-
-          lhs_ptr1 += accCols*offsetA;
-          lhs_ptr2 += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-          Index k = 0;
-          for(; k + PEEL < depth; k+= PEEL)
-          {
-            prefetch(rhs_ptr);
-            prefetch(lhs_ptr1);
-            prefetch(lhs_ptr2);
-
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-#if PEEL > 8
-            MICRO();
-            MICRO();
+#if MAX_UNROLL > 4
+          case 4:
+            gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
 #endif
-          }
-          for(; k < depth; k++)
-          {
-            MICRO();
-          }
-
-          bscale<Packet>(acc1,accZero1, pAlpha);
-          bscale<Packet>(acc2,accZero2, pAlpha);
-
-          res.template storePacketBlock<Packet, 4>(row + 0*accCols, col, acc1);
-          res.template storePacketBlock<Packet, 4>(row + 1*accCols, col, acc2);
-#undef MICRO
-        }
-
-        for(; row + accCols <= rows; row += accCols)
-        {
-#define MICRO() \
-            pger<Scalar, Packet, false>(&accZero1, lhs_ptr1, rhs_ptr); \
-            lhs_ptr1 += accCols; \
-            rhs_ptr += accRows;
-
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr1 = lhs_base + (row/accCols)*strideA*accCols;
-
-          PacketBlock<Packet,4> acc1, accZero1;
-
-          bload<DataMapper, Packet, Index, 0>(acc1, res, row, col, accCols);
-          bsetzero<Scalar, Packet>(accZero1);
-
-          lhs_ptr1 += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-          Index k = 0;
-          for(; k + PEEL < depth; k+= PEEL)
-          {
-            prefetch(rhs_ptr);
-            prefetch(lhs_ptr1);
-
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-            MICRO();
-#if PEEL > 8
-            MICRO();
-            MICRO();
+#if MAX_UNROLL > 3
+          case 3:
+            gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
 #endif
-          }
-          for(; k < depth; k++)
-          {
-            MICRO();
-          }
-
-          bscale<Packet>(acc1,accZero1, pAlpha);
-
-          res.template storePacketBlock<Packet, 4>(row, col, acc1);
-#undef MICRO
+#if MAX_UNROLL > 2
+          case 2:
+            gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_UNROLL > 1
+          case 1:
+            gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+          default:
+            break;
         }
+#undef MAX_UNROLL
+        asm("#jump table end");
+
         if(remaining_rows > 0)
         {
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
-
-          lhs_ptr += remaining_rows*offsetA;
-          rhs_ptr += accRows*offsetB;
-          for(Index k = 0; k < depth; k++)
-          {
-              for(Index arow = 0; arow < remaining_rows; arow++)
-              {
-                  for(Index acol = 0; acol < accRows; acol++ )
-                  {
-                    res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
-                  }
-              }
-              rhs_ptr += accRows;
-              lhs_ptr += remaining_rows;
-          }
+          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, cols, remaining_rows, pAlpha, pMask);
         }
     }
 
     if(remaining_cols > 0)
     {
-      const Scalar *rhs_base = blockB + (col/accRows)*strideB*accRows;
+      const Scalar *rhs_base = blockB + col*strideB + remaining_cols*offsetB;
       const Scalar *lhs_base = blockA;
 
-      Index row = 0;
-      for(; row + accCols <= rows; row += accCols)
+      for(; col < cols; col++)
       {
-        const Scalar *rhs_ptr = rhs_base;
-        const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
+        Index row = 0;
 
-        lhs_ptr += accCols*offsetA;
-        rhs_ptr += remaining_cols*offsetB;
-        for(Index k = 0; k < depth; k++)
-        {
-          for(Index arow = 0; arow < accCols; arow++)
-          {
-            for(Index acol = 0; acol < remaining_cols; acol++ )
-            {
-              res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
-            }
-          }
-          rhs_ptr += remaining_cols;
-          lhs_ptr += accCols;
-        }
-      }
-      
-      if(remaining_rows > 0 )
-      {
-        const Scalar *rhs_ptr  = rhs_base;
-        const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
+        gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
 
-        lhs_ptr += remaining_rows*offsetA;
-        rhs_ptr += remaining_cols*offsetB;
-        for(Index k = 0; k < depth; k++)
+        if (remaining_rows > 0)
         {
-            for(Index arow = 0; arow < remaining_rows; arow++)
-            {
-                for(Index acol = 0; acol < remaining_cols; acol++ )
-                {
-                  res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
-                }
-            }
-            rhs_ptr += remaining_cols;
-            lhs_ptr += remaining_rows;
+          gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
         }
+        rhs_base++;
       }
     }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const int accRows, const int accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
-          Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, const int accRows, const int accCols)
+          Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
       const int remaining_rows = rows % accCols;
       const int remaining_cols = cols % accRows;
@@ -2018,7 +2108,7 @@
       const Scalar *blockA = (Scalar *) blockAc;
       const Scalar *blockB = (Scalar *) blockBc;
 
-      Packet conj = pset1<Packet>((Scalar)-1.0f);
+      Packet conj = pset1<Packet>((Scalar)-1.0);
 
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
@@ -2054,7 +2144,7 @@
           if(!RhsIsReal)
             rhs_ptr_imag += accRows*offsetB;
           Index k = 0;
-          for(; k + PEEL < depth; k+=PEEL)
+          for(; k + PEEL <= depth; k+=PEEL)
           {
             prefetch(rhs_ptr);
             prefetch(rhs_ptr_imag);
@@ -2180,8 +2270,8 @@
           {
             for(Index acol = 0; acol < 4; acol++ )
             {
-              scalarAcc[arow][acol].real((Scalar)0.0f);
-              scalarAcc[arow][acol].imag((Scalar)0.0f);
+              scalarAcc[arow][acol].real((Scalar)0.0);
+              scalarAcc[arow][acol].imag((Scalar)0.0);
             }
           }
           for(Index k = 0; k < depth; k++)
@@ -2550,24 +2640,24 @@
                Index rows, Index depth, Index cols, float alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
-    const int accRows = quad_traits<float>::rows;
-    const int accCols = quad_traits<float>::size;
-    void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index, const int, const int);
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index);
 
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper>;
+      gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper>;
+        gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper>;
+        gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper>;
+      gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
     #endif
-      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
 
 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2591,22 +2681,22 @@
     const int accRows = quad_traits<float>::rows;
     const int accCols = quad_traits<float>::size;
     void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
-          Index, Index, Index, std::complex<float>, Index, Index , Index, Index, const int, const int);
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
 
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
        //generate with MMA only
-       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
        if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
        }
        else{
-         gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
        }
      #else
-       gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      #endif
-       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
 
 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2630,21 +2720,21 @@
     const int accRows = quad_traits<float>::rows;
     const int accCols = quad_traits<float>::size;
     void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
-          Index, Index, Index, std::complex<float>, Index, Index , Index, Index, const int, const int);
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
        //generate with MMA only
-       gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+       gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
        if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-         gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+         gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
        }
        else{
-         gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+         gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
        }
      #else
-       gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+       gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      #endif
-       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
 
 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2668,21 +2758,21 @@
     const int accRows = quad_traits<float>::rows;
     const int accCols = quad_traits<float>::size;
     void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
-          Index, Index, Index, std::complex<float>, Index, Index , Index, Index, const int, const int);
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
        //generate with MMA only
-       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
        if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
        }
        else{
-         gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
        }
      #else
-       gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      #endif
-       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
 
 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2702,24 +2792,24 @@
                Index rows, Index depth, Index cols, double alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
-    const int accRows = quad_traits<double>::rows;
-    const int accCols = quad_traits<double>::size;
-    void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index, const int, const int);
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index);
 
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper>;
+      gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper>;
+        gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper>;
+        gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper>;
+      gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
     #endif
-      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
 
 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2743,21 +2833,21 @@
     const int accRows = quad_traits<double>::rows;
     const int accCols = quad_traits<double>::size;
     void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
-          Index, Index, Index, std::complex<double>, Index, Index , Index, Index, const int, const int);
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
        //generate with MMA only
-       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
        if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
        }
        else{
-         gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
        }
      #else
-       gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, false>;
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      #endif
-       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
 
 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2781,21 +2871,21 @@
     const int accRows = quad_traits<double>::rows;
     const int accCols = quad_traits<double>::size;
     void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
-          Index, Index, Index, std::complex<double>, Index, Index , Index, Index, const int, const int);
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
        //generate with MMA only
-       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
        if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
        }
        else{
-         gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
        }
      #else
-       gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, false, true>;
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      #endif
-       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
 
 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -2819,21 +2909,21 @@
     const int accRows = quad_traits<double>::rows;
     const int accCols = quad_traits<double>::size;
     void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
-          Index, Index, Index, std::complex<double>, Index, Index , Index, Index, const int, const int);
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #ifdef EIGEN_ALTIVEC_MMA_ONLY
        //generate with MMA only
-       gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+       gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
        if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-         gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+         gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
        }
        else{
-         gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+         gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
        }
      #else
-       gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, ConjugateLhs, ConjugateRhs, true, false>;
+       gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      #endif
-       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB, accRows, accCols);
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
 } // end namespace internal
 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 1866a71..a67dbcc 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -16,29 +16,22 @@
 
 namespace internal {
 
-template<typename Packet>
-union Packetx2u
-{
-    __vector_pair             vectorpair;
-    PacketBlock<Packet, 2>    pair;
-};
 const static Packet16uc MMA_p16uc_SETCOMPLEX32_FIRST = {  0,  1,  2,  3,
-                                                     16, 17, 18, 19,
-                                                      4,  5,  6,  7,
-                                                     20, 21, 22, 23};
+                                                         16, 17, 18, 19,
+                                                          4,  5,  6,  7,
+                                                         20, 21, 22, 23};
 
 const static Packet16uc MMA_p16uc_SETCOMPLEX32_SECOND = {  8,  9, 10, 11,
-                                                      24, 25, 26, 27,
-                                                      12, 13, 14, 15,
-                                                      28, 29, 30, 31};
+                                                          24, 25, 26, 27,
+                                                          12, 13, 14, 15,
+                                                          28, 29, 30, 31};
 //[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64
 const static Packet16uc MMA_p16uc_SETCOMPLEX64_FIRST = {  0,  1,  2,  3,  4,  5,  6,  7,
-                                                     16, 17, 18, 19, 20, 21, 22, 23};
+                                                         16, 17, 18, 19, 20, 21, 22, 23};
 
 //[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64
 const static Packet16uc MMA_p16uc_SETCOMPLEX64_SECOND = {  8,  9, 10, 11, 12, 13, 14, 15,
-                                                      24, 25, 26, 27, 28, 29, 30, 31};
-
+                                                          24, 25, 26, 27, 28, 29, 30, 31};
 
 
 // Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
@@ -93,11 +86,11 @@
 template<typename Scalar, typename Packet>
 EIGEN_STRONG_INLINE Packet ploadLhsMMA(const Scalar *lhs)
 {
-    return *((Packet *)lhs);
+  return *((Packet *)lhs);
 }
 
 template<typename Packet>
-EIGEN_STRONG_INLINE PacketBlock<Packet,2> pmul (const PacketBlock<Packet,2>&   a, const Packet&   b)
+EIGEN_STRONG_INLINE PacketBlock<Packet,2> pmul(const PacketBlock<Packet,2>& a, const Packet& b)
 {
   PacketBlock<Packet,2> pb;
   pb.packet[0] = a.packet[0]*b;
@@ -117,13 +110,12 @@
   PacketBlock<Packet, 4> result;
   __builtin_mma_disassemble_acc(&result.packet, acc);
 
-  PacketBlock<Packet, 4> block;
-  block.packet[0] = data.template loadPacket<Packet>(i, j + 0) + pmul<Packet>(alpha, result.packet[0]);
-  block.packet[1] = data.template loadPacket<Packet>(i, j + 1) + pmul<Packet>(alpha, result.packet[1]);
-  block.packet[2] = data.template loadPacket<Packet>(i, j + 2) + pmul<Packet>(alpha, result.packet[2]);
-  block.packet[3] = data.template loadPacket<Packet>(i, j + 3) + pmul<Packet>(alpha, result.packet[3]);
+  result.packet[0] = pmadd<Packet>(alpha, result.packet[0], data.template loadPacket<Packet>(i, j + 0));
+  result.packet[1] = pmadd<Packet>(alpha, result.packet[1], data.template loadPacket<Packet>(i, j + 1));
+  result.packet[2] = pmadd<Packet>(alpha, result.packet[2], data.template loadPacket<Packet>(i, j + 2));
+  result.packet[3] = pmadd<Packet>(alpha, result.packet[3], data.template loadPacket<Packet>(i, j + 3));
 
-  data.template storePacketBlock<Packet, 4>(i, j, block);
+  data.template storePacketBlock<Packet, 4>(i, j, result);
 }
 
 template<typename DataMapper, typename Index, typename Packet, typename Packetc, int N>
@@ -187,38 +179,221 @@
 template<>
 EIGEN_STRONG_INLINE void pgerMMA<Packet2d, PacketBlock<Packet2d, 2>, false>(__vector_quad *acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
 {
-  Packetx2u<Packet2d> p;
-  p.pair = a;
-  __builtin_mma_xvf64gerpp(acc, p.vectorpair, (__vector unsigned char)b);
+  __vector_pair *a0 = (__vector_pair *)(&a.packet[0]);
+  __builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b);
 }
 
 template<>
 EIGEN_STRONG_INLINE void pgerMMA<Packet2d, PacketBlock<Packet2d, 2>, true>(__vector_quad *acc, const PacketBlock<Packet2d, 2>& a, const Packet2d& b)
 {
-  Packetx2u<Packet2d> p;
-  p.pair = a;
-  __builtin_mma_xvf64gernp(acc, p.vectorpair, (__vector unsigned char)b);
+  __vector_pair *a0 = (__vector_pair *)(&a.packet[0]);
+  __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b);
+}
+
+template<>
+EIGEN_STRONG_INLINE void pgerMMA<Packet2d, __vector_pair, false>(__vector_quad *acc, const __vector_pair& a, const Packet2d& b)
+{
+  __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b);
+}
+
+template<>
+EIGEN_STRONG_INLINE void pgerMMA<Packet2d, __vector_pair, true>(__vector_quad *acc, const __vector_pair& a, const Packet2d& b)
+{
+  __builtin_mma_xvf64gernp(acc, a, (__vector unsigned char)b);
 }
 
 // This is necessary because, when MMA is enabled, the RHS for double is loaded as a pair of vectors (or a __vector_pair).
 template<typename Scalar, typename Packet>
-EIGEN_STRONG_INLINE Packet ploadRhsMMA(const Scalar *rhs)
+EIGEN_STRONG_INLINE void ploadRhsMMA(const Scalar *rhs, Packet &rhsV)
 {
-    return *((Packet *)rhs);
+  rhsV = *((Packet *)rhs);
 } 
 
 template<>
-EIGEN_STRONG_INLINE PacketBlock<Packet2d, 2> ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double *rhs)
+EIGEN_STRONG_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double *rhs, PacketBlock<Packet2d, 2> &rhsV)
 {
-    PacketBlock<Packet2d, 2> pair;
-    pair.packet[0] = *((Packet2d *)rhs      );
-    pair.packet[1] = *(((Packet2d *)rhs) + 1);
-    return pair;
+  rhsV.packet[0] = *((Packet2d *)rhs      );
+  rhsV.packet[1] = *(((Packet2d *)rhs) + 1);
 }
 
-template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper>
-void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
-          Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, const int accRows, const int accCols)
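+// Assemble a __vector_pair from two consecutive Packet2d loads so the
+// double-precision MMA ger builtins can consume the RHS as a single operand.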
+template<>
+EIGEN_STRONG_INLINE void ploadRhsMMA<double, __vector_pair>(const double *rhs, __vector_pair &rhsV)
+{
+  __builtin_mma_assemble_pair(&rhsV, (__vector unsigned char)(*(((Packet2d *)rhs) + 1)), (__vector unsigned char)(*((Packet2d *)rhs)));
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
+EIGEN_STRONG_INLINE void gemm_extra_col(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index remaining_rows,
+  Index remaining_cols,
+  const Packet& pAlpha);
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
+EIGEN_STRONG_INLINE void gemm_extra_row(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask);
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_col(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index rows,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlpha);
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows);
+
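+// The MICRO_MMA_* macros below emulate compile-time loop unrolling:
+// MICRO_MMA_UNROLL stamps a per-iteration macro out for indices 0..7, and each
+// *_ONE macro is guarded by the unroll factor so unused accumulators and
+// pointers are marked EIGEN_UNUSED_VARIABLE and optimized away.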
+#define MICRO_MMA_DST \
+  __vector_quad *accZero0, __vector_quad *accZero1, __vector_quad *accZero2, \
+  __vector_quad *accZero3, __vector_quad *accZero4, __vector_quad *accZero5, \
+  __vector_quad *accZero6, __vector_quad *accZero7
+
+#define MICRO_MMA_SRC \
+  const Scalar **lhs_ptr0, const Scalar **lhs_ptr1, const Scalar **lhs_ptr2, \
+  const Scalar **lhs_ptr3, const Scalar **lhs_ptr4, const Scalar **lhs_ptr5, \
+  const Scalar **lhs_ptr6, const Scalar **lhs_ptr7
+
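+// MICRO_MMA_ONE dispatches on the scalar type at compile time: float keeps the
+// caller-supplied RhsPacket, while double is routed through __vector_pair
+// (see ploadRhsMMA above).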
+#define MICRO_MMA_ONE \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_MMA<unroll_factor, Scalar, Packet, RhsPacket, accRows, accCols>(\
+      &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \
+      rhs_ptr, \
+      &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7); \
+  } else { \
+    MICRO_MMA<unroll_factor, Scalar, Packet, __vector_pair, accRows, accCols>(\
+      &lhs_ptr0, &lhs_ptr1, &lhs_ptr2, &lhs_ptr3, &lhs_ptr4, &lhs_ptr5, &lhs_ptr6, &lhs_ptr7, \
+      rhs_ptr, \
+      &accZero0, &accZero1, &accZero2, &accZero3, &accZero4, &accZero5, &accZero6, &accZero7); \
+  }
+
+#define MICRO_MMA_WORK_ONE(iter) \
+  if (N > iter) { \
+    Packet lhsV = ploadLhsMMA<Scalar, Packet>(*lhs_ptr##iter); \
+    pgerMMA<Packet, RhsPacket, false>(accZero##iter, rhsV, lhsV); \
+    *lhs_ptr##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+  }
+
+#define MICRO_MMA_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_MMA_WORK MICRO_MMA_UNROLL(MICRO_MMA_WORK_ONE)
+
+#define MICRO_MMA_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter){ \
+    bsetzeroMMA<Scalar, Packet>(&accZero##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+  }
+
+#define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE)
+
+#define MICRO_MMA_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+  }
+
+#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE)
+
+#define MICRO_MMA_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter){ \
+    prefetch(lhs_ptr##iter); \
+  }
+
+#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE)
+
+#define MICRO_MMA_STORE_ONE(iter) \
+  if (unroll_factor > iter){ \
+    storeAccumulator<DataMapper, Index, Packet>(row + iter*accCols, col, res, pAlpha, &accZero##iter); \
+  }
+
+#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
+
+// Loop peeling factor: number of MICRO_MMA steps executed per iteration of the peeled depth loop.
+#define PEEL_MMA 10
+
+template<int N, typename Scalar, typename Packet, typename RhsPacket, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void MICRO_MMA(
+  MICRO_MMA_SRC,
+  const Scalar* &rhs_ptr,
+  MICRO_MMA_DST)
+  {
+    RhsPacket rhsV;
+    ploadRhsMMA<Scalar, RhsPacket>(rhs_ptr, rhsV);
+    MICRO_MMA_WORK
+    rhs_ptr += accRows;
+  }
+
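+// One unrolled GEMM iteration: accumulates unroll_factor blocks of accCols
+// rows against a single accRows-wide RHS panel using MMA accumulators, stores
+// the alpha-scaled results, and advances 'row' past the rows consumed.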
+template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration(
+  const DataMapper& res,
+  const Scalar *lhs_base,
+  const Scalar *rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index col,
+  const Packet& pAlpha)
+{
+  const Scalar *rhs_ptr = rhs_base;
+  const Scalar *lhs_ptr0, *lhs_ptr1, *lhs_ptr2, *lhs_ptr3, *lhs_ptr4, *lhs_ptr5, *lhs_ptr6, *lhs_ptr7;
+  __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+
+  asm("#unrolled MMA start");
+  MICRO_MMA_SRC_PTR
+  MICRO_MMA_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_MMA <= depth; k+= PEEL_MMA)
+  {
+    prefetch(rhs_ptr);
+    MICRO_MMA_PREFETCH
+    for (int l = 0; l < PEEL_MMA; l++) {
+      MICRO_MMA_ONE
+    }
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_MMA_ONE
+  }
+  MICRO_MMA_STORE
+
+  row += unroll_factor*accCols;
+  asm("#unrolled MMA end");
+}
+
+template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
       const Index remaining_rows = rows % accCols;
       const Index remaining_cols = cols % accRows;
@@ -227,111 +402,89 @@
       if( strideB == -1 ) strideB = depth;
 
       const Packet pAlpha = pset1<Packet>(alpha);
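+      // Lane mask covering the first remaining_rows lanes; used for the
+      // partial stores in gemm_extra_row.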
+      const Packet pMask  = bmask<Packet>((const int)(remaining_rows));
+
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        const Scalar *rhs_base = blockB + ( col/accRows     )*strideB*accRows;
+        const Scalar *rhs_base = blockB + col*strideB + accRows*offsetB;
         const Scalar *lhs_base = blockA;
 
         Index row = 0;
-        for(; row + accCols <= rows; row += accCols)
-        {
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr1 = lhs_base + (row/accCols)*strideA*accCols;
-
-          __vector_quad acc;
-          bsetzeroMMA<Scalar, Packet>(&acc);
-
-          lhs_ptr1 += accCols*offsetA;
-          rhs_ptr += accRows*offsetB;
-          for(Index k = 0; k < depth; k++)
-          {
-            Packet lhsV = ploadLhsMMA<Scalar, Packet>(lhs_ptr1);
-            RhsPacket rhsV = ploadRhsMMA<Scalar, RhsPacket>(rhs_ptr);
-
-            pgerMMA<Packet, RhsPacket, false>(&acc, rhsV, lhsV);
-
-            lhs_ptr1 += accCols;
-            rhs_ptr += accRows;
-          }
-
-          storeAccumulator<DataMapper, Index, Packet>(row, col, res, pAlpha, &acc);
+#define MAX_MMA_UNROLL 7
+        while(row + MAX_MMA_UNROLL*accCols <= rows){
+          gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
         }
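+        // Handle the leftover (< MAX_MMA_UNROLL) row blocks with a switch so
+        // each residual count gets its own fully unrolled instantiation.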
+        switch( (rows-row)/accCols ){
+#if MAX_MMA_UNROLL > 7
+          case 7:
+            gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 6
+          case 6:
+            gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 5
+          case 5:
+            gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 4
+          case 4:
+            gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 3
+          case 3:
+            gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 2
+          case 2:
+            gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 1
+          case 1:
+            gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+          default:
+            break;
+        }
+#undef MAX_MMA_UNROLL
+
         if(remaining_rows > 0)
         {
-          const Scalar *rhs_ptr  = rhs_base;
-          const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
-
-          lhs_ptr += remaining_rows*offsetA;
-          rhs_ptr += accRows*offsetB;
-          for(Index k = 0; k < depth; k++)
-          {
-              for(Index arow = 0; arow < remaining_rows; arow++)
-              {
-                  for(Index acol = 0; acol < accRows; acol++ )
-                  {
-                    res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
-                  }
-              }
-              rhs_ptr += accRows;
-              lhs_ptr += remaining_rows;
-          }
+          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, cols, remaining_rows, pAlpha, pMask);
         }
     }
 
     if(remaining_cols > 0)
     {
-      const Scalar *rhs_base = blockB + (col/accRows)*strideB*accRows;
+      const Scalar *rhs_base = blockB + col*strideB + remaining_cols*offsetB;
       const Scalar *lhs_base = blockA;
 
-      Index row = 0;
-      for(; row + accCols <= rows; row += accCols)
+      for(; col < cols; col++)
       {
-        const Scalar *rhs_ptr = rhs_base;
-        const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
+        Index row = 0;
 
-        lhs_ptr += accCols*offsetA;
-        rhs_ptr += remaining_cols*offsetB;
-        for(Index k = 0; k < depth; k++)
-        {
-          for(Index arow = 0; arow < accCols; arow++)
-          {
-            for(Index acol = 0; acol < remaining_cols; acol++ )
-            {
-              res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
-            }
-          }
-          rhs_ptr += remaining_cols;
-          lhs_ptr += accCols;
-        }
-      }
-      
-      if(remaining_rows > 0 )
-      {
-        const Scalar *rhs_ptr  = rhs_base;
-        const Scalar *lhs_ptr = lhs_base + (row/accCols)*strideA*accCols;
+        gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
 
-        lhs_ptr += remaining_rows*offsetA;
-        rhs_ptr += remaining_cols*offsetB;
-        for(Index k = 0; k < depth; k++)
+        if (remaining_rows > 0)
         {
-            for(Index arow = 0; arow < remaining_rows; arow++)
-            {
-                for(Index acol = 0; acol < remaining_cols; acol++ )
-                {
-                  res(row + arow, col + acol) += alpha*lhs_ptr[arow]*rhs_ptr[acol];
-                }
-            }
-            rhs_ptr += remaining_cols;
-            lhs_ptr += remaining_rows;
+          gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
         }
+        rhs_base++;
       }
     }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const int accRows, const int accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
-          Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, const int accRows, const int accCols)
+          Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
       const int remaining_rows = rows % accCols;
       const int remaining_cols = cols % accRows;
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index f29b595..495afac 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1160,43 +1160,48 @@
   return pand<Packet8us>(p8us_abs_mask, a);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a)
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
 { return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
 { return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a)
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
 { return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(Packet4f a)
+template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
 {
   const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
   Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
   return reinterpret_cast<Packet4f>(r);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(Packet4f a)
+template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
 {
   const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
   Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
   return reinterpret_cast<Packet4f>(r);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a)
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
 {
   const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
   return vec_sr(a, p4ui_mask);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a)
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
 {
   const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
   return vec_sl(a, p4ui_mask);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a)
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
 {
   const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
   return vec_sl(a, p8us_mask);
 }
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sr(a, p8us_mask);
+}
 
 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
   return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
@@ -1323,14 +1328,14 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
-  return pldexp_float(a,exponent);
+  return pldexp_generic(a,exponent);
 }
 template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
-  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp_float, a, exponent);
+  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
-  return pfrexp_float(a,exponent);
+  return pfrexp_generic(a,exponent);
 }
 template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
   Packet4f a_even = Bf16ToF32Even(a);
@@ -2324,16 +2329,20 @@
   return v;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
+  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
+  return reinterpret_cast<Packet2d>(v);
+}
+
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                       Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
-  a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl<0>(a1);
-  a1 = vec_splat_dbl<1>(a1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl<0>(a3);
-  a3 = vec_splat_dbl<1>(a3);
+  // This way is faster than vec_splat (at least for doubles on POWER9).
+  a0 = pset1<Packet2d>(a[0]);
+  a1 = pset1<Packet2d>(a[1]);
+  a2 = pset1<Packet2d>(a[2]);
+  a3 = pset1<Packet2d>(a[3]);
 }
 
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
@@ -2439,7 +2448,8 @@
 // a slow version that works with older compilers. 
 // Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
 // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
-static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
+template<>
+inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
 #if EIGEN_GNUC_AT_LEAST(5, 4) || \
     (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
   return vec_cts(x, 0);    // TODO: check clang version.
@@ -2452,6 +2462,15 @@
 #endif
 }
 
+template<>
+inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
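+  // Extract the two 64-bit lanes through memcpy (portable across compilers)
+  // and convert each lane to double.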
+  unsigned long long tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2d d = { static_cast<double>(tmp[0]),
+                 static_cast<double>(tmp[1]) };
+  return d;
+}
+
 
 // Packet2l shifts.
 // For POWER8 we simply use vec_sr/l. 
@@ -2569,7 +2588,7 @@
 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
   // Clamp exponent to [-2099, 2099]
   const Packet2d max_exponent = pset1<Packet2d>(2099.0);
-  const Packet2l e = ConvertToPacket2l(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
 
   // Split 2^e into four factors and multiply:
   const Packet2l  bias = { 1023, 1023 };
@@ -2582,14 +2601,16 @@
   return out;
 }
 
+
+// Extract the biased exponent directly; Packet2d has no integer_packet, so
+// the generic implementation cannot be used.
+template<>
+EIGEN_STRONG_INLINE
+Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
-  double exp[2] = { exponent[0], exponent[1] };
-  Packet2d ret = { pfrexp<double>(a[0], exp[0]), pfrexp<double>(a[1], exp[1]) };
-  exponent[0] = exp[0];
-  exponent[1] = exp[1];
-  return ret;
-//  This doesn't currently work (no integer_packet for Packet2d - but adding it causes other problems)
-//  return pfrexp_double(a, exponent);
+  return pfrexp_generic(a, exponent);
 }
 
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 452019ec..e4c7e62 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -25,80 +25,117 @@
   return pload<Packet>(a);
 }
 
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pfrexp_float(const Packet& a, Packet& exponent) {
+// Maps a scalar type to a signed integer type with the same bit width.
+template<typename T> struct make_integer;
+template<> struct make_integer<float>    { typedef numext::int32_t type; };
+template<> struct make_integer<double>   { typedef numext::int64_t type; };
+template<> struct make_integer<half>     { typedef numext::int16_t type; };
+template<> struct make_integer<bfloat16> { typedef numext::int16_t type; };
+
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_126f = pset1<Packet>(126.0f);
-  const Packet cst_half = pset1<Packet>(0.5f);
-  const Packet cst_inv_mant_mask  = pset1frombits<Packet>(~0x7f800000u);
-  exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<23>(preinterpret<PacketI>(pabs<Packet>(a)))), cst_126f);
-  return por(pand(a, cst_inv_mant_mask), cst_half);
+  enum { mantissa_bits = numext::numeric_limits<Scalar>::digits - 1};
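+  // e.g. for float: mantissa_bits == 23, so |1.0f| == 0x3f800000 shifted right
+  // by 23 yields 0x7f == 127, the biased exponent of 1.0f.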
+  return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
 }
 
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pfrexp_double(const Packet& a, Packet& exponent) {
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_1022d = pset1<Packet>(1022.0);
-  const Packet cst_half = pset1<Packet>(0.5);
-  const Packet cst_inv_mant_mask  = pset1frombits<Packet, uint64_t>(static_cast<uint64_t>(~0x7ff0000000000000ull));
-  exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<52>(preinterpret<PacketI>(pabs<Packet>(a)))), cst_1022d);
-  return por(pand(a, cst_inv_mant_mask), cst_half);
+// Safely applies frexp, correctly handling denormals.
+// Assumes IEEE floating point format.
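+//
+// Worked example (float): for the denormal a = 2^-130 we have |a| < 2^-126,
+// so a is first scaled by 2^24 to the normal value 2^-106 (biased exponent
+// 21). The applied offset is -126 - 24 = -150, giving exponent = 21 - 150 =
+// -129 and mantissa 0.5, i.e. a = 0.5 * 2^-129.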
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic(const Packet& a, Packet& exponent) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+
+  EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = 
+      ~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << int(MantissaBits)); // ~0x7f800000
+  const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask)); 
+  const Packet half = pset1<Packet>(Scalar(0.5));
+  const Packet zero = pzero(a);
+  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
+  
+  // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
+  const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
+  EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(int(MantissaBits) + 1); // 24
+  // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
+  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
+  const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);  
+  const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
+  
+  // Determine exponent offset: -126 if normal, -126-24 if denormal
+  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(int(ExponentBits)-1)) - ScalarUI(2)); // -126
+  Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
+  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
+  exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
+  
+  // Determine exponent and mantissa from normalized_a.
+  exponent = pfrexp_generic_get_biased_exponent(normalized_a);
+  // Zero, Inf and NaN return 'a' unmodified, exponent is zero
+  // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
+  const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1));  // 255
+  const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
+  const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
+  const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
+  exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));  
+  return m;
 }
 
 // Safely applies ldexp, correctly handles overflows, underflows and denormals.
 // Assumes IEEE floating point format.
-template<typename Packet>
-struct pldexp_impl {
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pldexp_generic(const Packet& a, const Packet& exponent) {
+  // We want to return a * 2^exponent, allowing for all possible integer
+  // exponents without overflowing or underflowing in intermediate
+  // computations.
+  //
+  // Since 'a' and the output can be denormal, the maximum range of 'exponent'
+  // to consider for a float is:
+  //   -255-23 -> 255+23
+  // Below -278 any finite float 'a' will become zero, and above +278 any
+  // finite float will become inf, including when 'a' is the smallest possible 
+  // denormal.
+  //
+  // Unfortunately, 2^(278) cannot be represented using either one or two
+  // finite normal floats, so we must split the scale factor into at least
+  // three parts. It turns out to be faster to split 'exponent' into four
+// factors, since [exponent>>2] is much faster to compute than [exponent/3].
+  //
+  // Set e = min(max(exponent, -278), 278);
+  //     b = floor(e/4);
+  //   out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
+  //
+  // This will avoid any intermediate overflows and correctly handle 0, inf,
+  // NaN cases.
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename unpacket_traits<PacketI>::type ScalarI;
   enum {
     TotalBits = sizeof(Scalar) * CHAR_BIT,
-    MantissaBits = std::numeric_limits<Scalar>::digits - 1,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
     ExponentBits = int(TotalBits) - int(MantissaBits) - 1
   };
-   
-  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-  Packet run(const Packet& a, const Packet& exponent) {
-    // We want to return a * 2^exponent, allowing for all possible integer
-    // exponents without overflowing or underflowing in intermediate
-    // computations.
-    //
-    // Since 'a' and the output can be denormal, the maximum range of 'exponent'
-    // to consider for a float is:
-    //   -255-23 -> 255+23
-    // Below -278 any finite float 'a' will become zero, and above +278 any
-    // finite float will become inf, including when 'a' is the smallest possible 
-    // denormal.
-    //
-    // Unfortunately, 2^(278) cannot be represented using either one or two
-    // finite normal floats, so we must split the scale factor into at least
-    // three parts. It turns out to be faster to split 'exponent' into four
-    // factors, since [exponent>>2] is much faster to compute that [exponent/3].
-    //
-    // Set e = min(max(exponent, -278), 278);
-    //     b = floor(e/4);
-    //   out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
-    //
-    // This will avoid any intermediate overflows and correctly handle 0, inf,
-    // NaN cases.
-    const Packet max_exponent = pset1<Packet>(Scalar( (ScalarI(1)<<int(ExponentBits)) + ScalarI(MantissaBits) - ScalarI(1)));  // 278
-    const PacketI bias = pset1<PacketI>((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1));  // 127
-    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
-    PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
-    Packet c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^b
-    Packet out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
-    b = psub(psub(psub(e, b), b), b); // e - 3b
-    c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^(e-3*b)
-    out = pmul(out, c);
-    return out;
-  }
-};
+
+  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) + ScalarI(int(MantissaBits) - 1)));  // 278
+  const PacketI bias = pset1<PacketI>((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1));  // 127
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
+  Packet c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^b
+  Packet out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b); // e - 3b
+  c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^(e-3*b)
+  out = pmul(out, c);
+  return out;
+}
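
A scalar sketch of the four-factor split described in the comment above (illustrative only; assumes an arithmetic right shift, matching parithmetic_shift_right):

    #include <algorithm>
    #include <cmath>
    float pldexp_sketch(float a, int e) {
      e = std::min(std::max(e, -278), 278);
      int b = e >> 2;                 // floor(e/4); |b| <= 69
      float c = std::ldexp(1.0f, b);  // 2^b is a normal float since |b| <= 69
      // a * 2^(3b) * 2^(e-3b) == a * 2^e, with every factor representable.
      return ((a * c) * c) * c * std::ldexp(1.0f, e - 3 * b);
    }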
 
 // Explicitly multiplies 
 //    a * (2^e)
 // clamping e to the range
-// [std::numeric_limits<Scalar>::min_exponent-2, std::numeric_limits<Scalar>::max_exponent]
+// [numeric_limits<Scalar>::min_exponent-2, numeric_limits<Scalar>::max_exponent]
 //
 // This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
 // if 2^e doesn't fit into a normal floating-point Scalar.
@@ -111,7 +148,7 @@
   typedef typename unpacket_traits<PacketI>::type ScalarI;
   enum {
     TotalBits = sizeof(Scalar) * CHAR_BIT,
-    MantissaBits = std::numeric_limits<Scalar>::digits - 1,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
     ExponentBits = int(TotalBits) - int(MantissaBits) - 1
   };
   
@@ -126,14 +163,6 @@
   }
 };
 
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_float(const Packet& a, const Packet& exponent)
-{ return pldexp_impl<Packet>::run(a, exponent); }
-
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_double(const Packet& a, const Packet& exponent)
-{ return pldexp_impl<Packet>::run(a, exponent); }
-
 // Natural or base 2 logarithm.
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@@ -438,16 +467,10 @@
   // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
   // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
   // truncation errors.
-  Packet r;
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-  const Packet cst_nln2 = pset1<Packet>(-0.6931471805599453f);
-  r = pmadd(m, cst_nln2, x);
-#else
-  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693359375f);
-  const Packet cst_cephes_exp_C2 = pset1<Packet>(-2.12194440e-4f);
-  r = psub(x, pmul(m, cst_cephes_exp_C1));
-  r = psub(r, pmul(m, cst_cephes_exp_C2));
-#endif
+  const Packet cst_cephes_exp_C1 = pset1<Packet>(-0.693359375f);
+  const Packet cst_cephes_exp_C2 = pset1<Packet>(2.12194440e-4f);
+  Packet r = pmadd(m, cst_cephes_exp_C1, x);
+  r = pmadd(m, cst_cephes_exp_C2, r);
 
   Packet r2 = pmul(r, r);
   Packet r3 = pmul(r2, r);
@@ -864,112 +887,355 @@
                   pselect(is_real_inf, real_inf_result,result));
 }
 
-
-// This function implements the Veltkamp splitting. Given a floating point
-// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
-// exactly and that half of the significant of x fits in x_hi.
-// This code corresponds to Algorithms 3 and 4 in
-// https://hal.inria.fr/hal-01774587v2/document
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
-  Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
-  Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-  x_hi = pmadd(pset1<Packet>(-shift_scale), x, gamma);
-#else
-  Packet rho = psub(x, gamma);
-  x_hi = padd(rho, gamma);
-#endif
-  x_lo = psub(x, x_hi);
-}
+// TODO(rmlarsen): The following set of utilities for double word arithmetic
+// should perhaps be refactored as a separate file, since it would be generally
+// useful for special function implementation etc. Writing the algorithms in
+// terms of a double word type would also make the code more readable.
 
 // This function splits x into the nearest integer n and fractional part r,
 // such that x = n + r holds exactly.
 template<typename Packet>
 EIGEN_STRONG_INLINE
-void integer_split(const Packet& x, Packet& n, Packet& r) {
+void absolute_split(const Packet& x, Packet& n, Packet& r) {
   n = pround(x);
   r = psub(x, n);
 }
 
-// This function implements Dekker's algorithm for two products {x * y1, x * y2} with
-// a shared factor. Given floating point numbers {x, y1, y2} computes the pairs
-// {p1, r1} and {p2, r2} such that x * y1 = p1 + r1 holds exactly and
-// p1 = fl(x * y1), and x * y2 = p2 + r2 holds exactly and p2 = fl(x * y2).
+// This function computes the sum {s_hi, s_lo} such that x + y = s_hi + s_lo
+// holds exactly and s_hi = fl(x+y), provided |x| >= |y|.
 template<typename Packet>
 EIGEN_STRONG_INLINE
-void double_dekker(const Packet& x, const Packet& y1, const Packet& y2,
-                   Packet& p1, Packet& r1, Packet& p2, Packet& r2) {
-  Packet x_hi, x_lo, y1_hi, y1_lo, y2_hi, y2_lo;
-  veltkamp_splitting(x, x_hi, x_lo);
-  veltkamp_splitting(y1, y1_hi, y1_lo);
-  veltkamp_splitting(y2, y2_hi, y2_lo);
-
-  p1 = pmul(x, y1);
-  r1 = pmadd(x_hi, y1_hi, pnegate(p1));
-  r1 = pmadd(x_hi, y1_lo, r1);
-  r1 = pmadd(x_lo, y1_hi, r1);
-  r1 = pmadd(x_lo, y1_lo, r1);
-
-  p2 = pmul(x, y2);
-  r2 = pmadd(x_hi, y2_hi, pnegate(p2));
-  r2 = pmadd(x_hi, y2_lo, r2);
-  r2 = pmadd(x_lo, y2_hi, r2);
-  r2 = pmadd(x_lo, y2_lo, r2);
+void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
+  s_hi = padd(x, y);
+  const Packet t = psub(s_hi, x);
+  s_lo = psub(y, t);
 }
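
A scalar illustration (not part of the patch) of why fast_twosum is an error-free transformation when |x| >= |y|:

    #include <cassert>
    int main() {
      double x = 1.0, y = 1e-20;
      double s_hi = x + y;   // rounds to 1.0; y is entirely absorbed
      double t = s_hi - x;   // exact, since |x| >= |y|
      double s_lo = y - t;   // recovers the rounding error: 1e-20
      assert(s_hi == 1.0 && s_lo == 1e-20);
      return 0;
    }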
 
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+// This function implements the extended precision product of
+// a pair of floating point numbers. Given {x, y}, it computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  p_hi = pmul(x, y);
+  p_lo = pmadd(x, y, pnegate(p_hi));
+}
+
+#else
+
+// This function implements the Veltkamp splitting. Given a floating point
+// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
+// exactly and that half of the significand of x fits in x_hi.
+// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+  const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
+  const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
+  Packet rho = psub(x, gamma);
+  x_hi = padd(rho, gamma);
+  x_lo = psub(x, x_hi);
+}
+
+// This function implements Dekker's algorithm for products x * y.
+// Given floating point numbers {x, y} computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  p_hi = pmul(x, y);
+  p_lo = pmadd(x_hi, y_hi, pnegate(p_hi));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+}
+
+#endif  // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
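
The FMA branch works because the rounding error of a product is itself representable: fma evaluates x*y - p_hi with a single rounding, so p_lo is exact. A scalar sketch, assuming std::fma maps to a hardware FMA:

    #include <cmath>
    void twoprod_sketch(double x, double y, double& p_hi, double& p_lo) {
      p_hi = x * y;                  // fl(x * y)
      p_lo = std::fma(x, y, -p_hi);  // exact: x*y - fl(x*y)
    }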
+
+
+// This function implements Dekker's algorithm for the addition
+// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {s_hi, s_lo} such that
+// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
+// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twosum(const Packet& x_hi, const Packet& x_lo,
+            const Packet& y_hi, const Packet& y_lo,
+            Packet& s_hi, Packet& s_lo) {
+  const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
+  Packet r_hi_1, r_lo_1;
+  fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
+  Packet r_hi_2, r_lo_2;
+  fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
+  const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
+
+  const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
+  const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo);
+  const Packet s = pselect(x_greater_mask, s1, s2);
+
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for double word numbers,
+// which assumes that |x_hi| >= |y_hi|.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void fast_twosum(const Packet& x_hi, const Packet& x_lo,
+                 const Packet& y_hi, const Packet& y_lo,
+                 Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x_hi, y_hi, r_hi, r_lo);
+  const Packet s = padd(padd(y_lo, r_lo), x_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This function implements the multiplication of a double word
+// number represented by {x_hi, x_lo} by a floating point number y.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * y = p_hi + p_lo holds with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bits
+// in the floating point type.
+// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  Packet c_hi, c_lo1;
+  twoprod(x_hi, y, c_hi, c_lo1);
+  const Packet c_lo2 = pmul(x_lo, y);
+  Packet t_hi, t_lo1;
+  fast_twosum(c_hi, c_lo2, t_hi, t_lo1);
+  const Packet t_lo2 = padd(t_lo1, c_lo1);
+  fast_twosum(t_hi, t_lo2, p_hi, p_lo);
+}
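
A scalar transcription of the steps above (illustrative only; written in the FMA form of twoprod):

    #include <cmath>
    void twoprod_dw_sketch(double x_hi, double x_lo, double y,
                           double& p_hi, double& p_lo) {
      double c_hi = x_hi * y;
      double c_lo1 = std::fma(x_hi, y, -c_hi);  // exact error of x_hi * y
      double c_lo2 = x_lo * y;                  // low-order contribution
      double t_hi = c_hi + c_lo2;               // fast_twosum(c_hi, c_lo2, ...)
      double t_lo1 = c_lo2 - (t_hi - c_hi);
      double t_lo2 = t_lo1 + c_lo1;
      p_hi = t_hi + t_lo2;                      // fast_twosum(t_hi, t_lo2, ...)
      p_lo = t_lo2 - (p_hi - t_hi);
    }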
+
+// This function computes log2(x) and returns the result as a double word.
+template <typename Scalar>
+struct accurate_log2 {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+    log2_x_hi = plog2(x);
+    log2_x_lo = pzero(x);
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+template <>
+struct accurate_log2<float> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
+    // The function log2(1+x)/x is approximated in the interval
+    // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
+    //  Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
+    // where the degree 6 polynomial P(x) is evaluated in single precision,
+    // while the remaining 4 terms of Q(x), as well as the final multiplication by x
+    // to reconstruct log2(1+x) are evaluated in extra precision using
+    // double word arithmetic. C0 through C3 are extra precise constants
+    // stored as double words.
+    //
+    // The polynomial coefficients were calculated using Sollya commands:
+    // > n = 10;
+    // > f = log2(1+x)/x;
+    // > interval = [sqrt(0.5)-1;sqrt(2)-1];
+    // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
+    
+    const Packet p6 = pset1<Packet>( 9.703654795885e-2f);
+    const Packet p5 = pset1<Packet>(-0.1690667718648f);
+    const Packet p4 = pset1<Packet>( 0.1720575392246f);
+    const Packet p3 = pset1<Packet>(-0.1789081543684f);
+    const Packet p2 = pset1<Packet>( 0.2050433009862f);
+    const Packet p1 = pset1<Packet>(-0.2404672354459f);
+    const Packet p0 = pset1<Packet>( 0.2885761857032f);
+
+    const Packet C3_hi = pset1<Packet>(-0.360674142838f);
+    const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
+    const Packet C2_hi = pset1<Packet>(0.480897903442f);
+    const Packet C2_lo = pset1<Packet>(-1.44861207474e-08f);
+    const Packet C1_hi = pset1<Packet>(-0.721347510815f);
+    const Packet C1_lo = pset1<Packet>(-4.84483164698e-09f);
+    const Packet C0_hi = pset1<Packet>(1.44269502163f);
+    const Packet C0_lo = pset1<Packet>(2.01711713999e-08f);
+    const Packet one = pset1<Packet>(1.0f);
+
+    const Packet x = psub(z, one);
+    // Evaluate P(x) in working precision.
+    // We evaluate it in multiple parts to improve instruction level
+    // parallelism.
+    Packet x2 = pmul(x,x);
+    Packet p_even = pmadd(p6, x2, p4);
+    p_even = pmadd(p_even, x2, p2);
+    p_even = pmadd(p_even, x2, p0);
+    Packet p_odd = pmadd(p5, x2, p3);
+    p_odd = pmadd(p_odd, x2, p1);
+    Packet p = pmadd(p_odd, x, p_even);
+
+    // Now evaluate the low-order terms of Q(x) in double word precision.
+    // In the following, due to the alternating signs and the fact that
+    // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use
+    // fast_twosum instead of the slower twosum.
+    Packet q_hi, q_lo;
+    Packet t_hi, t_lo;
+    // C3 + x * p(x)
+    twoprod(p, x, t_hi, t_lo);
+    fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo);
+    // C2 + x * (C3 + x * p(x))
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo);
+    // C1 + x * (C2 + x * (C3 + x * p(x)))
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo);
+    // C0 + x * (C1 + x * (C2 + x * (C3 + x * p(x))))
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo);
+
+    // log2(z) ~= x * Q(x)
+    twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This function computes exp2(x) (i.e. 2**x).
+template <typename Scalar>
+struct fast_accurate_exp2 {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  Packet operator()(const Packet& x) {
+    // TODO(rmlarsen): Add a pexp2 packetop.
+    return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
+  }
+};
+
+// This specialization uses a faster algorithm to compute exp2(x) for floats
+// in [-0.5;0.5] with a relative accuracy of 1 ulp.
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+template <>
+struct fast_accurate_exp2<float> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  Packet operator()(const Packet& x) {
+    // This function approximates exp2(x) by a degree 6 polynomial of the form
+    // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
+    // single precision, and the remaining steps are evaluated with extra precision using
+    // double word arithmetic. C is an extra precise constant stored as a double word.
+    //
+    // The polynomial coefficients were calculated using Sollya commands:
+    // > n = 6;
+    // > f = 2^x;
+    // > interval = [-0.5;0.5];
+    // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating);
+
+    const Packet p4 = pset1<Packet>(1.539513905e-4f);
+    const Packet p3 = pset1<Packet>(1.340007293e-3f);
+    const Packet p2 = pset1<Packet>(9.618283249e-3f);
+    const Packet p1 = pset1<Packet>(5.550328270e-2f);
+    const Packet p0 = pset1<Packet>(0.2402264923f);
+
+    const Packet C_hi = pset1<Packet>(0.6931471825f);
+    const Packet C_lo = pset1<Packet>(2.36836577e-08f);
+    const Packet one = pset1<Packet>(1.0f);
+
+    // Evaluate P(x) in working precision.
+    // We evaluate even and odd parts of the polynomial separately
+    // to gain some instruction level parallelism.
+    Packet x2 = pmul(x,x);
+    Packet p_even = pmadd(p4, x2, p2);
+    p_even = pmadd(p_even, x2, p0);
+    Packet p_odd = pmadd(p3, x2, p1);
+    Packet p = pmadd(p_odd, x, p_even);
+
+    // Evaluate the remaining terms of Q(x) with extra precision using
+    // double word arithmetic.
+    Packet p_hi, p_lo;
+    // x * p(x)
+    twoprod(p, x, p_hi, p_lo);
+    // C + x * p(x)
+    Packet q1_hi, q1_lo;
+    twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
+    // x * (C + x * p(x))
+    Packet q2_hi, q2_lo;
+    twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
+    // 1 + x * (C + x * p(x))
+    Packet q3_hi, q3_lo;
+    // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
+    // for adding it to unity here.
+    fast_twosum(one, q2_hi, q3_hi, q3_lo);
+    return padd(q3_hi, padd(q2_lo, q3_lo));
+  }
+};
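
A quick sanity check (illustrative only) of the domain and range this kernel targets, using std::exp2 as the reference:

    #include <cassert>
    #include <cmath>
    int main() {
      for (float r = -0.5f; r <= 0.5f; r += 0.0625f) {
        float ref = std::exp2(r);
        // 2^r for r in [-0.5, 0.5] stays within [2^-0.5, 2^0.5].
        assert(ref > 0.70f && ref < 1.42f);
      }
      return 0;
    }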
+
 // This function implements the non-trivial case of pow(x,y) where x is
 // positive and y is (possibly) non-integer.
-// Formally, pow(x,y) = 2**(y * log2(x))
-template<typename Packet>
-EIGEN_STRONG_INLINE
-Packet generic_pow_impl(const Packet& x, const Packet& y) {
+// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
+// TODO(rmlarsen): We should probably add this as a packet op 'ppow', to make it
+// easier to specialize or turn off for specific types and/or backends.
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   // Split x into exponent e_x and mantissa m_x.
   Packet e_x;
   Packet m_x = pfrexp(x, e_x);
 
-  // Adjust m_x to lie in [0.75:1.5) to minimize absolute error in log2(m_x).
-  Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(Scalar(0.75)));
+  // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
+  EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440);
+  const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
   m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
   e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
 
-  Packet r_x = plog2(m_x);
+  // Compute log2(m_x) with 6 extra bits of accuracy.
+  Packet rx_hi, rx_lo;
+  accurate_log2<Scalar>()(m_x, rx_hi, rx_lo);
 
   // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
-  // precision using Dekker's algorithm.
+  // precision using double word arithmetic.
   Packet f1_hi, f1_lo, f2_hi, f2_lo;
-  double_dekker(y, e_x, r_x, f1_hi, f1_lo, f2_hi, f2_lo);
+  twoprod(e_x, y, f1_hi, f1_lo);
+  twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo);
+  // Sum the two terms in f using double word arithmetic. We know
+  // that |e_x| > |log2(m_x)|, except for the case where e_x==0.
+  // This means that we can use fast_twosum(f1,f2).
+  // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any
+  // accuracy by violating the assumption of fast_twosum, because
+  // it's a no-op.
+  Packet f_hi, f_lo;
+  fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo);
 
-  // Separate f into integer and fractional parts, keeping f1_hi, and f2_hi
-  // separate to avoid cancellation.
-  Packet n1, r1, n2, r2;
-  integer_split(f1_hi, n1, r1);
-  integer_split(f2_hi, n2, r2);
-
-  // Add up integer parts and sum the remainders.
-  Packet n_z = padd(n1, n2);
-  // Notice: I experimented with using compensated (Kahan) summation here,
-  // but it does not seem to matter.
-  Packet rem = padd(padd(f1_lo, f2_lo), padd(r1, r2));
-
-  // Extract any additional integer part that may have accumulated in rem.
-  Packet nrem, r_z;
-  integer_split(rem, nrem, r_z);
-  n_z = padd(n_z, nrem);
+  // Split f into integer and fractional parts.
+  Packet n_z, r_z;
+  absolute_split(f_hi, n_z, r_z);
+  r_z = padd(r_z, f_lo);
+  Packet n_r;
+  absolute_split(r_z, n_r, r_z);
+  n_z = padd(n_z, n_r);
 
   // We now have an accurate split of f = n_z + r_z and can compute
-  // x^y = 2**{n_z + r_z) = exp(ln(2) * r_z) * 2**{n_z}.
-  // The first factor we compute by calling pexp(), while multiplication
-  // by an integer power of 2 can be done exactly using pldexp().
-  // Note: I experimented with using Dekker's algorithms for the
-  // multiplication by ln(2) here, but did not see any difference.
-  Packet e_r = pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), r_z));
-  // TODO: investigate bounds of e_r and n_z, potentially using faster
-  //       implementation of ldexp.
+  //   x^y = 2**(n_z + r_z) = exp2(r_z) * 2**(n_z).
+  // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy
+  // using a specialized algorithm. Multiplication by the second factor can
+  // be done exactly using pldexp(), since it is an integer power of 2.
+  const Packet e_r = fast_accurate_exp2<Scalar>()(r_z);
   return pldexp(e_r, n_z);
 }
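
As a worked example (not from the patch): for pow(3, 2.5), f = 2.5 * log2(3) ~ 3.9624, which splits into n_z = 4 and r_z ~ -0.0376. fast_accurate_exp2 evaluates 2^(-0.0376) ~ 0.9743, and pldexp scales it by 2^4 = 16, giving ~15.588, i.e. 3^2.5 = 9*sqrt(3).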
 
@@ -979,66 +1245,75 @@
 EIGEN_UNUSED
 Packet generic_pow(const Packet& x, const Packet& y) {
   typedef typename unpacket_traits<Packet>::type Scalar;
+
   const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
   const Packet cst_zero = pset1<Packet>(Scalar(0));
   const Packet cst_one = pset1<Packet>(Scalar(1));
-  const Packet cst_half = pset1<Packet>(Scalar(0.5));
   const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
 
-  Packet abs_x = pabs(x);
+  const Packet abs_x = pabs(x);
   // Predicates for sign and magnitude of x.
-  Packet x_is_zero = pcmp_eq(x, cst_zero);
-  Packet x_is_neg = pcmp_lt(x, cst_zero);
-  Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
-  Packet abs_x_is_one =  pcmp_eq(abs_x, cst_one);
-  Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
-  Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
-  Packet x_is_one =  pandnot(abs_x_is_one, x_is_neg);
-  Packet x_is_neg_one =  pand(abs_x_is_one, x_is_neg);
-  Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
+  const Packet x_is_zero = pcmp_eq(x, cst_zero);
+  const Packet x_is_neg = pcmp_lt(x, cst_zero);
+  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
+  const Packet abs_x_is_one =  pcmp_eq(abs_x, cst_one);
+  const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
+  const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
+  const Packet x_is_one =  pandnot(abs_x_is_one, x_is_neg);
+  const Packet x_is_neg_one =  pand(abs_x_is_one, x_is_neg);
+  const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
 
   // Predicates for sign and magnitude of y.
-  Packet y_is_zero = pcmp_eq(y, cst_zero);
-  Packet y_is_neg = pcmp_lt(y, cst_zero);
-  Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg));
-  Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
-  Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
-  // |y| is so large that (1+eps)^y over- or underflows.
+  const Packet y_is_one = pcmp_eq(y, cst_one);
+  const Packet y_is_zero = pcmp_eq(y, cst_zero);
+  const Packet y_is_neg = pcmp_lt(y, cst_zero);
+  const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg));
+  const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
+  const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
   EIGEN_CONSTEXPR Scalar huge_exponent =
-      (std::numeric_limits<Scalar>::digits * Scalar(EIGEN_LOG2E)) /
+      (std::numeric_limits<Scalar>::max_exponent * Scalar(EIGEN_LN2)) /
       std::numeric_limits<Scalar>::epsilon();
-  Packet abs_y_is_huge = pcmp_lt(pset1<Packet>(huge_exponent), pabs(y));
+  const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
 
   // Predicates for whether y is integer and/or even.
-  Packet y_is_int = pcmp_eq(pfloor(y), y);
-  Packet y_div_2 = pmul(y, cst_half);
-  Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
+  const Packet y_is_int = pcmp_eq(pfloor(y), y);
+  const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
+  const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
 
   // Predicates encoding special cases for the value of pow(x,y)
-  Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf);
-  Packet pow_is_one = por(por(x_is_one, y_is_zero),
-                          pand(x_is_neg_one,
-                               por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
-  Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
-  Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)),
-                               pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)),
-                           pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg));
-  Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
-                              pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)),
-                          pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos));
+  const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf),
+                                                    y_is_int),
+                                            abs_y_is_inf);
+  const Packet pow_is_one = por(por(x_is_one, y_is_zero),
+                                pand(x_is_neg_one,
+                                     por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
+  const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
+  const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos),
+                                         pand(abs_x_is_inf, y_is_neg)),
+                                     pand(pand(abs_x_is_lt_one, abs_y_is_huge),
+                                          y_is_pos)),
+                                 pand(pand(abs_x_is_gt_one, abs_y_is_huge),
+                                      y_is_neg));
+  const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg),
+                                        pand(abs_x_is_inf, y_is_pos)),
+                                    pand(pand(abs_x_is_lt_one, abs_y_is_huge),
+                                         y_is_neg)),
+                                pand(pand(abs_x_is_gt_one, abs_y_is_huge),
+                                     y_is_pos));
 
   // General computation of pow(x,y) for positive x or negative x and integer y.
-  Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
-  Packet pow_abs = generic_pow_impl(abs_x, y);
-
-  return pselect(pow_is_one, cst_one,
-                 pselect(pow_is_nan, cst_nan,
-                         pselect(pow_is_inf, cst_pos_inf,
-                                 pselect(pow_is_zero, cst_zero,
-                                         pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))));
+  const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
+  const Packet pow_abs = generic_pow_impl(abs_x, y);
+  return pselect(y_is_one, x,
+                 pselect(pow_is_one, cst_one,
+                         pselect(pow_is_nan, cst_nan,
+                                 pselect(pow_is_inf, cst_pos_inf,
+                                         pselect(pow_is_zero, cst_zero,
+                                                 pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
 }
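
For reference, a few of the IEEE-754 special cases the predicates above encode (see C99 Annex F for the full table; the first line is the new y_is_one fast path):

    pow(x, 1)        -> x      for any x
    pow(1, y)        -> 1      even for y = NaN
    pow(-1, +/-inf)  -> 1
    pow(+0, y < 0)   -> +inf;  pow(+0, y > 0) -> +0
    pow(x < 0, y)    -> NaN    for finite non-integer y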
 
 
+
 /* polevl (modified for Eigen)
  *
  *      Evaluate polynomial
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 96c572f..da8499b 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -25,29 +25,23 @@
  * Some generic implementations to be used by implementors
 ***************************************************************************/
 
-/** Default implementation of pfrexp for float.
+/** Default implementation of pfrexp.
   * It is expected to be called by implementers of template<> pfrexp.
   */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pfrexp_float(const Packet& a, Packet& exponent);
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic(const Packet& a, Packet& exponent);
 
-/** Default implementation of pfrexp for double.
-  * It is expected to be called by implementers of template<> pfrexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pfrexp_double(const Packet& a, Packet& exponent);
+// Extracts the biased exponent value from Packet a, and casts the result to
+// a floating-point Packet type. Used by pfrexp_generic. Override this if
+// there is no unpacket_traits<Packet>::integer_packet.
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic_get_biased_exponent(const Packet& a);
 
-/** Default implementation of pldexp for float.
+/** Default implementation of pldexp.
   * It is expected to be called by implementers of template<> pldexp.
   */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_float(const Packet& a, const Packet& exponent);
-
-/** Default implementation of pldexp for double.
-  * It is expected to be called by implementers of template<> pldexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_double(const Packet& a, const Packet& exponent);
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pldexp_generic(const Packet& a, const Packet& exponent);
 
 /** \internal \returns log(x) for single precision float */
 template <typename Packet>
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index c7d5397..2e06bef 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -2402,14 +2402,14 @@
 template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
 
 template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
-{ return pfrexp_float(a,exponent); }
+{ return pfrexp_generic(a,exponent); }
 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
-{ return pfrexp_float(a,exponent); }
+{ return pfrexp_generic(a,exponent); }
 
 template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
-{ return pldexp_float(a,exponent); }
+{ return pldexp_generic(a,exponent); }
 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
-{ return pldexp_float(a,exponent); }
+{ return pldexp_generic(a,exponent); }
 
 template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
@@ -3019,6 +3019,20 @@
   kernel.packet[3] = vreinterpretq_s16_u32(zip32_2.val[1]);
 }
 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel)
+{
+  const int8x16x2_t zip8_1 = vzipq_s8(kernel.packet[0], kernel.packet[1]);
+  const int8x16x2_t zip8_2 = vzipq_s8(kernel.packet[2], kernel.packet[3]);
+
+  const int16x8x2_t zip16_1 = vzipq_s16(vreinterpretq_s16_s8(zip8_1.val[0]), vreinterpretq_s16_s8(zip8_2.val[0]));
+  const int16x8x2_t zip16_2 = vzipq_s16(vreinterpretq_s16_s8(zip8_1.val[1]), vreinterpretq_s16_s8(zip8_2.val[1]));
+
+  kernel.packet[0] = vreinterpretq_s8_s16(zip16_1.val[0]);
+  kernel.packet[1] = vreinterpretq_s8_s16(zip16_1.val[1]);
+  kernel.packet[2] = vreinterpretq_s8_s16(zip16_2.val[0]);
+  kernel.packet[3] = vreinterpretq_s8_s16(zip16_2.val[1]);
+}
+
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel)
 {
   const uint8x16x2_t zip8_1 = vzipq_u8(kernel.packet[0], kernel.packet[1]);
@@ -3893,10 +3907,10 @@
 { return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
-{ return pldexp_double(a, exponent); }
+{ return pldexp_generic(a, exponent); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
-{ return pfrexp_double(a,exponent); }
+{ return pfrexp_generic(a,exponent); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
 { return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 422b0fc..401a497 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -887,21 +887,24 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
-  return pfrexp_float(a,exponent);
+  return pfrexp_generic(a,exponent);
+}
+
+// Extract exponent without existence of Packet2l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  const Packet2d cst_exp_mask  = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
+  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
-  const Packet2d cst_1022d = pset1<Packet2d>(1022.0);
-  const Packet2d cst_half = pset1<Packet2d>(0.5);
-  const Packet2d cst_exp_mask  = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
-  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
-  exponent = psub(_mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3)), cst_1022d);
-  const Packet2d cst_mant_mask  = pset1frombits<Packet2d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
-  return por(pand(a, cst_mant_mask), cst_half);
+  return pfrexp_generic(a, exponent);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
-  return pldexp_float(a,exponent);
+  return pldexp_generic(a,exponent);
 }
 
 // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 98585e6..4877b6d 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -669,7 +669,7 @@
 template <>
 EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
 {
-  return pfrexp_float(a, exponent);
+  return pfrexp_generic(a, exponent);
 }
 
 template <>
@@ -747,7 +747,7 @@
 template<>
 EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent)
 {
-  return pldexp_float(a, exponent);
+  return pldexp_generic(a, exponent);
 }
 
 }  // namespace internal
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 6c05914..8691aa1 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -16,8 +16,8 @@
 //------------------------------------------------------------------------------------------
 
 #define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 3
-#define EIGEN_MINOR_VERSION 90
+#define EIGEN_MAJOR_VERSION 4
+#define EIGEN_MINOR_VERSION 99
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 14713aaa..bb9144d 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -781,31 +781,48 @@
 #else
 
 #if EIGEN_MAX_ALIGN_BYTES!=0
+#if !defined(EIGEN_HIPCC)
+  #define EIGEN_DEVICE_FUNC_NO_HIPCC EIGEN_DEVICE_FUNC
+#else
+  #define EIGEN_DEVICE_FUNC_NO_HIPCC
+#endif
   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
         EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
         EIGEN_CATCH (...) { return 0; } \
       }
   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void *operator new(std::size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void *operator new[](std::size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
       /* in-place new and delete. since (at least afaik) there is no actual   */ \
       /* memory allocated we can safely let the default implementation handle */ \
       /* this particular case. */ \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \
       /* nothrow-new (returns zero instead of std::bad_alloc) */ \
       EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC_NO_HIPCC \
       void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \
         Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
       } \
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index ab1bc01..32a03ae 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -194,9 +194,15 @@
 template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
 #endif
 
-// TODO: Some platforms define int64_t as long long even for C++03. In this case
-// we are missing the definition for make_unsigned. If we just define it, we get
-// duplicated definitions for platforms defining int64_t as signed long for C++03
+// Some platforms define int64_t as long long even for C++03. In this case we
+// are missing the definition for make_unsigned. If we just define it, we get
+// duplicated definitions for platforms defining int64_t as signed long for
+// C++03. We therefore add the specialization for C++03 long long for these
+// platforms only.
+#if EIGEN_OS_MAC
+template<> struct make_unsigned<unsigned long long> { typedef unsigned long long type; };
+template<> struct make_unsigned<long long>          { typedef unsigned long long type; };
+#endif
 #endif
 
 template <typename T> struct add_const { typedef const T type; };
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 354e33d..190210d 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -650,9 +650,8 @@
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
-  const Index size = m_matrix.rows();
   const Index rhsCols = b.cols();
-  eigen_assert(size==b.rows());
+  eigen_assert(m_matrix.rows()==b.rows());
 
   m_sluOptions.Trans = NOTRANS;
   m_sluOptions.Fact = FACTORED;
@@ -974,9 +973,8 @@
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
-  const int size = m_matrix.rows();
   const int rhsCols = b.cols();
-  eigen_assert(size==b.rows());
+  eigen_assert(m_matrix.rows()==b.rows());
 
   m_sluOptions.Trans = NOTRANS;
   m_sluOptions.Fact = FACTORED;
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
index da89857..22cf346 100644
--- a/debug/msvc/eigen.natvis
+++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@
-<?xml version="1.0" encoding="utf-8"?>

-

-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">

-

-  <!-- Fixed x Fixed Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>

-      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>

-      <Expand>

-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

-          <Rank>2</Rank>

-          <Size>$i==0 ? $T2 : $T3</Size>

-          <ValuePointer>m_storage.m_data.array</ValuePointer>

-        </ArrayItems>

-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

-          <Direction>Backward</Direction>

-          <Rank>2</Rank>

-          <Size>$i==0 ? $T2 : $T3</Size>

-          <ValuePointer>m_storage.m_data.array</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- 2 x 2 Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>

-      <DisplayString>[2, 2] (fixed matrix)</DisplayString>

-      <Expand>

-        <Synthetic Name="[row 0]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>

-        </Synthetic>        

-      </Expand>

-  </Type>

-  

-  <!-- 3 x 3 Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>

-      <DisplayString>[3, 3] (fixed matrix)</DisplayString>

-      <Expand>

-        <Synthetic Name="[row 0]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 2]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 2]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>

-        </Synthetic>        

-      </Expand>

-  </Type>

-  

-  <!-- 4 x 4 Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>

-      <DisplayString>[4, 4] (fixed matrix)</DisplayString>

-      <Expand>

-        <Synthetic Name="[row 0]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 2]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 2]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 3]" Condition="Flags%2">

-          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>

-        </Synthetic>

-        <Synthetic Name="[row 3]" Condition="!(Flags%2)">

-          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>

-        </Synthetic>        

-      </Expand>

-  </Type>  

-  

-  <!-- Dynamic x Dynamic Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">      

-      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>

-      <Expand>

-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

-          <Rank>2</Rank>

-          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

-          <Direction>Backward</Direction>

-          <Rank>2</Rank>

-          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Fixed x Dynamic Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>

-      <Expand>

-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

-          <Rank>2</Rank>

-          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

-          <Direction>Backward</Direction>

-          <Rank>2</Rank>

-          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Dynamic x Fixed Matrix -->

-  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>

-      <Expand>

-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->

-          <Rank>2</Rank>

-          <Size>$i==0 ? m_storage.m_rows : $T2</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->

-          <Direction>Backward</Direction>

-          <Rank>2</Rank>

-          <Size>$i==0 ? m_storage.m_rows : $T2</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Dynamic Column Vector -->

-  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>

-      <Expand>

-        <Item Name="[size]">m_storage.m_cols</Item>

-        <ArrayItems>

-          <Size>m_storage.m_cols</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Dynamic Row Vector -->

-  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>

-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>

-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>

-      <Expand>

-        <Item Name="[size]">m_storage.m_rows</Item>

-        <ArrayItems>

-          <Size>m_storage.m_rows</Size>

-          <ValuePointer>m_storage.m_data</ValuePointer>

-        </ArrayItems>

-      </Expand>

-  </Type>

-  

-  <!-- Fixed Vector -->

-  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>

-      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>

-      <Expand>

-        <Item Name="[x]">m_storage.m_data.array[0]</Item>

-      </Expand>

-  </Type>

-  

-  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>

-      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>

-      <Expand>

-        <Item Name="[x]">m_storage.m_data.array[0]</Item>

-        <Item Name="[y]">m_storage.m_data.array[1]</Item>

-      </Expand>

-  </Type>

-  

-  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>

-      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>

-      <Expand>

-        <Item Name="[x]">m_storage.m_data.array[0]</Item>

-        <Item Name="[y]">m_storage.m_data.array[1]</Item>

-        <Item Name="[z]">m_storage.m_data.array[2]</Item>

-      </Expand>

-  </Type>

-  

-    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">

-      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>

-      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>

-      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>

-      <Expand>

-        <Item Name="[x]">m_storage.m_data.array[0]</Item>

-        <Item Name="[y]">m_storage.m_data.array[1]</Item>

-        <Item Name="[z]">m_storage.m_data.array[2]</Item>

-        <Item Name="[w]">m_storage.m_data.array[3]</Item>

-      </Expand>

-  </Type>

-

-</AutoVisualizer>

+<?xml version="1.0" encoding="utf-8"?>
+
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+  <!-- Fixed x Fixed Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
+      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : $T3</Size>
+          <ValuePointer>m_storage.m_data.array</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : $T3</Size>
+          <ValuePointer>m_storage.m_data.array</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- 2 x 2 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>
+      <DisplayString>[2, 2] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>
+  
+  <!-- 3 x 3 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>
+      <DisplayString>[3, 3] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>
+  
+  <!-- 4 x 4 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>
+      <DisplayString>[4, 4] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 3]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 3]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
+        </Synthetic>        
+      </Expand>
+  </Type>  
+  
+  <!-- Dynamic x Dynamic Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">      
+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Fixed x Dynamic Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic x Fixed Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic Column Vector -->
+  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
+      <Expand>
+        <Item Name="[size]">m_storage.m_cols</Item>
+        <ArrayItems>
+          <Size>m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Dynamic Row Vector -->
+  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
+      <Expand>
+        <Item Name="[size]">m_storage.m_rows</Item>
+        <ArrayItems>
+          <Size>m_storage.m_rows</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+  
+  <!-- Fixed Vector -->
+  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>
+      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+      </Expand>
+  </Type>
+  
+  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>
+      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+      </Expand>
+  </Type>
+  
+  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>
+      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+        <Item Name="[z]">m_storage.m_data.array[2]</Item>
+      </Expand>
+  </Type>
+  
+    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>
+      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+        <Item Name="[z]">m_storage.m_data.array[2]</Item>
+        <Item Name="[w]">m_storage.m_data.array[3]</Item>
+      </Expand>
+  </Type>
+
+</AutoVisualizer>
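
For reference, the visualizers above can be exercised with a small program run under the Visual Studio debugger. The snippet below is an illustrative sketch, not part of the patch; the comments show the DisplayString each object should produce according to the rules above.

    #include <Eigen/Dense>

    int main() {
      Eigen::Matrix3f fixed;             // Matrix<*,3,3,*,*,*>: "[3, 3] (fixed matrix)"
      fixed << 1, 2, 3,
               4, 5, 6,
               7, 8, 9;
      Eigen::MatrixXd dynamic(2, 4);     // Matrix<*,-1,-1,*,*,*>: "[2, 4] (dynamic matrix)"
      dynamic.setZero();
      Eigen::VectorXd unallocated;       // m_storage.m_data == 0: shown as "empty"
      Eigen::Vector4i xyzw(1, 2, 3, 4);  // Matrix<*,4,1,*,*,*>: expands to [x], [y], [z], [w]
      return 0;                          // break here and inspect the locals window
    }
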
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 35ef580..273c10d 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@
-; ***************************************************************
-; * Eigen Visualizer
-; *
-; * Author: Hauke Heibel <hauke.heibel@gmail.com>
-; *
-; * Support the enhanced debugging of the following Eigen
-; * types (*: any, +:fixed dimension) :
-; *
-; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
-; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
-; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
-; * - Eigen::Matrix<*,-1,-1,*,*,*>
-; * - Eigen::Matrix<*,+,-1,*,*,*>
-; * - Eigen::Matrix<*,-1,+,*,*,*>
-; * - Eigen::Matrix<*,+,+,*,*,*>
-; *
-; * Matrices are displayed properly independently of the memory
-; * alignment (RowMajor vs. ColMajor).
-; *
-; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
-; * that your original autoexp.dat file is copied to a safe 
-; * place before proceeding with its modification.
-; ***************************************************************
-
-[Visualizer]
-
-; Fixed size 4-vectors
-Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1],
-         z : ($c.m_storage.m_data.array)[2],
-         w : ($c.m_storage.m_data.array)[3]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        4,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 4),
-        ")"
-      )
-   )
-}
-
-; Fixed size 3-vectors
-Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1],
-         z : ($c.m_storage.m_data.array)[2]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        3,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 3),
-        ")"
-      )
-   )
-}
-
-; Fixed size 2-vectors
-Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        2,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 2),
-        ")"
-      )
-   )
-}
-
-; Fixed size 1-vectors
-Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        1,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 1),
-        ")"
-      )
-   )
-}
-
-; Dynamic matrices (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,-1,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.m_storage.m_rows,
-         cols: $c.m_storage.m_cols,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.m_storage.m_rows,
-         ",",
-           $c.m_storage.m_cols,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.m_storage.m_rows*$c.m_storage.m_cols
-           ),
-         ")"
-      )
-   )
-}
-
-; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,-1,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.RowsAtCompileTime,
-         cols: $c.m_storage.m_cols,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.RowsAtCompileTime,
-         ",",
-           $c.m_storage.m_cols,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols
-           ),
-         ")"
-      )
-   )
-}
-
-; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,*,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.m_storage.m_rows,
-         cols: $c.ColsAtCompileTime,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.m_storage.m_rows,
-         ",",
-           $c.ColsAtCompileTime,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime
-           ),
-         ")"
-      )
-   )
-}
-
-; Fixed size matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,*,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.RowsAtCompileTime,
-         cols: $c.ColsAtCompileTime,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data.array)[$i],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.RowsAtCompileTime,
-         ",",
-           $c.ColsAtCompileTime,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data.array)[$i],g],
-            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime
-           ),
-         ")"
-      )
-   )
-}
+; ***************************************************************
+; * Eigen Visualizer
+; *
+; * Author: Hauke Heibel <hauke.heibel@gmail.com>
+; *
+; * Support the enhanced debugging of the following Eigen
+; * types (*: any, +:fixed dimension) :
+; *
+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
+; * - Eigen::Matrix<*,-1,-1,*,*,*>
+; * - Eigen::Matrix<*,+,-1,*,*,*>
+; * - Eigen::Matrix<*,-1,+,*,*,*>
+; * - Eigen::Matrix<*,+,+,*,*,*>
+; *
+; * Matrices are displayed properly independently of the memory
+; * alignment (RowMajor vs. ColMajor).
+; *
+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
+; * that your original autoexp.dat file is copied to a safe 
+; * place before proceeding with its modification.
+; ***************************************************************
+
+[Visualizer]
+
+; Fixed size 4-vectors
+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1],
+         z : ($c.m_storage.m_data.array)[2],
+         w : ($c.m_storage.m_data.array)[3]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        4,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 4),
+        ")"
+      )
+   )
+}
+
+; Fixed size 3-vectors
+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1],
+         z : ($c.m_storage.m_data.array)[2]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        3,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 3),
+        ")"
+      )
+   )
+}
+
+; Fixed size 2-vectors
+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        2,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 2),
+        ")"
+      )
+   )
+}
+
+; Fixed size 1-vectors
+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        1,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 1),
+        ")"
+      )
+   )
+}
+
+; Dynamic matrices (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,-1,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.m_storage.m_rows,
+         cols: $c.m_storage.m_cols,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.m_storage.m_rows,
+         ",",
+           $c.m_storage.m_cols,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.m_storage.m_rows*$c.m_storage.m_cols
+           ),
+         ")"
+      )
+   )
+}
+
+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,-1,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.RowsAtCompileTime,
+         cols: $c.m_storage.m_cols,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.RowsAtCompileTime,
+         ",",
+           $c.m_storage.m_cols,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols
+           ),
+         ")"
+      )
+   )
+}
+
+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,*,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.m_storage.m_rows,
+         cols: $c.ColsAtCompileTime,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.m_storage.m_rows,
+         ",",
+           $c.ColsAtCompileTime,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime
+           ),
+         ")"
+      )
+   )
+}
+
+; Fixed size matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,*,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.RowsAtCompileTime,
+         cols: $c.ColsAtCompileTime,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data.array)[$i],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.RowsAtCompileTime,
+         ",",
+           $c.ColsAtCompileTime,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data.array)[$i],g],
+            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime
+           ),
+         ")"
+      )
+   )
+}
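
A note on the #array expressions in the row-major branches above: autoexp enumerates $i with the row index varying fastest (column-major order), so for row-major storage the entry displayed at step $i lives at row*cols + col with row = $i % rows and col = $i / rows. The following standalone sketch (names illustrative, not part of the patch) walks a 2x3 row-major array in exactly that order:

    #include <cstdio>

    int main() {
      const int rows = 2, cols = 3;
      // Row-major storage of the matrix [1 2 3; 4 5 6].
      const int data[rows * cols] = {1, 2, 3, 4, 5, 6};
      for (int i = 0; i < rows * cols; ++i) {
        const int row = i % rows;               // row index varies fastest
        const int col = (i - i % rows) / rows;  // equivalent to i / rows
        std::printf("$i=%d -> (%d,%d) = %d\n", i, row, col, data[row * cols + col]);
      }
      return 0;
    }
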
diff --git a/doc/Pitfalls.dox b/doc/Pitfalls.dox
index 6760ce6..85282bd 100644
--- a/doc/Pitfalls.dox
+++ b/doc/Pitfalls.dox
@@ -124,5 +124,26 @@
   - If the value you are passing is guaranteed to be around for the life of the functor, you can use boost::ref() to wrap the value as you pass it to boost::bind. Generally this is not a solution for values on the stack as if the functor ever gets passed to a lower or independent scope, the object may be gone by the time it's attempted to be used.
   - The other option is to make your functions take a reference counted pointer like boost::shared_ptr as the argument. This avoids needing to worry about managing the lifetime of the object being passed.
 
+
+\section TopicPitfalls_matrix_bool Matrices with boolean coefficients
+
+The current behaviour of using \c Matrix with boolean coefficients is inconsistent and likely to change in future versions of Eigen, so please use it carefully!
+
+A simple example of such an inconsistency is
+
+\code
+template<int Size>
+void foo() {
+  Eigen::Matrix<bool, Size, Size> A, B, C;
+  A.setOnes();
+  B.setOnes();
+
+  C = A * B - A * B;
+  std::cout << C << "\n";
+}
+\endcode
+
+since calling \c foo<3>() prints the zero matrix while calling \c foo<10>() prints the identity matrix.
+
 */
 }
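
One way to sidestep the pitfall documented above, sketched here as a suggestion rather than as part of the patch, is to do the arithmetic on integer coefficients and convert back to bool only at the end, so the result no longer depends on which product path Eigen selects:

    #include <Eigen/Dense>
    #include <iostream>

    template<int Size>
    void foo_workaround() {
      Eigen::Matrix<bool, Size, Size> A, B;
      A.setOnes();
      B.setOnes();
      // Perform the products and the subtraction on int coefficients...
      const Eigen::Matrix<int, Size, Size> Ai = A.template cast<int>();
      const Eigen::Matrix<int, Size, Size> Bi = B.template cast<int>();
      // ...and only convert the final result back to bool (nonzero -> true).
      Eigen::Matrix<bool, Size, Size> C =
          (Ai * Bi - Ai * Bi).template cast<bool>();
      std::cout << C << "\n";  // prints the zero matrix for any Size
    }
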
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index a1529bc..7f7e44f 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -14,6 +14,7 @@
 template<typename Scalar>
 void pow_test() {
   const Scalar zero = Scalar(0);
+  const Scalar eps = std::numeric_limits<Scalar>::epsilon();
   const Scalar one = Scalar(1);
   const Scalar two = Scalar(2);
   const Scalar three = Scalar(3);
@@ -21,20 +22,25 @@
   const Scalar sqrt2 = Scalar(std::sqrt(2));
   const Scalar inf = std::numeric_limits<Scalar>::infinity();
   const Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  const Scalar denorm_min = std::numeric_limits<Scalar>::denorm_min();
   const Scalar min = (std::numeric_limits<Scalar>::min)();
   const Scalar max = (std::numeric_limits<Scalar>::max)();
+  const Scalar max_exp = (static_cast<Scalar>(std::numeric_limits<Scalar>::max_exponent) * Scalar(EIGEN_LN2)) / eps;
+
   const static Scalar abs_vals[] = {zero,
+                                    denorm_min,
+                                    min,
+                                    eps,
                                     sqrt_half,
                                     one,
                                     sqrt2,
                                     two,
                                     three,
-                                    min,
+                                    max_exp,
                                     max,
                                     inf,
                                     nan};
-
-  const int abs_cases = 10;
+  const int abs_cases = 13;
   const int num_cases = 2*abs_cases * 2*abs_cases;
   // Repeat the same value to make sure we hit the vectorized path.
   const int num_repeats = 32;
@@ -64,10 +70,7 @@
   bool all_pass = true;
   for (int i = 0; i < 1; ++i) {
     for (int j = 0; j < num_cases; ++j) {
-      // TODO(rmlarsen): Skip tests that trigger a known bug in pldexp for now.
-      if (std::abs(x(i,j)) == max || std::abs(x(i,j)) == min) continue;
-
-      Scalar e = numext::pow(x(i,j), y(i,j));
+      Scalar e = static_cast<Scalar>(std::pow(x(i,j), y(i,j)));
       Scalar a = actual(i, j);
       bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((numext::isnan)(a) && (numext::isnan)(e));
       all_pass &= !fail;
@@ -79,7 +82,6 @@
   VERIFY(all_pass);
 }
 
-
 template<typename ArrayType> void array(const ArrayType& m)
 {
   typedef typename ArrayType::Scalar Scalar;
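
The packetmath.cpp changes below extend the pfrexp tests to denormals, zero, infinities, and NaN. For orientation, the scalar contract being vectorized is that of std::frexp and std::ldexp: a finite nonzero x decomposes as f * 2^e with f in [0.5, 1), and denormals are the interesting edge case because their biased exponent field is all zeros. A standalone sketch (not part of the patch):

    #include <cmath>
    #include <cstdio>
    #include <limits>

    int main() {
      int e = 0;
      double f = std::frexp(48.0, &e);   // 48 = 0.75 * 2^6, so f == 0.75, e == 6
      std::printf("%g * 2^%d\n", f, e);
      // Smallest positive denormal: 2^-1074 == 0.5 * 2^-1073.
      f = std::frexp(std::numeric_limits<double>::denorm_min(), &e);
      std::printf("%g * 2^%d\n", f, e);  // prints 0.5 * 2^-1073
      // ldexp inverts the decomposition.
      std::printf("%g\n", std::ldexp(0.75, 6));  // prints 48
      return 0;
    }
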
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index c388f3a..94a83c0 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -567,7 +567,36 @@
     data2[i] = Scalar(internal::random<double>(-87, 88));
   }
   CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp);
+  
   CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+  if (PacketTraits::HasExp) {
+    // Check denormals:
+    for (int j=0; j<3; ++j) {
+      data1[0] = Scalar(std::ldexp(1, std::numeric_limits<Scalar>::min_exponent-j));
+      CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+      data1[0] = -data1[0];
+      CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+    }
+    
+    // zero
+    data1[0] = Scalar(0);
+    CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+    
+    // inf and NaN only compare output fraction, not exponent.
+    test::packet_helper<PacketTraits::HasExp,Packet> h;
+    Packet pout;
+    Scalar sout;
+    Scalar special[] = { NumTraits<Scalar>::infinity(), 
+                        -NumTraits<Scalar>::infinity(),
+                         NumTraits<Scalar>::quiet_NaN()};
+    for (int i=0; i<3; ++i) {
+      data1[0] = special[i];
+      ref[0] = Scalar(REF_FREXP(data1[0], ref[PacketSize]));
+      h.store(data2, internal::pfrexp(h.load(data1), h.forward_reference(pout, sout)));
+      VERIFY(test::areApprox(ref, data2, 1) && "internal::pfrexp");
+    }
+  }
+  
   for (int i = 0; i < PacketSize; ++i) {
     data1[i] = Scalar(internal::random<double>(-1, 1));
     data2[i] = Scalar(internal::random<double>(-1, 1));