Update Eigen to commit: 6156797016164b87b3e360e02d0e4107f7f66fbc

CHANGELOG
=========
615679701 - Revert "Add template to specify QR permutation index type, Fix ColPivHouseholderQR Lapacke bindings"
be7791e09 - Add template to specify QR permutation index type, Fix ColPivHouseholderQR Lapacke bindings
9463fc95f - change insert strategy
c54785b07 - Fix error: unused parameter 'tmp' [-Werror,-Wunused-parameter] on clang/32-bit arm
f47472603 - Add missing header for GPU tests.
81172cbdc - Overhaul Sparse Core
925518189 - Modified spbenchsolver help message because it could be misunderstood
d20fe21ae - Improve performance for Power10 MMA bfloat16 GEMM
fe7f52778 - Fix guard macros for emulated FP16 operators on GPU
b8422c99c - Update file jacobisvd.cpp
262194f12 - Fix a bunch of minor build and test issues.

PiperOrigin-RevId: 501682808
Change-Id: If89a17cd8160863cb158c4cd4f0814260bd5857d
diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport
index 33c3370..d83495e 100644
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -28,7 +28,7 @@
   *
   */
 
-#include "Eigen/CholmodSupport"
+#include "CholmodSupport"
 #include "src/SPQRSupport/SuiteSparseQRSupport.h"
 
 #endif
diff --git a/Eigen/SparseCore b/Eigen/SparseCore
index b2db46b..352f1ec 100644
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -17,6 +17,7 @@
 #include <cstdlib>
 #include <cstring>
 #include <algorithm>
+#include <numeric>
 
 /** 
   * \defgroup SparseCore_Module SparseCore module
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index e1c17fc..5e5eead 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -294,7 +294,7 @@
       Packet mask = pcmp_eq(pset1<Packet>(value), p);
       Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
       this->res = value;
-      this->row = Derived::IsRowMajor ? i : i + max_idx;;
+      this->row = Derived::IsRowMajor ? i : i + max_idx;
       this->col = Derived::IsRowMajor ? j + max_idx : j;
     }
   }
@@ -342,7 +342,7 @@
 struct functor_traits<minmax_coeff_visitor<Scalar, is_min, NaNPropagation> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = true
+    PacketAccess = packet_traits<Scalar>::HasCmp
   };
 };
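Note on the PacketAccess change: the vectorized min/max visitor path relies on packet compares (pcmp_eq) plus predux_max, so vectorization is now only advertised when packet_traits<Scalar>::HasCmp holds. A minimal standalone sketch of that kind of traits gate (my own illustration in plain C++, not Eigen code):

    #include <algorithm>
    #include <cstddef>
    #include <iterator>
    #include <type_traits>
    #include <vector>

    // Hypothetical stand-in for packet_traits<Scalar>::HasCmp: only float claims
    // vectorized comparisons in this sketch.
    template <typename T> struct has_packet_cmp : std::false_type {};
    template <> struct has_packet_cmp<float> : std::true_type {};

    // Dispatch between a "packet" path (which would need compare ops) and a plain
    // scalar loop, mirroring how functor_traits<...>::PacketAccess is consumed.
    template <typename T>
    std::size_t argmax(const std::vector<T>& v) {
      if constexpr (has_packet_cmp<T>::value) {
        // A real packet path would use pcmp_eq/predux_max style operations.
        return static_cast<std::size_t>(
            std::distance(v.begin(), std::max_element(v.begin(), v.end())));
      } else {
        std::size_t best = 0;
        for (std::size_t i = 1; i < v.size(); ++i)
          if (v[best] < v[i]) best = i;
        return best;
      }
    }

    int main() {
      std::vector<float> v{1.f, 5.f, 3.f};
      return argmax(v) == 1 ? 0 : 1;
    }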
 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 2429c81..61d9618 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -841,6 +841,345 @@
   }
 };
 
+#ifdef __MMA__
+// General template for lhs packing, bfloat16 specialization.
+template<typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, true>
+{
+  EIGEN_STRONG_INLINE void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<bfloat16>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + 2*vectorSize <= rows; j+=2*vectorSize)
+    {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+      Index i = 0;
+
+      if(PanelMode) ri += 2*vectorSize*offset;
+
+      if(StorageOrder == ColMajor)
+      {
+        for(; i + 2 <= depth; i+=2)
+        {
+          PacketBlock<Packet8bf,4> block;
+
+          block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+          block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
+          block.packet[2] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
+          block.packet[3] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 1);
+
+          Packet8bf t0, t1;
+          t0              = vec_mergeh(block.packet[0].m_val, block.packet[2].m_val);
+          t1              = vec_mergel(block.packet[0].m_val, block.packet[2].m_val);
+          block.packet[2] = vec_mergeh(block.packet[1].m_val, block.packet[3].m_val);
+          block.packet[3] = vec_mergel(block.packet[1].m_val, block.packet[3].m_val);
+          block.packet[0] = t0;
+          block.packet[1] = t1;
+
+          storeBlock<bfloat16, Packet8bf, 4>(blockA + ri, block);
+
+          ri += 2*2*vectorSize;
+        }
+        if (depth & 1)
+        {
+          PacketBlock<Packet8bf,2> block;
+
+          block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+          block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
+
+          storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
+
+          ri += 2*vectorSize;
+        }
+      } else {
+        for(; i + vectorSize <= depth; i+=vectorSize)
+        {
+          PacketBlock<Packet8bf,8> block1, block2;
+
+          bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
+          bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block2, lhs2, 1 * vectorSize, i);
+
+          Packet2ul v1[8], v2[8];
+
+          v1[0] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
+          v1[1] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
+          v1[2] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
+          v1[3] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
+          v1[4] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val)));
+          v1[5] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val)));
+          v1[6] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val)));
+          v1[7] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val)));
+          v2[0] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val)));
+          v2[1] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val)));
+          v2[2] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val)));
+          v2[3] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val)));
+          v2[4] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val)));
+          v2[5] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val)));
+          v2[6] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val)));
+          v2[7] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val)));
+
+          block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(v1[0],v1[2]));
+          block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(v1[0],v1[2]));
+          block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(v1[1],v1[3]));
+          block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(v1[1],v1[3]));
+          block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(v1[4],v1[6]));
+          block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(v1[4],v1[6]));
+          block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(v1[5],v1[7]));
+          block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(v1[5],v1[7]));
+          block2.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(v2[0],v2[2]));
+          block2.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(v2[0],v2[2]));
+          block2.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(v2[1],v2[3]));
+          block2.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(v2[1],v2[3]));
+          block2.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(v2[4],v2[6]));
+          block2.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(v2[4],v2[6]));
+          block2.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(v2[5],v2[7]));
+          block2.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(v2[5],v2[7]));
+
+
+          for(Index M = 0; M < 8; M+=2) {
+            pstore<bfloat16>(blockA + ri + (0 * vectorSize) + (2*vectorSize * M), block1.packet[M+0]);
+            pstore<bfloat16>(blockA + ri + (1 * vectorSize) + (2*vectorSize * M), block1.packet[M+1]);
+            pstore<bfloat16>(blockA + ri + (2 * vectorSize) + (2*vectorSize * M), block2.packet[M+0]);
+            pstore<bfloat16>(blockA + ri + (3 * vectorSize) + (2*vectorSize * M), block2.packet[M+1]);
+          }
+
+          ri += 2*vectorSize*vectorSize;
+        }
+        for(; i + 2 <= depth; i+=2)
+        {
+          for(Index M = 0; M < 2*vectorSize; M++) {
+            blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
+            blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
+          }
+
+          ri += 2*2*vectorSize;
+        }
+        if (depth & 1)
+        {
+          for(Index M = 0; M < 2*vectorSize; M++) {
+            blockA[ri + M] = lhs2(M, i);
+          }
+          ri += 2*vectorSize;
+        }
+      }
+
+      if(PanelMode) ri += 2*vectorSize*(stride - offset - depth);
+    }
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+      Index i = 0;
+
+      if(PanelMode) ri += vectorSize*offset;
+
+      if(StorageOrder == ColMajor)
+      {
+        for(; i + 2 <= depth; i+=2)
+        {
+          PacketBlock<Packet8bf,2> block;
+
+          block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+          block.packet[1] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
+
+          Packet8bf t0;
+          t0              = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+          block.packet[1] = vec_mergel(block.packet[0].m_val, block.packet[1].m_val);
+          block.packet[0] = t0;
+
+          storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
+
+          ri += 2*vectorSize;
+        }
+        if (depth & 1)
+        {
+          Packet8bf lhsV = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+          pstore<bfloat16>(blockA + ri, lhsV);
+
+          ri += vectorSize;
+        }
+      } else {
+        for(; i + vectorSize <= depth; i+=vectorSize)
+        {
+          PacketBlock<Packet8bf,8> block1;
+
+          bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
+
+          Packet2ul v1[8];
+
+          v1[0] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
+          v1[1] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
+          v1[2] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
+          v1[3] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
+          v1[4] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val)));
+          v1[5] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val)));
+          v1[6] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val)));
+          v1[7] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val)));
+
+          block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(v1[0],v1[2]));
+          block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(v1[0],v1[2]));
+          block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(v1[1],v1[3]));
+          block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(v1[1],v1[3]));
+          block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(v1[4],v1[6]));
+          block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(v1[4],v1[6]));
+          block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(v1[5],v1[7]));
+          block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(v1[5],v1[7]));
+
+          for(Index M = 0; M < 8; M++) {
+            pstore<bfloat16>(blockA + ri + (vectorSize * M), block1.packet[M]);
+          }
+
+          ri += vectorSize*vectorSize;
+        }
+        for(; i + 2 <= depth; i+=2)
+        {
+          for(Index M = 0; M < vectorSize; M++) {
+            blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
+            blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
+          }
+
+          ri += 2*vectorSize;
+        }
+        if (depth & 1)
+        {
+          for(Index M = 0; M < vectorSize; M++) {
+            blockA[ri + M] = lhs2(M, i);
+          }
+
+          ri += vectorSize;
+        }
+      }
+
+      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+    }
+
+    if(PanelMode) ri += offset;
+
+    for(; j < rows; j++)
+    {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+      for(Index i = 0; i < depth; i++)
+      {
+        blockA[ri] = lhs2(0, i);
+        ri += 1;
+      }
+
+      if(PanelMode) ri += stride - depth;
+    }
+  }
+};
+
+// General template for rhs packing, bfloat16 specialization.
+template<typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, false>
+{
+  EIGEN_STRONG_INLINE void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<bfloat16>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + 4 <= cols; j+=4)
+    {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
+      Index i = 0;
+
+      if(PanelMode) ri += 4*offset;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        if(StorageOrder == ColMajor)
+        {
+          PacketBlock<Packet8bf,4> block;
+
+          bload<DataMapper, Packet8bf, 4, StorageOrder, false, 4>(block, rhs2, i, 0);
+
+          Packet2ul t0, t1, t2, t3;
+          t0 = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val)));
+          t1 = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val)));
+          t2 = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val)));
+          t3 = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val)));
+          block.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(t0, t1));
+          block.packet[1] = reinterpret_cast<Packet8us>(vec_mergel(t0, t1));
+          block.packet[2] = reinterpret_cast<Packet8us>(vec_mergeh(t2, t3));
+          block.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(t2, t3));
+
+          storeBlock<bfloat16, Packet8bf, 4>(blockB + ri, block);
+        } else {
+          PacketBlock<Packet8bf,8> block;
+
+          for (int M = 0; M < 8; M++) {
+            block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
+          }
+
+          block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+          block.packet[1] = vec_mergeh(block.packet[2].m_val, block.packet[3].m_val);
+          block.packet[2] = vec_mergeh(block.packet[4].m_val, block.packet[5].m_val);
+          block.packet[3] = vec_mergeh(block.packet[6].m_val, block.packet[7].m_val);
+
+          const Index size = 16 / sizeof(bfloat16);
+
+          for (int M = 0; M < 4; M++) {
+            pstore<bfloat16>(blockB + ri + (M * size), block.packet[M]);
+          }
+        }
+
+        ri += 4*vectorSize;
+      }
+      for (; i + 2 <= depth; i += 2) {
+        if(StorageOrder == ColMajor)
+        {
+          blockB[ri+0] = rhs2(i + 0, 0);
+          blockB[ri+1] = rhs2(i + 1, 0);
+          blockB[ri+2] = rhs2(i + 0, 1);
+          blockB[ri+3] = rhs2(i + 1, 1);
+          blockB[ri+4] = rhs2(i + 0, 2);
+          blockB[ri+5] = rhs2(i + 1, 2);
+          blockB[ri+6] = rhs2(i + 0, 3);
+          blockB[ri+7] = rhs2(i + 1, 3);
+        } else {
+          PacketBlock<Packet8bf,2> block;
+
+          for (int M = 0; M < 2; M++) {
+            block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
+          }
+
+          block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+
+          pstore<bfloat16>(blockB + ri, block.packet[0]);
+        }
+
+        ri += 4*2;
+      }
+      if (depth & 1)
+      {
+        blockB[ri+0] = rhs2(i, 0);
+        blockB[ri+1] = rhs2(i, 1);
+        blockB[ri+2] = rhs2(i, 2);
+        blockB[ri+3] = rhs2(i, 3);
+
+        ri += 4;
+      }
+
+      if(PanelMode) ri += 4*(stride - offset - depth);
+    }
+
+    if(PanelMode) ri += offset;
+
+    for(; j < cols; j++)
+    {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
+      for(Index i = 0; i < depth; i++)
+      {
+        blockB[ri] = rhs2(i, 0);
+        ri += 1;
+      }
+
+      if(PanelMode) ri += stride - depth;
+    }
+  }
+};
+#endif
+
 // General template for lhs complex packing, float64 specialization.
 template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
 struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
@@ -2322,6 +2661,64 @@
 }
 #endif
 
+#ifdef __MMA__
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+#endif
+
 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
 struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
 {
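The gemm_pack_lhs/gemm_pack_rhs specializations above are thin wrappers that forward to the new bfloat16 dhs_pack packers. As a rough usage sketch (my assumption about how this path is reached on a Power10 build with __MMA__ defined, not part of the patch), an ordinary bfloat16 product should exercise them:

    #include <Eigen/Dense>

    int main() {
      using BfMat = Eigen::Matrix<Eigen::bfloat16, Eigen::Dynamic, Eigen::Dynamic>;
      BfMat a = Eigen::MatrixXf::Random(64, 64).cast<Eigen::bfloat16>();
      BfMat b = Eigen::MatrixXf::Random(64, 64).cast<Eigen::bfloat16>();
      // Packing of a and b is expected to go through the bfloat16 dhs_pack
      // specializations, with the product handled by the MMA bfloat16 kernel.
      BfMat c = a * b;
      return c.rows() == 64 ? 0 : 1;
    }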
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
index b3e063d..01465d3 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -1,117 +1,64 @@
- #ifndef EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
- #define EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+#ifndef EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+
+#if EIGEN_COMP_LLVM
+#define BFLOAT16_UNROLL _Pragma("unroll 8")
+#else
+#define BFLOAT16_UNROLL _Pragma("GCC unroll(8)")
+#endif
 
 namespace Eigen {
 
 namespace internal {
 
-EIGEN_STRONG_INLINE void pgerMMAbfloat16(__vector_quad* acc, const Packet8bf& a, const Packet8bf& b, int maskX, int maskY)
-{
-  switch(maskX){
-    case 15:
-      switch(maskY){
-        case 0b1111:          
-          __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
-          break;
-        case 0b0011:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b11, 0b11);
-          break;
-        case 0b0001:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b1, 0b11);
-          break;
-        case 0b0111:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b111, 0b11);
-          break;
-      }
-      break;
-    case 3:
-      switch(maskY){
-        case 0b1111:
-          __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
-          break;
-        case 0b0011:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b11, 0b11);
-          break;
-        case 0b0001:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b1, 0b11);
-          break;
-        case 0b0111:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b111, 0b11);
-          break;
-      }
-      break;
-    case 1:
-      switch(maskY){
-        case 0b1111:
-          __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
-          break;
-        case 0b0011:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b11, 0b11);
-          break;
-        case 0b0001:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b1, 0b11);
-          break;
-        case 0b0111:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b111, 0b11);
-          break;
-      }
-      break;
-    case 0b0111:
-      switch(maskY){
-        case 0b1111:
-          __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
-          break;
-        case 0b0011:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b11, 0b11);
-          break;
-        case 0b0001:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b1, 0b11);
-          break;
-        case 0b0111:
-          __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b111, 0b11);
-          break;
-      }
-      break;
-  }
-}
-
-EIGEN_STRONG_INLINE void scaleAndStore(float* result, float* acc, Packet4f pAlpha)
+EIGEN_ALWAYS_INLINE void scaleAndStore(float* result, Packet4f& acc, const Packet4f& pAlpha)
 {
   Packet4f result_block = ploadu<Packet4f>(result);
-  Packet4f packet_pmadd = pmadd(pload<Packet4f>(acc), pAlpha, result_block);
-  pstoreu(result, packet_pmadd);
+  result_block = pmadd(acc, pAlpha, result_block);
+  pstoreu(result, result_block);
 }
 
-template<int num_packets, bool zero>
-EIGEN_STRONG_INLINE Packet8bf loadLhsBfloat16(const bfloat16* indexA)
+template<Index num_packets, bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16(const bfloat16* indexA)
 {
   Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
-  Packet8bf lhs2;
-  const int packet_size = 8; //We fit 8 bfloat16 on a 128 register
   if(zero){
-    lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+    Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+    return vec_mergeh(lhs1.m_val, lhs2.m_val);
+  } else {
+    return lhs1;
   }
-  else lhs2 = ploadu<Packet8bf>(indexA + num_packets*packet_size);
-  return vec_mergeh(lhs1.m_val, lhs2.m_val);
 }
 
 template<bool zero>
-EIGEN_STRONG_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, int extra_rows)
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16Extra(const bfloat16* indexA, Index strideA, Index extra_rows)
 {
-  EIGEN_ALIGN16 bfloat16 lhs_array[8] = {Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0)};
-  int count = 0;
-  const bfloat16* idxA = indexA + row*strideA;
-  for(int row_count = 0; row_count < extra_rows; row_count++){
-    lhs_array[count++] = *idxA;
-    if(!zero) lhs_array[count] = *(idxA+1);
-    count++;
-    idxA += strideA;
+  Index row_count = 0;
+  if (zero) {
+    EIGEN_ALIGN16 bfloat16 lhs_array[8] = { Eigen::bfloat16(0) };
+    do{
+      lhs_array[row_count] = *indexA;
+      indexA += strideA;
+    } while ((row_count += 2) < extra_rows*2);
+    return pload_partial<Packet8bf>(lhs_array, extra_rows*2);
+  } else {
+    EIGEN_ALIGN16 int lhs_array[4];
+    do{
+      lhs_array[row_count] = *reinterpret_cast<const int *>(indexA);
+      indexA += strideA;
+    } while ((row_count += 1) < extra_rows);
+    return reinterpret_cast<Packet8us>(pload_partial<Packet4i>(lhs_array, extra_rows));
   }
-  return pload<Packet8bf>(lhs_array);
 }
 
 template<bool zero>
-EIGEN_STRONG_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, int i, int k)
+EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, Index extra_rows)
+{
+  return loadBfloat16Extra<zero>(indexA + row*strideA, strideA, extra_rows);
+}
+
+template<bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, Index i, Index k)
 {
   const bfloat16* indexB = baseB + strideB*4*i + (k*4);
   Packet8bf rhs1 = ploadu<Packet8bf>(indexB);
@@ -119,28 +66,16 @@
     Packet8bf rhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
     return vec_mergeh(rhs1.m_val, rhs2.m_val);
   }
-  //r = vec_perm (a, b, c)
-  //Let v be the concatenation of a and b.
-  //Each byte of r selected by using the least-significant 5 bits of the corresponding byte of c as an index into v
-  //We need this elements from rhs: 0, 4, 1, 5, 2, 6, 3, 7
-  Packet16uc c = {0x0u, 0x1u, 0x8u, 0x9u, 0x2u, 0x3u, 0xAu, 0xB, 0x4, 0x5, 0xCu, 0xDu, 0x6u, 0x7u, 0xEu, 0xFu};
-  return vec_perm(rhs1.m_val, rhs1.m_val, c);
+  return rhs1;
 }
 
 template<bool zero>
-EIGEN_STRONG_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, int i, int k, int extra_cols)
+EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, Index i, Index k, Index extra_cols)
 {
-  EIGEN_ALIGN16 bfloat16 rhs_vector[8] = {Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0)};
-  const bfloat16* indexB = blockB + ((col+4*i)*strideB)+k+offsetB;
-  for(int c = 0; c < extra_cols; c++){
-    rhs_vector[2*c] = *indexB;
-    if(!zero) rhs_vector[2*c+1] = *(indexB+1);
-    indexB += strideB;
-  }
-  return pload<Packet8bf>(rhs_vector);
+  return loadBfloat16Extra<zero>(blockB + ((col+4*i)*strideB)+k+offsetB, strideB, extra_cols);
 }
 
-template<int num_acc, int num_packets, bool zero, bool rhs_extra_cols, bool lhs_extra_rows>
+template<Index num_acc, Index num_packets, bool zero, bool rhs_extra_cols, bool lhs_extra_rows>
 EIGEN_STRONG_INLINE void KLoop
 (
   const bfloat16* indexA,
@@ -152,107 +87,95 @@
   Index k,
   Index row,
   Index col,
-  int extra_rows,
-  int extra_cols,
-  int mask_rows = 0xF,
-  int mask_cols = 0xF
+  Index extra_rows,
+  Index extra_cols
 )
 {
   Packet8bf lhs;
   Packet8bf rhs[num_acc];
   if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows<zero>(indexA+k, strideA, row, extra_rows);
-  else lhs = loadLhsBfloat16<num_packets, zero>(indexA + k*num_packets*8); //a packet of bfloat16 has 8 elements
-  for(int i = 0; i < num_acc; i++){
+  else lhs = loadLhsBfloat16<num_packets, zero>(indexA + k*num_packets); //a packet of bfloat16 has 8 elements
+  BFLOAT16_UNROLL
+  for(Index i = 0; i < num_acc; i++){
     if(!rhs_extra_cols)
       rhs[i] = loadRhsBfloat16<zero>(indexB, strideB, i, k);
     else{
       rhs[i] = loadRhsBfloat16ExtraCols<zero>(indexB, strideB, offsetB, col, i, k, extra_cols);
     }
-    pgerMMAbfloat16(&(quad_acc[i]), rhs[i], lhs, mask_cols, mask_rows);
+    __builtin_mma_xvbf16ger2pp(&(quad_acc[i]), reinterpret_cast<Packet16uc>(rhs[i].m_val), reinterpret_cast<Packet16uc>(lhs.m_val));
   }
 }
 
-template<const int num_acc, const int standard_block_size, const int num_packets, bool rhsExtraCols = false, bool lhsExtraRows = false>
-void colLoopBody(Index* p_col, Index row, Index depth, Index cols, Index rows, int offset_row, int block_index, Packet4f pAlpha, const bfloat16* indexA, Index strideA, const bfloat16* blockB, Index strideB, Index offsetB, float* result, int extra_cols = 0, int extra_rows = 0, int mask_cols = 0xF, int mask_rows = 0xF)
+template<const Index num_acc, const Index num_packets, bool rhsExtraCols = false, bool lhsExtraRows = false>
+void colLoopBody(Index& col, Index row, Index depth, Index cols, Index rows, Index offset_row, Index block_index, const Packet4f& pAlpha, const bfloat16* indexA, Index strideA, const bfloat16* blockB, Index strideB, Index offsetB, float* result, Index extra_cols = 0, Index extra_rows = 0)
 {
-  int col = *p_col;
-  int count;
-  int max, step, bound;
-  const bfloat16* indexB;
+  const Index step = rhsExtraCols ? 1 : (num_acc * 4); //each accumulator has 4 elements
+  const bfloat16* indexB = rhsExtraCols ? blockB : (blockB + 4*offsetB + strideB*col);
 
-  if(num_acc == 1) bound = 0;
-  else bound = 1;
-
-  if(rhsExtraCols){
-    count = 0;
-    max = 1;
-    step = 1;
-    indexB = blockB;
-  }
-  else{
-    count = col;
-    step = num_acc * 4; //each accumulator has 4 elements
-    max = cols/step;
-    indexB = blockB + 4*offsetB + strideB*col;
-  }
-
-  while(count/step + bound < max){
+  while(col + step <= cols){
     Index k = 0;
-    EIGEN_ALIGN32 float acc[num_acc][4][4];
+    Packet4f acc[num_acc][4];
     __vector_quad quad_acc[num_acc];
  
-    for(int i = 0; i < num_acc; i++)
+    BFLOAT16_UNROLL
+    for(Index i = 0; i < num_acc; i++)
       __builtin_mma_xxsetaccz(&(quad_acc[i]));
 
-    if(depth%2 != 0){
-      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols, mask_rows, mask_cols);
-      k = 1;
+    for(; k + 2 <= depth; k += 2){
+      KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
     }
-    for(; k/2 < depth/2; k += 2){ 
-      KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols, mask_rows, mask_cols);
+    if(depth&1){
+      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA-offset_row, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
     }
-    for(int i = 0; i < num_acc; i++){
+
+    BFLOAT16_UNROLL
+    for(Index i = 0; i < num_acc; i++)
       __builtin_mma_disassemble_acc((void*)acc[i], &(quad_acc[i]));
+
+    for(Index i = 0; i < num_acc; i++){
       if(lhsExtraRows){
-        for(int x = 0; x < extra_cols; x++){
-          for(int y = 0; y < extra_rows; y++){
-            result[((col+i*4)+x)*rows + row + y] += acc[i][x][y]*(pAlpha[0]);
-          }
+        float *r = result + (col+i*4)*rows + row;
+        for(Index x = 0; x < extra_cols; x++, r += rows){
+          Packet4f result_block = ploadu_partial<Packet4f>(r, extra_rows);
+          result_block = pmadd(acc[i][x], pAlpha, result_block);
+          pstoreu_partial<float>(r, result_block, extra_rows);
         }
       }
       else{
         if(rhsExtraCols){
-          for(int x = 0; x < cols-col; x++){
-            scaleAndStore(result + ((col+i*4)+x)*rows + row + offset_row,acc[i][x], pAlpha);
+          float *r = result + (col+i*4)*rows + row + offset_row;
+          for(Index x = 0; x < cols-col; x++, r += rows){
+            scaleAndStore(r,acc[i][x], pAlpha);
           }
         }
         else{
-          for(int x = 0; x < 4; x++){
-            scaleAndStore(result + ((col+i*4)+x)*rows + (block_index*16) + offset_row,acc[i][x], pAlpha);
+          float *r = result + (col+i*4)*rows + (block_index*16) + offset_row;
+          for(Index x = 0; x < 4; x++, r += rows){
+            scaleAndStore(r,acc[i][x], pAlpha);
           }
         }
       }
     }
-    count += step;
-    if(!rhsExtraCols) {
-      indexB += strideB*step;
-      col += step;
-    }
+    if(rhsExtraCols) return;
+    indexB += strideB*step;
+    col += step;
   }
-  *p_col = col;
 }
 
 template<typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
 void gemmMMAbfloat16(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
-
   if(rows == 0 || cols == 0 || depth == 0) return;
-  const Packet4f pAlpha = pset1<Packet4f>(Eigen::bfloat16_impl::bfloat16_to_float(alpha));
+  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
+  if (falpha == float(0)) return;
+  const Packet4f pAlpha = pset1<Packet4f>(falpha);
   ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
 
-  for(int j = 0; j < cols; j++){
-    for(int i = 0; i < rows; i++){
-      result[j*rows + i] = res(i,j);
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  for(Index j = 0; j < cols; j++){
+    const LinearMapper res2 = res.getLinearMapper(0, j);
+    for(Index i = 0; i < rows; i++){
+      result[j*rows + i] = res2(i);
     }
   }
 
@@ -268,26 +191,27 @@
   //Blocks of 8 columns with <8 elements as row major. This happens when there's less than 8 remaining rows
 
   //Loop for LHS standard block (8x16)
-  int standard_block_size = 16;
-  const int standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks
-  int bigSuffix = (2*8) * (strideA-offsetA-depth);
+  const Index standard_block_size = 16;
+  const Index standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks
+  Index bigSuffix = (2*8) * (strideA-offsetA-depth);
   const bfloat16* indexA = blockA;
-  int block_index;
+  const Index offset_factor = 2;
+  Index block_index;
   for(block_index = 0; block_index < standard_blocks_quantity; block_index++){
     indexA += 2*8*offsetA;
-    for(int offset_row = 0; offset_row < standard_block_size; offset_row += 4){ //This block size has 16 rows maximum
+    for(Index offset_row = 0; offset_row < standard_block_size; offset_row += 4){ //This block size has 16 rows maximum
       col = 0;
-      colLoopBody<5, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<4, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<3, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<2, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<1, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<7, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<6, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<5, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<4, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<3, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<2, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<1, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
       if(cols > col){
-        int extra_cols= cols-col;
-        int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
-        int mask_cols= 0xF >> shift;
+        Index extra_cols= cols-col;
         //Remember: It doesnt make sense use multiple acc to extra_cols as we are unrolling col loop
-        colLoopBody<1, 16, 2, true>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result, extra_cols, 4, mask_cols, 0xF);
+        colLoopBody<1, 16, true>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
       }
     }
     row += 16;
@@ -296,75 +220,66 @@
   //LHS (8x8) block
   if(rows - standard_blocks_quantity*16 >= 8){
     indexA += 1*8*offsetA + 2*8*offsetA;
-    for(int offset_row = 0; offset_row < 8; offset_row += 4){
+    for(Index offset_row = 0; offset_row < 8; offset_row += 4){
       col = 0;
-      colLoopBody<5, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<4, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<3, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<2, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
-      colLoopBody<1, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<7, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<6, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<5, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<4, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<3, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<2, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+      colLoopBody<1, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
     }
     if(cols > col){
-      int extra_cols= cols-col;
-      int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
-      int mask_cols= 0xF >> shift;
+      Index extra_cols= cols-col;
 
-      for(int offset_row = 0; offset_row < 8; offset_row += 4){
-        colLoopBody<1, 8, 1, true>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result, extra_cols, 4, mask_cols, 0xF);
+      for(Index offset_row = 0; offset_row < 8; offset_row += 4){
+        colLoopBody<1, 8, true>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
       }
     } //end extra cols
     row += 8;
   }
   //extra rows
   while(row < rows){
-    int extra_rows = rows-row;
-    int shift = (4-extra_rows >= 0) ? 4-extra_rows : 0;
-    int mask_rows = 0xF >> shift;
-    int extra_rows_or_four = (extra_rows <= 4) ? extra_rows : 4;
+    Index extra_rows = rows-row;
+    Index extra_rows_or_four = (extra_rows <= 4) ? extra_rows : 4;
 
     //This index is the beginning of remaining block. 
     //This last block for LHS is organized as RowMajor
     col = 0;
-    colLoopBody<5, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
-    colLoopBody<4, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
-    colLoopBody<3, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
-    colLoopBody<2, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
-    colLoopBody<1, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
+    colLoopBody<7, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+    colLoopBody<6, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+    colLoopBody<5, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+    colLoopBody<4, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+    colLoopBody<3, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+    colLoopBody<2, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+    colLoopBody<1, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
     if(cols > col){
-      int extra_cols= cols-col;
-      int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
-      int mask_cols= 0xF >> shift;
-      int extra_cols_or_four = (extra_cols <= 4) ? extra_cols : 4;
+      Index extra_cols= cols-col;
 
-      colLoopBody<1, 8, 1, true, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, extra_cols_or_four, extra_rows_or_four, mask_cols, mask_rows);
+      colLoopBody<1, 8, true, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, extra_cols, extra_rows_or_four);
     }
     row += extra_rows_or_four;
   }
 
   //Convert back to bfloat16
-  for(col = 0; col/4 < cols/4; col += 4){
-    int row;
-    for(row = 0; row/8 < rows/8; row += 8){
+  for(col = 0; col + 4 <= cols; col += 4){
+    const DataMapper res2 = res.getSubMapper(0, col);
+    for(row = 0; row + 8 <= rows; row += 8){
       //get and save block
       PacketBlock<Packet8bf,4> block;
-      for(int j = 0; j < 4; j++){
-        Packet4f temp_even, temp_odd;
-        EIGEN_ALIGN32 float even[4], odd[4];
-        for(int i = 0; i < 4; i++){
-          even[i] = result[(col + j)*rows + row + i*2];
-          odd[i] = result[(col + j)*rows + row + i*2+1];
-        }
-        temp_even = pload<Packet4f>(even);
-        temp_odd = pload<Packet4f>(odd);
-        block.packet[j] = F32ToBf16(temp_even, temp_odd);
+      for(Index j = 0; j < 4; j++){
+        Packet16uc fp16_0 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(pload<Packet4f>(result + (col + j)*rows + row)));
+        Packet16uc fp16_1 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(pload<Packet4f>(result + (col + j)*rows + row + 4)));
+        block.packet[j].m_val = vec_pack(reinterpret_cast<Packet4ui>(fp16_0), reinterpret_cast<Packet4ui>(fp16_1));
       }
 
-      res.template storePacketBlock<Packet8bf,4>(row, col, block);
+      res2.template storePacketBlock<Packet8bf,4>(row, 0, block);
     }
     //extra rows
     while(row < rows){
-      for(int col_off = 0; col_off < 4; col_off++){
-        res(row, col+col_off) = Eigen::bfloat16(result[(col+col_off)*rows+row]);
+      for(Index col_off = 0; col_off < 4; col_off++){
+        res2(row, col_off) = Eigen::bfloat16(result[(col+col_off)*rows+row]);
       }
       row++;
     }
@@ -372,8 +287,9 @@
   }
   //extra cols
   while(col < cols){
-    for(int r = 0; r < rows; r++){
-      res(r, col) = Eigen::bfloat16(result[col*rows + r]);
+    const LinearMapper res2 = res.getLinearMapper(0, col);
+    for(Index r= 0; r< rows; r++){
+      res2(r) = Eigen::bfloat16(result[col*rows + r]);
     }
     col++;
   }
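The rewritten kernel drops the pgerMMAbfloat16 mask dispatch and always issues __builtin_mma_xvbf16ger2pp, handling ragged edges with zero-padded partial loads and partial stores instead. A minimal sketch of the accumulator lifecycle it relies on (assuming the GCC/Clang Power10 MMA builtins; compile with e.g. -mcpu=power10):

    #include <altivec.h>

    typedef __vector unsigned char vec_uc;

    // Accumulate depth_pairs rank-2 bfloat16 updates into a 4x4 float accumulator,
    // then spill it to memory, mirroring KLoop plus __builtin_mma_disassemble_acc.
    void bf16_mma_tile(const vec_uc* a, const vec_uc* b, int depth_pairs,
                       float out[4][4]) {
      __vector_quad acc;
      __builtin_mma_xxsetaccz(&acc);                   // zero the accumulator
      for (int k = 0; k < depth_pairs; ++k)
        __builtin_mma_xvbf16ger2pp(&acc, a[k], b[k]);  // acc += rank-2 update of a[k], b[k]
      __builtin_mma_disassemble_acc(out, &acc);        // four 4-float rows
    }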
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 75d6228..6ff3e20 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -445,7 +445,7 @@
 // of the functions, while the latter can only deal with one of them.
 #elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
 
-#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
@@ -510,7 +510,7 @@
   return float(a) >= float(b);
 }
 
-#if defined(__clang__) && defined(__CUDA__)
+#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 #pragma pop_macro("EIGEN_DEVICE_FUNC")
 #endif
 #endif  // Emulate support for half floats
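With the guard keyed on EIGEN_GPUCC, the emulated host-side FP16 operators are protected whenever clang acts as the GPU compiler, whether for CUDA or HIP. For reference, a generic standalone illustration of the push_macro/pop_macro pattern being guarded (my own example, not Eigen's actual definitions):

    #include <cstdio>

    #define ANNOTATE inline
    #pragma push_macro("ANNOTATE")
    #undef ANNOTATE
    #define ANNOTATE /* annotation temporarily dropped inside this region */
    ANNOTATE int plain_add(int a, int b) { return a + b; }
    #pragma pop_macro("ANNOTATE")
    ANNOTATE int annotated_add(int a, int b) { return a + b; }  // ANNOTATE is `inline` again

    int main() { std::printf("%d\n", plain_add(1, 2) + annotated_add(3, 4)); return 0; }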
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
index 5dc5ef8..c892d39 100644
--- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -26,7 +26,7 @@
 
   template <typename LaneIdType>
   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
-                                Packet4f& c, Packet4f& tmp,
+                                Packet4f& c, Packet4f&,
                                 const LaneIdType&) const {
     acc(a, b, c);
   }
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 3f7638e..a2486af 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -380,11 +380,6 @@
     EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
       spbh.store(sup, i,j,block);
       sup->template storePacket<SubPacket>(i, j+idx, block.packet[idx]);
-      //for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
-      //{
-      //  Scalar_ *v = &sup->operator()(i+l, j+idx);
-      //  *v = *reinterpret_cast<Scalar_ *>(&block.packet[idx][l]);
-      //}
     }
   };
 
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index 733b1aa..d89d99e 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -132,15 +132,7 @@
     /** \returns the largest \c k in [start,end) such that for all \c j in [start,k) index[\c j]\<\a key */
     inline Index searchLowerIndex(Index start, Index end, Index key) const
     {
-      while(end>start)
-      {
-        Index mid = (end+start)>>1;
-        if (m_indices[mid]<key)
-          start = mid+1;
-        else
-          end = mid;
-      }
-      return start;
+      return static_cast<Index>(std::distance(m_indices, std::lower_bound(m_indices + start, m_indices + end, key)));
     }
 
     /** \returns the stored value at index \a key
@@ -198,19 +190,12 @@
       return m_values[id];
     }
 
-    void moveChunk(Index from, Index to, Index chunkSize)
+    inline void moveChunk(Index from, Index to, Index chunkSize)
     {
-      eigen_internal_assert(to+chunkSize <= m_size);
-      if(to>from && from+chunkSize>to)
-      {
-        // move backward
-        internal::smart_memmove(m_values+from,  m_values+from+chunkSize,  m_values+to);
-        internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to);
-      }
-      else
-      {
-        internal::smart_copy(m_values+from,  m_values+from+chunkSize,  m_values+to);
-        internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+      eigen_internal_assert(chunkSize >= 0 && to+chunkSize <= m_size);
+      if (chunkSize > 0) {
+        internal::smart_memmove(m_values + from, m_values + from + chunkSize, m_values + to);
+        internal::smart_memmove(m_indices + from, m_indices + from + chunkSize, m_indices + to);
       }
     }
 
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 6806812..eaefcef 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -92,6 +92,31 @@
   };
 };
 
+template <typename StorageIndex>
+struct sparse_reserve_op {
+  EIGEN_DEVICE_FUNC sparse_reserve_op(Index begin, Index end, Index size) {
+    Index range = numext::mini(end - begin, size);
+    m_begin = begin;
+    m_end = begin + range;
+    m_val = StorageIndex(size / range);
+    m_remainder = StorageIndex(size % range);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex operator()(IndexType i) const {
+    if ((i >= m_begin) && (i < m_end))
+      return m_val + ((i - m_begin) < m_remainder ? 1 : 0);
+    else
+      return 0;
+  }
+  StorageIndex m_val, m_remainder;
+  Index m_begin, m_end;
+};
+
+template <typename Scalar>
+struct functor_traits<sparse_reserve_op<Scalar>> {
+  enum { Cost = 1, PacketAccess = false, IsRepeatable = true };
+};
+
 } // end namespace internal
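sparse_reserve_op spreads size reserved slots as evenly as possible over the outers in [begin, end): each gets size/range, and the first size%range of them get one extra. A small standalone sketch of the values operator() yields (inputs are hypothetical, chosen for illustration):

    #include <cstdio>

    int main() {
      const long begin = 2, end = 7, size = 12;  // hypothetical inputs
      const long range = (end - begin < size) ? (end - begin) : size;
      const long val = size / range, remainder = size % range;
      for (long i = 0; i < 9; ++i) {
        const long r = (i >= begin && i < begin + range)
                           ? val + ((i - begin) < remainder ? 1 : 0)
                           : 0;
        std::printf("outer %ld -> reserve %ld\n", i, r);  // 0 0 3 3 2 2 2 0 0
      }
      return 0;
    }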
 
 template<typename Scalar_, int Options_, typename StorageIndex_>
@@ -196,7 +221,7 @@
       const Index outer = IsRowMajor ? row : col;
       const Index inner = IsRowMajor ? col : row;
       Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
-      return m_data.atInRange(m_outerIndex[outer], end, StorageIndex(inner));
+      return m_data.atInRange(m_outerIndex[outer], end, inner);
     }
 
     /** \returns a non-const reference to the value of the matrix at position \a i, \a j
@@ -210,20 +235,17 @@
     inline Scalar& coeffRef(Index row, Index col)
     {
       eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-      
       const Index outer = IsRowMajor ? row : col;
       const Index inner = IsRowMajor ? col : row;
-
       Index start = m_outerIndex[outer];
-      Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
-      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
-      if(end<=start)
-        return insert(row,col);
-      const Index p = m_data.searchLowerIndex(start,end-1,StorageIndex(inner));
-      if((p<end) && (m_data.index(p)==inner))
-        return m_data.value(p);
+      Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer + 1];
+      eigen_assert(end >= start && "you probably called coeffRef on a non finalized matrix");
+      if (end <= start) return insertAtByOuterInner(outer, inner, start);
+      Index dst = m_data.searchLowerIndex(start, end, inner);
+      if ((dst < end) && (m_data.index(dst) == inner))
+        return m_data.value(dst);
       else
-        return insert(row,col);
+        return insertAtByOuterInner(outer, inner, dst);
     }
 
     /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
@@ -241,7 +263,7 @@
       * if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
       *
       */
-    Scalar& insert(Index row, Index col);
+    inline Scalar& insert(Index row, Index col);
 
   public:
 
@@ -301,6 +323,11 @@
       if(isCompressed())
       {
         Index totalReserveSize = 0;
+        for (Index j = 0; j < m_outerSize; ++j) totalReserveSize += reserveSizes[j];
+
+        // if reserveSizes is empty, don't do anything!
+        if (totalReserveSize == 0) return;
+
         // turn the matrix into non-compressed mode
         m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
         
@@ -312,18 +339,18 @@
         {
           newOuterIndex[j] = count;
           count += reserveSizes[j] + (m_outerIndex[j+1]-m_outerIndex[j]);
-          totalReserveSize += reserveSizes[j];
         }
+
         m_data.reserve(totalReserveSize);
         StorageIndex previousOuterIndex = m_outerIndex[m_outerSize];
         for(Index j=m_outerSize-1; j>=0; --j)
         {
           StorageIndex innerNNZ = previousOuterIndex - m_outerIndex[j];
-          for(Index i=innerNNZ-1; i>=0; --i)
-          {
-            m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
-            m_data.value(newOuterIndex[j]+i) = m_data.value(m_outerIndex[j]+i);
-          }
+          StorageIndex begin = m_outerIndex[j];
+          StorageIndex end = begin + innerNNZ;
+          StorageIndex target = newOuterIndex[j];
+          internal::smart_memmove(innerIndexPtr() + begin, innerIndexPtr() + end, innerIndexPtr() + target);
+          internal::smart_memmove(valuePtr() + begin, valuePtr() + end, valuePtr() + target);
           previousOuterIndex = m_outerIndex[j];
           m_outerIndex[j] = newOuterIndex[j];
           m_innerNonZeros[j] = innerNNZ;
@@ -350,15 +377,15 @@
         m_data.resize(count);
         for(Index j=m_outerSize-1; j>=0; --j)
         {
-          Index offset = newOuterIndex[j] - m_outerIndex[j];
+          StorageIndex offset = newOuterIndex[j] - m_outerIndex[j];
           if(offset>0)
           {
             StorageIndex innerNNZ = m_innerNonZeros[j];
-            for(Index i=innerNNZ-1; i>=0; --i)
-            {
-              m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
-              m_data.value(newOuterIndex[j]+i) = m_data.value(m_outerIndex[j]+i);
-            }
+            StorageIndex begin = m_outerIndex[j];
+            StorageIndex end = begin + innerNNZ;
+            StorageIndex target = newOuterIndex[j];
+            internal::smart_memmove(innerIndexPtr() + begin, innerIndexPtr() + end, innerIndexPtr() + target);
+            internal::smart_memmove(valuePtr() + begin, valuePtr() + end, valuePtr() + target);
           }
         }
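A minimal usage sketch of the per-inner-vector reserve path touched above (sizes are illustrative); reserving switches the matrix to uncompressed mode, so a final makeCompressed() packs the storage again:

    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<double> A(1000, 1000);
      // reserve room for 3 nonzeros per column (per inner vector)
      A.reserve(Eigen::VectorXi::Constant(A.cols(), 3));
      for (Eigen::Index j = 0; j < A.cols(); ++j)
        A.insert(j, j) = 1.0;
      A.makeCompressed();
      return 0;
    }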
         
@@ -392,7 +419,7 @@
     {
       eigen_assert(Index(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
       eigen_assert( (m_outerIndex[outer+1]-m_outerIndex[outer]==0 || m_data.index(m_data.size()-1)<inner) && "Invalid ordered insertion (invalid inner index)");
-      Index p = m_outerIndex[outer+1];
+      StorageIndex p = m_outerIndex[outer+1];
       ++m_outerIndex[outer+1];
       m_data.append(Scalar(0), inner);
       return m_data.value(p);
@@ -402,7 +429,7 @@
       * \warning use it only if you know what you are doing */
     inline Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner)
     {
-      Index p = m_outerIndex[outer+1];
+      StorageIndex p = m_outerIndex[outer+1];
       ++m_outerIndex[outer+1];
       m_data.append(Scalar(0), inner);
       return m_data.value(p);
@@ -446,10 +473,14 @@
     template<typename InputIterators,typename DupFunctor>
     void setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
 
-    void sumupDuplicates() { collapseDuplicates(internal::scalar_sum_op<Scalar,Scalar>()); }
+    template<typename Derived, typename DupFunctor>
+    void collapseDuplicates(DenseBase<Derived>& wi, DupFunctor dup_func = DupFunctor());
 
-    template<typename DupFunctor>
-    void collapseDuplicates(DupFunctor dup_func = DupFunctor());
+    template<typename InputIterators>
+    void setFromSortedTriplets(const InputIterators& begin, const InputIterators& end);
+
+    template<typename InputIterators, typename DupFunctor>
+    void setFromSortedTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
 
     //---
     
@@ -464,27 +495,23 @@
       */
     void makeCompressed()
     {
-      if(isCompressed())
-        return;
+      if (isCompressed()) return;
       
       eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0);
       
-      Index oldStart = m_outerIndex[1];
+      StorageIndex start = m_outerIndex[1];
       m_outerIndex[1] = m_innerNonZeros[0];
       for(Index j=1; j<m_outerSize; ++j)
       {
-        Index nextOldStart = m_outerIndex[j+1];
-        Index offset = oldStart - m_outerIndex[j];
-        if(offset>0)
+        StorageIndex end = start + m_innerNonZeros[j];
+        StorageIndex target = m_outerIndex[j];
+        if (start != target)
         {
-          for(Index k=0; k<m_innerNonZeros[j]; ++k)
-          {
-            m_data.index(m_outerIndex[j]+k) = m_data.index(oldStart+k);
-            m_data.value(m_outerIndex[j]+k) = m_data.value(oldStart+k);
-          }
+          internal::smart_memmove(innerIndexPtr() + start, innerIndexPtr() + end, innerIndexPtr() + target);
+          internal::smart_memmove(valuePtr() + start, valuePtr() + end, valuePtr() + target);
         }
-        m_outerIndex[j+1] = m_outerIndex[j] + m_innerNonZeros[j];
-        oldStart = nextOldStart;
+        start = m_outerIndex[j + 1];
+        m_outerIndex[j + 1] = m_outerIndex[j] + m_innerNonZeros[j];
       }
       internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
       m_innerNonZeros = 0;
@@ -495,13 +522,12 @@
     /** Turns the matrix into the uncompressed mode */
     void uncompress()
     {
-      if(m_innerNonZeros != 0)
-        return; 
+      if(m_innerNonZeros != 0) return; 
       m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
-      for (Index i = 0; i < m_outerSize; i++)
-      {
-        m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
-      }
+      typename IndexVector::AlignedMapType innerNonZeroMap(m_innerNonZeros, m_outerSize);
+      typename IndexVector::ConstAlignedMapType outerIndexMap(m_outerIndex, m_outerSize);
+      typename IndexVector::ConstMapType nextOuterIndexMap(m_outerIndex + 1, m_outerSize);
+      innerNonZeroMap = nextOuterIndexMap - outerIndexMap;
     }
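A small sketch of the compressed/uncompressed round trip these two functions implement (the assertions only spell out the expected state):

    #include <Eigen/SparseCore>
    #include <cassert>

    int main() {
      Eigen::SparseMatrix<double> A(3, 3);
      A.insert(0, 0) = 1.0;     // insertion switches A to uncompressed mode
      assert(!A.isCompressed());
      A.makeCompressed();       // packs index/value storage into plain compressed form
      assert(A.isCompressed());
      A.uncompress();           // re-creates the per-vector nonzero counts
      assert(!A.isCompressed());
      return 0;
    }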
 
     /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
@@ -520,27 +546,32 @@
     template<typename KeepFunc>
     void prune(const KeepFunc& keep = KeepFunc())
     {
-      // TODO optimize the uncompressed mode to avoid moving and allocating the data twice
-      makeCompressed();
-
       StorageIndex k = 0;
       for(Index j=0; j<m_outerSize; ++j)
       {
-        Index previousStart = m_outerIndex[j];
-        m_outerIndex[j] = k;
-        Index end = m_outerIndex[j+1];
-        for(Index i=previousStart; i<end; ++i)
+        StorageIndex previousStart = m_outerIndex[j];
+        if (isCompressed())
+          m_outerIndex[j] = k;
+        else
+          k = m_outerIndex[j];
+        StorageIndex end = isCompressed() ? m_outerIndex[j+1] : previousStart + m_innerNonZeros[j];
+        for(StorageIndex i=previousStart; i<end; ++i)
         {
-          if(keep(IsRowMajor?j:m_data.index(i), IsRowMajor?m_data.index(i):j, m_data.value(i)))
-          {
+          StorageIndex row = IsRowMajor ? StorageIndex(j) : m_data.index(i);
+          StorageIndex col = IsRowMajor ? m_data.index(i) : StorageIndex(j);
+          bool keepEntry = keep(row, col, m_data.value(i));
+          if (keepEntry) {
             m_data.value(k) = m_data.value(i);
             m_data.index(k) = m_data.index(i);
             ++k;
-          }
+          } else if (!isCompressed())
+            m_innerNonZeros[j]--;
         }
       }
-      m_outerIndex[m_outerSize] = k;
-      m_data.resize(k,0);
+      if (isCompressed()) {
+        m_outerIndex[m_outerSize] = k;
+        m_data.resize(k, 0);
+      }
     }
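A minimal sketch of the KeepFunc overload above, dropping entries below a tolerance; with this change it no longer forces a makeCompressed() first (threshold and values are illustrative):

    #include <Eigen/SparseCore>
    #include <cmath>

    int main() {
      Eigen::SparseMatrix<double> A(3, 3);
      A.insert(0, 0) = 1.0;
      A.insert(1, 1) = 1e-15;
      // keep only entries whose magnitude is at least 1e-12
      A.prune([](Eigen::Index, Eigen::Index, double v) { return std::abs(v) >= 1e-12; });
      return 0;
    }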
 
     /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
@@ -551,64 +582,50 @@
       *
       * \sa reserve(), setZero(), makeCompressed()
       */
-    void conservativeResize(Index rows, Index cols) 
-    {
-      // No change
-      if (this->rows() == rows && this->cols() == cols) return;
-      
+    void conservativeResize(Index rows, Index cols) {
+
       // If one dimension is null, then there is nothing to be preserved
-      if(rows==0 || cols==0) return resize(rows,cols);
+      if (rows == 0 || cols == 0) return resize(rows, cols);
 
-      Index innerChange = IsRowMajor ? cols - this->cols() : rows - this->rows();
-      Index outerChange = IsRowMajor ? rows - this->rows() : cols - this->cols();
-      StorageIndex newInnerSize = convert_index(IsRowMajor ? cols : rows);
+      Index newOuterSize = IsRowMajor ? rows : cols;
+      Index newInnerSize = IsRowMajor ? cols : rows;
 
-      // Deals with inner non zeros
-      if (m_innerNonZeros)
-      {
-        // Resize m_innerNonZeros
-        m_innerNonZeros = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
-              m_innerNonZeros, m_outerSize + outerChange, m_outerSize);
-        
-        for(Index i=m_outerSize; i<m_outerSize+outerChange; i++)          
-          m_innerNonZeros[i] = 0;
-      } 
-      else if (innerChange < 0) 
-      {
-        // Inner size decreased: allocate a new m_innerNonZeros
-        m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize + outerChange);
-        for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
-          m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
-        for(Index i = m_outerSize; i < m_outerSize + outerChange; i++)
-          m_innerNonZeros[i] = 0;
-      }
-      
-      // Change the m_innerNonZeros in case of a decrease of inner size
-      if (m_innerNonZeros && innerChange < 0)
-      {
-        for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
-        {
-          StorageIndex &n = m_innerNonZeros[i];
-          StorageIndex start = m_outerIndex[i];
-          while (n > 0 && m_data.index(start+n-1) >= newInnerSize) --n; 
+      Index innerChange = newInnerSize - innerSize();
+      Index outerChange = newOuterSize - outerSize();
+
+      if (outerChange != 0) {
+        m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
+            outerIndexPtr(), newOuterSize + 1, outerSize() + 1);
+
+        if (!isCompressed())
+          m_innerNonZeros = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
+              innerNonZeroPtr(), newOuterSize, outerSize());
+
+        if (outerChange > 0) {
+          StorageIndex lastIdx = outerSize() == 0 ? StorageIndex(0) : outerIndexPtr()[outerSize()];
+          std::fill_n(outerIndexPtr() + outerSize(), outerChange + 1, lastIdx);
+
+          if (!isCompressed()) std::fill_n(innerNonZeroPtr() + outerSize(), outerChange, StorageIndex(0));
         }
       }
-      
+      m_outerSize = newOuterSize;
+
+      if (innerChange < 0) {
+        for (Index j = 0; j < outerSize(); j++) {
+          Index start = outerIndexPtr()[j];
+          Index end = isCompressed() ? outerIndexPtr()[j + 1] : start + innerNonZeroPtr()[j];
+          Index lb = data().searchLowerIndex(start, end, newInnerSize);
+          if (lb != end) {
+            uncompress();
+            innerNonZeroPtr()[j] = StorageIndex(lb - start);
+          }
+        }
+      }
       m_innerSize = newInnerSize;
 
-      // Re-allocate outer index structure if necessary
-      if (outerChange == 0)
-        return;
-          
-      m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
-          m_outerIndex, m_outerSize + outerChange + 1, m_outerSize + 1);
-      if (outerChange > 0)
-      {
-        StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
-        for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)          
-          m_outerIndex[i] = lastIdx; 
-      }
-      m_outerSize += outerChange;
+      Index newSize = outerIndexPtr()[outerSize()];
+      eigen_assert(newSize <= m_data.size());
+      m_data.resize(newSize);
     }
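A short sketch of what the rewritten conservativeResize preserves and drops (dimensions are illustrative):

    #include <Eigen/SparseCore>
    #include <cassert>

    int main() {
      Eigen::SparseMatrix<double> A(3, 3);
      A.insert(2, 2) = 1.0;
      A.conservativeResize(5, 5);   // grows: old entries kept, new rows/cols start empty
      assert(A.coeff(2, 2) == 1.0);
      A.conservativeResize(2, 2);   // shrinks: (2, 2) now falls outside and is dropped
      assert(A.nonZeros() == 0);
      return 0;
    }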
     
     /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero.
@@ -740,12 +757,19 @@
     inline void setIdentity()
     {
       eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
-      this->m_data.resize(rows());
-      Eigen::Map<IndexVector>(this->m_data.indexPtr(), rows()).setLinSpaced(0, StorageIndex(rows()-1));
-      Eigen::Map<ScalarVector>(this->m_data.valuePtr(), rows()).setOnes();
-      Eigen::Map<IndexVector>(this->m_outerIndex, rows()+1).setLinSpaced(0, StorageIndex(rows()));
-      internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
-      m_innerNonZeros = 0;
+      if (m_innerNonZeros) {
+        internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+        m_innerNonZeros = 0;
+      }
+      m_data.resize(rows());
+      // is it necessary to squeeze?
+      m_data.squeeze();
+      typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), rows() + 1);
+      typename IndexVector::AlignedMapType innerIndexMap(innerIndexPtr(), rows());
+      typename ScalarVector::AlignedMapType valueMap(valuePtr(), rows());
+      outerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+      innerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+      valueMap.setOnes();
     }
 
     inline SparseMatrix& operator=(const SparseMatrix& other)
@@ -865,7 +889,7 @@
 
     /** \internal
       * \sa insert(Index,Index) */
-    EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col);
+    EIGEN_DEPRECATED EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col);
 
     /** \internal
       * A vector object that is equal to 0 everywhere but v at the position i */
@@ -884,7 +908,7 @@
 
     /** \internal
       * \sa insert(Index,Index) */
-    EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col);
+    EIGEN_DEPRECATED EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col);
 
 public:
     /** \internal
@@ -913,17 +937,18 @@
      *   1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can treat *this as a dense vector expression.
       *   2 - otherwise, for each diagonal coeff,
       *     2.a - if it already exists, then we update it,
-      *     2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion.
-      *     2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector.
-      *   3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements.
-      * 
-      * TODO: some piece of code could be isolated and reused for a general in-place update strategy.
-      * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once),
-      *       then it *might* be better to disable case 2.b since they will have to be copied anyway.
+      *     2.b - if the correct position is at the end of the vector, and there is capacity, push to back
+      *     2.c - otherwise, the insertion requires a data move; record the insertion location and handle it in a second pass
+      *   3 - at the end, if some entries failed to be updated in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements.
       */
     template<typename DiagXpr, typename Func>
     void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc)
     {
+      
+      constexpr StorageIndex kEmptyIndexVal(-1);
+      typedef typename IndexVector::AlignedMapType IndexMap;
+      typedef typename ScalarVector::AlignedMapType ValueMap;
+
       Index n = diagXpr.size();
 
       const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
@@ -933,82 +958,106 @@
           this->resize(n, n);
       }
 
-      if(m_data.size()==0 || overwrite)
+      if(data().size()==0 || overwrite)
       {
-        typedef Array<StorageIndex,Dynamic,1> ArrayXI;  
-        this->makeCompressed();
-        this->resizeNonZeros(n);
-        Eigen::Map<ArrayXI>(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1);
-        Eigen::Map<ArrayXI>(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n));
-        Eigen::Map<Array<Scalar,Dynamic,1> > values = this->coeffs();
-        values.setZero();
-        internal::call_assignment_no_alias(values, diagXpr, assignFunc);
+        if (!isCompressed()) {
+          internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+          m_innerNonZeros = 0;
+        }
+        resizeNonZeros(n);
+        IndexMap outerIndexMap(outerIndexPtr(), n + 1);
+        IndexMap innerIndexMap(innerIndexPtr(), n);
+        ValueMap valueMap(valuePtr(), n);
+        outerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+        innerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+        valueMap.setZero();
+        internal::call_assignment_no_alias(valueMap, diagXpr, assignFunc);
       }
       else
       {
-        bool isComp = isCompressed();
-        internal::evaluator<DiagXpr> diaEval(diagXpr);
-        std::vector<IndexPosPair> newEntries;
+          internal::evaluator<DiagXpr> diaEval(diagXpr);
 
-        // 1 - try in-place update and record insertion failures
-        for(Index i = 0; i<n; ++i)
-        {
-          internal::LowerBoundIndex lb = this->lower_bound(i,i);
-          Index p = lb.value;
-          if(lb.found)
-          {
-            // the coeff already exists
-            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+          ei_declare_aligned_stack_constructed_variable(StorageIndex, tmp, n, 0);
+          typename IndexVector::AlignedMapType insertionLocations(tmp, n);
+          insertionLocations.setConstant(kEmptyIndexVal);
+
+          Index deferredInsertions = 0;
+          Index shift = 0;
+
+          for (Index j = 0; j < n; j++) {
+            Index begin = outerIndexPtr()[j];
+            Index end = isCompressed() ? outerIndexPtr()[j + 1] : begin + innerNonZeroPtr()[j];
+            Index capacity = outerIndexPtr()[j + 1] - end;
+            Index dst = data().searchLowerIndex(begin, end, j);
+            // the entry exists: update it now
+            if (dst != end && data().index(dst) == j) assignFunc.assignCoeff(data().value(dst), diaEval.coeff(j));
+            // the entry belongs at the back of the vector: push to back
+            else if (dst == end && capacity > 0)
+              assignFunc.assignCoeff(insertBackUncompressed(j, j), diaEval.coeff(j));
+            // the insertion requires a data move, record insertion location and handle in second pass
+            else {
+              insertionLocations.coeffRef(j) = StorageIndex(dst);
+              deferredInsertions++;
+              // if there is no capacity, all vectors to the right of this are shifted
+              if (capacity == 0) shift++;
+            }
           }
-          else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i]))
-          {
-            // non compressed mode with local room for inserting one element
-            m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p);
-            m_innerNonZeros[i]++;
-            m_data.value(p) = Scalar(0);
-            m_data.index(p) = StorageIndex(i);
-            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
-          }
-          else
-          {
-            // defer insertion
-            newEntries.push_back(IndexPosPair(i,p));
-          }
-        }
-        // 2 - insert deferred entries
-        Index n_entries = Index(newEntries.size());
-        if(n_entries>0)
-        {
-          Storage newData(m_data.size()+n_entries);
-          Index prev_p = 0;
-          Index prev_i = 0;
-          for(Index k=0; k<n_entries;++k)
-          {
-            Index i = newEntries[k].i;
-            Index p = newEntries[k].p;
-            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+p, newData.valuePtr()+prev_p+k);
-            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+p, newData.indexPtr()+prev_p+k);
-            for(Index j=prev_i;j<i;++j)
-              m_outerIndex[j+1] += k;
-            if(!isComp)
-              m_innerNonZeros[i]++;
-            prev_p = p;
-            prev_i = i;
-            newData.value(p+k) = Scalar(0);
-            newData.index(p+k) = StorageIndex(i);
-            assignFunc.assignCoeff(newData.value(p+k), diaEval.coeff(i));
-          }
-          {
-            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+m_data.size(), newData.valuePtr()+prev_p+n_entries);
-            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+m_data.size(), newData.indexPtr()+prev_p+n_entries);
-            for(Index j=prev_i+1;j<=m_outerSize;++j)
-              m_outerIndex[j] += n_entries;
-          }
-          m_data.swap(newData);
-        }
+          
+          if (deferredInsertions > 0) {
+
+            data().resize(data().size() + shift);
+            Index copyEnd = isCompressed() ? outerIndexPtr()[outerSize()]
+                                           : outerIndexPtr()[outerSize() - 1] + innerNonZeroPtr()[outerSize() - 1];
+            for (Index j = outerSize() - 1; deferredInsertions > 0; j--) {
+              Index begin = outerIndexPtr()[j];
+              Index end = isCompressed() ? outerIndexPtr()[j + 1] : begin + innerNonZeroPtr()[j];
+              Index capacity = outerIndexPtr()[j + 1] - end;
+
+              bool doInsertion = insertionLocations(j) >= 0;
+              bool breakUpCopy = doInsertion && (capacity > 0);
+              // break up copy for sorted insertion into inactive nonzeros
+              // optionally, add another criterion, e.g. 'breakUpCopy || (capacity > threshold)'
+              // where `threshold >= 0`, to skip inactive nonzeros in each vector
+              // this reduces the total number of copied elements, but requires more moveChunk calls
+              if (breakUpCopy) {
+                Index copyBegin = outerIndexPtr()[j + 1];
+                Index to = copyBegin + shift;
+                Index chunkSize = copyEnd - copyBegin;
+                if (chunkSize > 0) data().moveChunk(copyBegin, to, chunkSize);
+                copyEnd = end;
+              }
+
+              outerIndexPtr()[j + 1] += shift;
+              
+              if (doInsertion) {
+                // if there is capacity, shift into the inactive nonzeros
+                if (capacity > 0) shift++;
+                Index copyBegin = insertionLocations(j);
+                Index to = copyBegin + shift;
+                Index chunkSize = copyEnd - copyBegin;
+                if (chunkSize > 0) data().moveChunk(copyBegin, to, chunkSize);
+                Index dst = to - 1;
+                data().index(dst) = StorageIndex(j);
+                assignFunc.assignCoeff(data().value(dst) = Scalar(0), diaEval.coeff(j));
+                if (!isCompressed()) innerNonZeroPtr()[j]++;
+                shift--;
+                deferredInsertions--;
+                copyEnd = copyBegin;
+              }
+            }
+          }     
+          eigen_assert((shift == 0) && (deferredInsertions == 0));
       }
     }
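For context, assignDiagonal backs diagonal assignments and updates on a sparse matrix; a minimal usage sketch (values are illustrative, and `+=` on the diagonal is the documented entry point for the update path above):

    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<double> A(4, 4);
      Eigen::VectorXd d = Eigen::VectorXd::LinSpaced(4, 1.0, 4.0);
      A.diagonal() = d;      // overwrite: the dense-like fast path (case 1 above)
      A.insert(0, 2) = 5.0;
      A.diagonal() += d;     // per-coefficient update; missing diagonal entries would go
                             // through the deferred-insertion second pass described above
      return 0;
    }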
 
+    /* Provides a consistent reserve and reallocation strategy for insertCompressed and insertUncompressed.
+    If there is insufficient space for one insertion, the size of the memory buffer is doubled. */
+    inline void checkAllocatedSpaceAndMaybeExpand();
+    /* These functions assume `dst` is the appropriate sorted insertion point; passing it in lets callers
+    such as coeffRef() avoid a redundant binary search. */
+    EIGEN_STRONG_INLINE Scalar& insertAtByOuterInner(Index outer, Index inner, Index dst);
+    Scalar& insertCompressedAtByOuterInner(Index outer, Index inner, Index dst);
+    Scalar& insertUncompressedAtByOuterInner(Index outer, Index inner, Index dst);
+
 private:
   EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
   EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS)
@@ -1026,36 +1075,106 @@
 
 namespace internal {
 
-template<typename InputIterator, typename SparseMatrixType, typename DupFunctor>
-void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, DupFunctor dup_func)
-{
-  enum { IsRowMajor = SparseMatrixType::IsRowMajor };
-  typedef typename SparseMatrixType::Scalar Scalar;
+// Creates a compressed sparse matrix from a range of unsorted triplets
+// Requires temporary storage to handle duplicate entries
+template <typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat,
+                       DupFunctor dup_func) {
+
+  constexpr bool IsRowMajor = SparseMatrixType::IsRowMajor;
   typedef typename SparseMatrixType::StorageIndex StorageIndex;
-  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor,StorageIndex> trMat(mat.rows(),mat.cols());
+  typedef typename VectorX<StorageIndex>::AlignedMapType IndexMap;
+  if (begin == end) return;
 
-  if(begin!=end)
-  {
-    // pass 1: count the nnz per inner-vector
-    typename SparseMatrixType::IndexVector wi(trMat.outerSize());
-    wi.setZero();
-    for(InputIterator it(begin); it!=end; ++it)
-    {
-      eigen_assert(it->row()>=0 && it->row()<mat.rows() && it->col()>=0 && it->col()<mat.cols());
-      wi(IsRowMajor ? it->col() : it->row())++;
-    }
-
-    // pass 2: insert all the elements into trMat
-    trMat.reserve(wi);
-    for(InputIterator it(begin); it!=end; ++it)
-      trMat.insertBackUncompressed(it->row(),it->col()) = it->value();
-
-    // pass 3:
-    trMat.collapseDuplicates(dup_func);
+  // free innerNonZeroPtr (if present) and zero outerIndexPtr
+  mat.resize(mat.rows(), mat.cols());
+  // allocate temporary storage for nonzero insertion (outer size) and duplicate removal (inner size)
+  ei_declare_aligned_stack_constructed_variable(StorageIndex, tmp, numext::maxi(mat.innerSize(), mat.outerSize()), 0);
+  // scan triplets to determine allocation size before constructing matrix
+  IndexMap outerIndexMap(mat.outerIndexPtr(), mat.outerSize() + 1);
+  for (InputIterator it(begin); it != end; ++it) {
+    eigen_assert(it->row() >= 0 && it->row() < mat.rows() && it->col() >= 0 && it->col() < mat.cols());
+    StorageIndex j = IsRowMajor ? it->row() : it->col();
+    outerIndexMap.coeffRef(j + 1)++;
   }
 
-  // pass 4: transposed copy -> implicit sorting
-  mat = trMat;
+  // finalize outer indices and allocate memory
+  std::partial_sum(outerIndexMap.begin(), outerIndexMap.end(), outerIndexMap.begin());
+  Index nonZeros = mat.outerIndexPtr()[mat.outerSize()];
+  mat.resizeNonZeros(nonZeros);
+
+  // use tmp to track nonzero insertions
+  IndexMap back(tmp, mat.outerSize());
+  back = outerIndexMap.head(mat.outerSize());
+
+  // push triplets to back of each inner vector
+  for (InputIterator it(begin); it != end; ++it) {
+    StorageIndex j = IsRowMajor ? it->row() : it->col();
+    StorageIndex i = IsRowMajor ? it->col() : it->row();
+    mat.data().index(back.coeff(j)) = i;
+    mat.data().value(back.coeff(j)) = it->value();
+    back.coeffRef(j)++;
+  }
+
+  // use tmp to collapse duplicates
+  IndexMap wi(tmp, mat.innerSize());
+  mat.collapseDuplicates(wi, dup_func);
+  mat.sortInnerIndices();
+}
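As an aside, a tiny sketch of the std::partial_sum step above, which turns per-outer-vector counts (stored shifted by one) into the outer index offsets; the counts are illustrative:

    #include <numeric>
    #include <vector>
    #include <cstdio>

    int main() {
      // counts per outer vector {2, 0, 3, 1}, stored shifted by one as in the first scan
      std::vector<int> outerIndex = {0, 2, 0, 3, 1};
      std::partial_sum(outerIndex.begin(), outerIndex.end(), outerIndex.begin());
      for (int v : outerIndex) std::printf("%d ", v);  // prints: 0 2 2 5 6
      std::printf("\n");
      // outer vector j now owns the index/value range [outerIndex[j], outerIndex[j+1])
      return 0;
    }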
+
+// Creates a compressed sparse matrix from a sorted range of triplets
+template <typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets_sorted(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat,
+                              DupFunctor dup_func) {
+  constexpr bool IsRowMajor = SparseMatrixType::IsRowMajor;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  typedef typename VectorX<StorageIndex>::AlignedMapType IndexMap;
+  if (begin == end) return;
+
+  constexpr StorageIndex kEmptyIndexValue(-1);
+  // deallocate inner nonzeros if present and zero outerIndexPtr
+  mat.resize(mat.rows(), mat.cols());
+  // use outer indices to count non zero entries (excluding duplicate entries)
+  StorageIndex previous_j = kEmptyIndexValue;
+  StorageIndex previous_i = kEmptyIndexValue;
+  // scan triplets to determine allocation size before constructing matrix
+  IndexMap outerIndexMap(mat.outerIndexPtr(), mat.outerSize() + 1);
+  for (InputIterator it(begin); it != end; ++it) {
+    eigen_assert(it->row() >= 0 && it->row() < mat.rows() && it->col() >= 0 && it->col() < mat.cols());
+    StorageIndex j = IsRowMajor ? it->row() : it->col();
+    StorageIndex i = IsRowMajor ? it->col() : it->row();
+    eigen_assert(j > previous_j || (j == previous_j && i >= previous_i));
+    // identify duplicates by examining previous location
+    bool duplicate = (previous_j == j) && (previous_i == i);
+    if (!duplicate) outerIndexMap.coeffRef(j + 1)++;
+    previous_j = j;
+    previous_i = i;
+  }
+  
+  // finalize outer indices and allocate memory
+  std::partial_sum(outerIndexMap.begin(), outerIndexMap.end(), outerIndexMap.begin());
+  Index nonZeros = mat.outerIndexPtr()[mat.outerSize()];
+  mat.resizeNonZeros(nonZeros);
+
+  previous_i = kEmptyIndexValue;
+  previous_j = kEmptyIndexValue;
+  Index back = 0;
+  for (InputIterator it(begin); it != end; ++it) {
+    StorageIndex j = IsRowMajor ? it->row() : it->col();
+    StorageIndex i = IsRowMajor ? it->col() : it->row();
+    bool duplicate = (previous_j == j) && (previous_i == i);
+    if (duplicate) {
+      mat.data().value(back - 1) = dup_func(mat.data().value(back - 1), it->value());
+    } else {
+      // push triplets to back
+      mat.data().index(back) = i;
+      mat.data().value(back) = it->value();
+      previous_j = j;
+      previous_i = i;
+      back++;
+    }
+  }
+  // matrix is finalized
 }
 
 }
@@ -1105,47 +1224,71 @@
   internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,Options_,StorageIndex_> >(begin, end, *this, internal::scalar_sum_op<Scalar,Scalar>());
 }
 
+/** The same as setFromTriplets but triplets are assumed to be pre-sorted. This is faster and requires less temporary storage. 
+  * Two triplets `a` and `b` are appropriately ordered if:
+  * \code
+  * ColMajor: ((a.col() != b.col()) ? (a.col() < b.col()) : (a.row() < b.row()))
+  * RowMajor: ((a.row() != b.row()) ? (a.row() < b.row()) : (a.col() < b.col()))
+  * \endcode
+  */
+template<typename Scalar, int Options_, typename StorageIndex_>
+template<typename InputIterators>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromSortedTriplets(const InputIterators& begin, const InputIterators& end)
+{
+    internal::set_from_triplets_sorted<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_> >(begin, end, *this, internal::scalar_sum_op<Scalar, Scalar>());
+}
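A minimal usage sketch of the new entry point, assuming the default ColMajor layout (triplet values are illustrative):

    #include <Eigen/SparseCore>
    #include <algorithm>
    #include <vector>

    int main() {
      std::vector<Eigen::Triplet<double>> trips;
      trips.emplace_back(0, 1, 1.0);
      trips.emplace_back(2, 0, 2.0);
      trips.emplace_back(1, 1, 3.0);
      // sort by (col, row), matching the ColMajor ordering documented above
      std::sort(trips.begin(), trips.end(),
                [](const Eigen::Triplet<double>& a, const Eigen::Triplet<double>& b) {
                  return a.col() != b.col() ? a.col() < b.col() : a.row() < b.row();
                });
      Eigen::SparseMatrix<double> A(3, 3);
      A.setFromSortedTriplets(trips.begin(), trips.end());
      return 0;
    }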
+
 /** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied:
   * \code
   * value = dup_func(OldValue, NewValue)
-  * \endcode 
+  * \endcode
   * Here is a C++11 example keeping the latest entry only:
   * \code
   * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
   * \endcode
   */
 template<typename Scalar, int Options_, typename StorageIndex_>
-template<typename InputIterators,typename DupFunctor>
-void SparseMatrix<Scalar,Options_,StorageIndex_>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
+template<typename InputIterators, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
 {
-  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,Options_,StorageIndex_>, DupFunctor>(begin, end, *this, dup_func);
+    internal::set_from_triplets<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>, DupFunctor>(begin, end, *this, dup_func);
+}
+
+/** The same as setFromSortedTriplets but when duplicates are met the functor \a dup_func is applied:
+  * \code
+  * value = dup_func(OldValue, NewValue)
+  * \endcode
+  * Here is a C++11 example keeping the latest entry only:
+  * \code
+  * mat.setFromSortedTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+  * \endcode
+  */
+template<typename Scalar, int Options_, typename StorageIndex_>
+template<typename InputIterators, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromSortedTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
+{
+    internal::set_from_triplets_sorted<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>, DupFunctor>(begin, end, *this, dup_func);
 }
 
 /** \internal */
 template<typename Scalar, int Options_, typename StorageIndex_>
-template<typename DupFunctor>
-void SparseMatrix<Scalar,Options_,StorageIndex_>::collapseDuplicates(DupFunctor dup_func)
+template<typename Derived, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::collapseDuplicates(DenseBase<Derived>& wi, DupFunctor dup_func)
 {
-  eigen_assert(!isCompressed());
-  // TODO, in practice we should be able to use m_innerNonZeros for that task
-  IndexVector wi(innerSize());
-  wi.fill(-1);
+  eigen_assert(wi.size() >= m_innerSize);
+  constexpr StorageIndex kEmptyIndexValue(-1);
+  wi.setConstant(kEmptyIndexValue);
   StorageIndex count = 0;
   // for each inner-vector, wi[inner_index] will hold the position of first element into the index/value buffers
-  for(Index j=0; j<outerSize(); ++j)
-  {
-    StorageIndex start   = count;
-    Index oldEnd  = m_outerIndex[j]+m_innerNonZeros[j];
-    for(Index k=m_outerIndex[j]; k<oldEnd; ++k)
-    {
-      Index i = m_data.index(k);
-      if(wi(i)>=start)
-      {
+  for (Index j = 0; j < m_outerSize; ++j) {
+    StorageIndex start = count;
+    StorageIndex oldEnd = isCompressed() ? m_outerIndex[j + 1] : m_outerIndex[j] + m_innerNonZeros[j];
+    for (StorageIndex k = m_outerIndex[j]; k < oldEnd; ++k) {
+      StorageIndex i = m_data.index(k);
+      if (wi(i) >= start) {
         // we already meet this entry => accumulate it
         m_data.value(wi(i)) = dup_func(m_data.value(wi(i)), m_data.value(k));
-      }
-      else
-      {
+      } else {
         m_data.value(count) = m_data.value(k);
         m_data.index(count) = m_data.index(k);
         wi(i) = count;
@@ -1155,13 +1298,17 @@
     m_outerIndex[j] = start;
   }
   m_outerIndex[m_outerSize] = count;
-
   // turn the matrix into compressed form
-  internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
-  m_innerNonZeros = 0;
+  if (m_innerNonZeros) {
+    internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+    m_innerNonZeros = 0;
+  }
   m_data.resize(m_outerIndex[m_outerSize]);
 }
 
+
+
+/** \internal */
 template<typename Scalar, int Options_, typename StorageIndex_>
 template<typename OtherDerived>
 EIGEN_DONT_INLINE SparseMatrix<Scalar,Options_,StorageIndex_>& SparseMatrix<Scalar,Options_,StorageIndex_>::operator=(const SparseMatrixBase<OtherDerived>& other)
@@ -1235,274 +1382,149 @@
   }
 }
 
-template<typename Scalar_, int Options_, typename StorageIndex_>
-typename SparseMatrix<Scalar_,Options_,StorageIndex_>::Scalar& SparseMatrix<Scalar_,Options_,StorageIndex_>::insert(Index row, Index col)
-{
-  eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-  
-  const Index outer = IsRowMajor ? row : col;
-  const Index inner = IsRowMajor ? col : row;
-  
-  if(isCompressed())
-  {
-    if(nonZeros()==0)
-    {
-      // reserve space if not already done
-      if(m_data.allocatedSize()==0)
-        m_data.reserve(2*m_innerSize);
-      
-      // turn the matrix into non-compressed mode
-      m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
-      
-      std::fill(m_innerNonZeros, m_innerNonZeros + m_outerSize, StorageIndex(0));
-      
-      // pack all inner-vectors to the end of the pre-allocated space
-      // and allocate the entire free-space to the first inner-vector
-      StorageIndex end = convert_index(m_data.allocatedSize());
-      for(Index j=1; j<=m_outerSize; ++j)
-        m_outerIndex[j] = end;
-    }
-    else
-    {
-      // turn the matrix into non-compressed mode
-      m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
-      for(Index j=0; j<m_outerSize; ++j)
-        m_innerNonZeros[j] = m_outerIndex[j+1]-m_outerIndex[j];
-    }
-  }
-  
-  // check whether we can do a fast "push back" insertion
-  Index data_end = m_data.allocatedSize();
-  
-  // First case: we are filling a new inner vector which is packed at the end.
-  // We assume that all remaining inner-vectors are also empty and packed to the end.
-  if(m_outerIndex[outer]==data_end)
-  {
-    eigen_internal_assert(m_innerNonZeros[outer]==0);
-    
-    // pack previous empty inner-vectors to end of the used-space
-    // and allocate the entire free-space to the current inner-vector.
-    StorageIndex p = convert_index(m_data.size());
-    Index j = outer;
-    while(j>=0 && m_innerNonZeros[j]==0)
-      m_outerIndex[j--] = p;
-    
-    // push back the new element
-    ++m_innerNonZeros[outer];
-    m_data.append(Scalar(0), inner);
-    
-    // check for reallocation
-    if(data_end != m_data.allocatedSize())
-    {
-      // m_data has been reallocated
-      //  -> move remaining inner-vectors back to the end of the free-space
-      //     so that the entire free-space is allocated to the current inner-vector.
-      eigen_internal_assert(data_end < m_data.allocatedSize());
-      StorageIndex new_end = convert_index(m_data.allocatedSize());
-      for(Index k=outer+1; k<=m_outerSize; ++k)
-        if(m_outerIndex[k]==data_end)
-          m_outerIndex[k] = new_end;
-    }
-    return m_data.value(p);
-  }
-  
-  // Second case: the next inner-vector is packed to the end
-  // and the current inner-vector end match the used-space.
-  if(m_outerIndex[outer+1]==data_end && m_outerIndex[outer]+m_innerNonZeros[outer]==m_data.size())
-  {
-    eigen_internal_assert(outer+1==m_outerSize || m_innerNonZeros[outer+1]==0);
-    
-    // add space for the new element
-    ++m_innerNonZeros[outer];
-    m_data.resize(m_data.size()+1);
-    
-    // check for reallocation
-    if(data_end != m_data.allocatedSize())
-    {
-      // m_data has been reallocated
-      //  -> move remaining inner-vectors back to the end of the free-space
-      //     so that the entire free-space is allocated to the current inner-vector.
-      eigen_internal_assert(data_end < m_data.allocatedSize());
-      StorageIndex new_end = convert_index(m_data.allocatedSize());
-      for(Index k=outer+1; k<=m_outerSize; ++k)
-        if(m_outerIndex[k]==data_end)
-          m_outerIndex[k] = new_end;
-    }
-    
-    // and insert it at the right position (sorted insertion)
-    Index startId = m_outerIndex[outer];
-    Index p = m_outerIndex[outer]+m_innerNonZeros[outer]-1;
-    while ( (p > startId) && (m_data.index(p-1) > inner) )
-    {
-      m_data.index(p) = m_data.index(p-1);
-      m_data.value(p) = m_data.value(p-1);
-      --p;
-    }
-    
-    m_data.index(p) = convert_index(inner);
-    return (m_data.value(p) = Scalar(0));
-  }
-  
-  if(m_data.size() != m_data.allocatedSize())
-  {
-    // make sure the matrix is compatible to random un-compressed insertion:
-    m_data.resize(m_data.allocatedSize());
-    this->reserveInnerVectors(Array<StorageIndex,Dynamic,1>::Constant(m_outerSize, 2));
-  }
-  
-  return insertUncompressed(row,col);
+template <typename Scalar_, int Options_, typename StorageIndex_>
+inline typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar& SparseMatrix<Scalar_, Options_, StorageIndex_>::insert(
+    Index row, Index col) {
+  Index outer = IsRowMajor ? row : col;
+  Index inner = IsRowMajor ? col : row;
+  Index start = outerIndexPtr()[outer];
+  Index end = isCompressed() ? outerIndexPtr()[outer + 1] : start + innerNonZeroPtr()[outer];
+  Index dst = data().searchLowerIndex(start, end, inner);
+  eigen_assert((dst == end || data().index(dst) != inner) &&
+      "you cannot insert an element that already exists, you must call coeffRef to this end");
+  return insertAtByOuterInner(outer, inner, dst);
 }
-    
-template<typename Scalar_, int Options_, typename StorageIndex_>
-EIGEN_DONT_INLINE typename SparseMatrix<Scalar_,Options_,StorageIndex_>::Scalar& SparseMatrix<Scalar_,Options_,StorageIndex_>::insertUncompressed(Index row, Index col)
-{
+
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_STRONG_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertAtByOuterInner(Index outer, Index inner, Index dst) {
+  uncompress();
+  return insertUncompressedAtByOuterInner(outer, inner, dst);
+}
+
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_DEPRECATED EIGEN_DONT_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertUncompressed(Index row, Index col) {
   eigen_assert(!isCompressed());
-
-  const Index outer = IsRowMajor ? row : col;
-  const StorageIndex inner = convert_index(IsRowMajor ? col : row);
-
-  Index room = m_outerIndex[outer+1] - m_outerIndex[outer];
-  StorageIndex innerNNZ = m_innerNonZeros[outer];
-  if(innerNNZ>=room)
-  {
-    // this inner vector is full, we need to reallocate the whole buffer :(
-    reserve(SingletonVector(outer,std::max<StorageIndex>(2,innerNNZ)));
-  }
-
-  Index startId = m_outerIndex[outer];
-  Index p = startId + m_innerNonZeros[outer];
-  while ( (p > startId) && (m_data.index(p-1) > inner) )
-  {
-    m_data.index(p) = m_data.index(p-1);
-    m_data.value(p) = m_data.value(p-1);
-    --p;
-  }
-  eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end");
-
-  m_innerNonZeros[outer]++;
-
-  m_data.index(p) = inner;
-  return (m_data.value(p) = Scalar(0));
+  Index outer = IsRowMajor ? row : col;
+  Index inner = IsRowMajor ? col : row;
+  Index start = outerIndexPtr()[outer];
+  Index end = start + innerNonZeroPtr()[outer];
+  Index dst = data().searchLowerIndex(start, end, inner);
+  eigen_assert((dst == end || data().index(dst) != inner) &&
+               "you cannot insert an element that already exists, you must call coeffRef to this end");
+  return insertUncompressedAtByOuterInner(outer, inner, dst);
 }
 
-template<typename Scalar_, int Options_, typename StorageIndex_>
-EIGEN_DONT_INLINE typename SparseMatrix<Scalar_,Options_,StorageIndex_>::Scalar& SparseMatrix<Scalar_,Options_,StorageIndex_>::insertCompressed(Index row, Index col)
-{
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_DEPRECATED EIGEN_DONT_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressed(Index row, Index col) {
   eigen_assert(isCompressed());
+  Index outer = IsRowMajor ? row : col;
+  Index inner = IsRowMajor ? col : row;
+  Index start = outerIndexPtr()[outer];
+  Index end = outerIndexPtr()[outer + 1];
+  Index dst = data().searchLowerIndex(start, end, inner);
+  eigen_assert((dst == end || data().index(dst) != inner) &&
+               "you cannot insert an element that already exists, you must call coeffRef to this end");
+  return insertCompressedAtByOuterInner(outer, inner, dst);
+}
 
-  const Index outer = IsRowMajor ? row : col;
-  const Index inner = IsRowMajor ? col : row;
-
-  Index previousOuter = outer;
-  if (m_outerIndex[outer+1]==0)
-  {
-    // we start a new inner vector
-    while (previousOuter>=0 && m_outerIndex[previousOuter]==0)
-    {
-      m_outerIndex[previousOuter] = convert_index(m_data.size());
-      --previousOuter;
-    }
-    m_outerIndex[outer+1] = m_outerIndex[outer];
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_STRONG_INLINE void SparseMatrix<Scalar_, Options_, StorageIndex_>::checkAllocatedSpaceAndMaybeExpand() {
+  // if there is no capacity for a single insertion, double the capacity
+  if (data().allocatedSize() <= data().size()) {
+    // increase capacity by a minimum of 32
+    Index minReserve = 32;
+    Index reserveSize = numext::maxi(minReserve, data().allocatedSize());
+    data().reserve(reserveSize);
   }
+}
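For intuition, a standalone sketch of the growth pattern this policy implies, assuming data().reserve(n) adds room for n extra elements on top of the current size (as CompressedStorage does):

    #include <algorithm>
    #include <iostream>

    int main() {
      long size = 0, allocated = 0;
      for (int i = 0; i < 1000; ++i) {
        if (allocated <= size) {
          allocated = size + std::max<long>(32, allocated);  // mirrors reserveSize above
          std::cout << allocated << ' ';
        }
        ++size;  // one insertion
      }
      std::cout << '\n';  // prints: 32 64 128 256 512 1024 -- doubling above the floor of 32
      return 0;
    }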
 
-  // here we have to handle the tricky case where the outerIndex array
-  // starts with: [ 0 0 0 0 0 1 ...] and we are inserted in, e.g.,
-  // the 2nd inner vector...
-  bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0))
-                && (std::size_t(m_outerIndex[outer+1]) == m_data.size());
+template <typename Scalar_, int Options_, typename StorageIndex_>
+typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressedAtByOuterInner(Index outer, Index inner, Index dst) {
+  eigen_assert(isCompressed());
+  // compressed insertion always requires expanding the buffer
+  checkAllocatedSpaceAndMaybeExpand();
+  data().resize(data().size() + 1);
+  Index chunkSize = outerIndexPtr()[outerSize()] - dst;
+  // shift the existing data to the right if necessary
+  if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
+  // update nonzero counts
+  typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), outerSize() + 1);
+  outerIndexMap.segment(outer + 1, outerSize() - outer).array() += 1;
+  // initialize the coefficient
+  data().index(dst) = StorageIndex(inner);
+  // return a reference to the coefficient
+  return data().value(dst) = Scalar(0);
+}
 
-  std::size_t startId = m_outerIndex[outer];
-  // FIXME let's make sure sizeof(long int) == sizeof(std::size_t)
-  std::size_t p = m_outerIndex[outer+1];
-  ++m_outerIndex[outer+1];
-
-  double reallocRatio = 1;
-  if (m_data.allocatedSize()<=m_data.size())
-  {
-    // if there is no preallocated memory, let's reserve a minimum of 32 elements
-    if (m_data.size()==0)
-    {
-      m_data.reserve(32);
-    }
-    else
-    {
-      // we need to reallocate the data, to reduce multiple reallocations
-      // we use a smart resize algorithm based on the current filling ratio
-      // in addition, we use double to avoid integers overflows
-      double nnzEstimate = double(m_outerIndex[outer])*double(m_outerSize)/double(outer+1);
-      reallocRatio = (nnzEstimate-double(m_data.size()))/double(m_data.size());
-      // furthermore we bound the realloc ratio to:
-      //   1) reduce multiple minor realloc when the matrix is almost filled
-      //   2) avoid to allocate too much memory when the matrix is almost empty
-      reallocRatio = (std::min)((std::max)(reallocRatio,1.5),8.);
+template <typename Scalar_, int Options_, typename StorageIndex_>
+typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertUncompressedAtByOuterInner(Index outer, Index inner, Index dst) {
+  eigen_assert(!isCompressed());
+  // find nearest outer vector to the right with capacity (if any) to minimize copy size
+  Index target = outer;
+  for (; target < outerSize(); target++) {
+    Index start = outerIndexPtr()[target];
+    Index end = start + innerNonZeroPtr()[target];
+    Index capacity = outerIndexPtr()[target + 1] - end;
+    if (capacity > 0) {
+      // `target` has room for interior insertion
+      Index chunkSize = end - dst;
+      // shift the existing data to the right if necessary
+      if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
+      break;
     }
   }
-  m_data.resize(m_data.size()+1,reallocRatio);
-
-  if (!isLastVec)
-  {
-    if (previousOuter==-1)
-    {
-      // oops wrong guess.
-      // let's correct the outer offsets
-      for (Index k=0; k<=(outer+1); ++k)
-        m_outerIndex[k] = 0;
-      Index k=outer+1;
-      while(m_outerIndex[k]==0)
-        m_outerIndex[k++] = 1;
-      while (k<=m_outerSize && m_outerIndex[k]!=0)
-        m_outerIndex[k++]++;
-      p = 0;
-      --k;
-      k = m_outerIndex[k]-1;
-      while (k>0)
-      {
-        m_data.index(k) = m_data.index(k-1);
-        m_data.value(k) = m_data.value(k-1);
-        k--;
-      }
+  if (target == outerSize()) {
+    // no room for interior insertion (to the right of `outer`)
+    target = outer;
+    Index dst_offset = dst - outerIndexPtr()[target];
+    Index totalCapacity = data().allocatedSize() - data().size();
+    eigen_assert(totalCapacity >= 0);
+    if (totalCapacity == 0) {
+      // there is no room left. we must reallocate. reserve space in each vector
+      constexpr StorageIndex kReserveSizePerVector(2);
+      reserveInnerVectors(IndexVector::Constant(outerSize(), kReserveSizePerVector));
+    } else {
+      // don't reallocate, but redistribute the remaining capacity to the right of `outer`:
+      // each vector in the range [outer, outerSize) receives totalCapacity / (outerSize - outer) nonzero
+      // reservations, and each vector in the range [outer, outer + remainder) receives one additional
+      // reservation, where remainder = totalCapacity % (outerSize - outer)
+      typedef internal::sparse_reserve_op<StorageIndex> ReserveSizesOp;
+      typedef CwiseNullaryOp<ReserveSizesOp, IndexVector> ReserveSizesXpr;
+      ReserveSizesXpr reserveSizesXpr(outerSize(), 1, ReserveSizesOp(target, outerSize(), totalCapacity));
+      eigen_assert(reserveSizesXpr.sum() == totalCapacity);
+      reserveInnerVectors(reserveSizesXpr);
     }
-    else
-    {
-      // we are not inserting into the last inner vec
-      // update outer indices:
-      Index j = outer+2;
-      while (j<=m_outerSize && m_outerIndex[j]!=0)
-        m_outerIndex[j++]++;
-      --j;
-      // shift data of last vecs:
-      Index k = m_outerIndex[j]-1;
-      while (k>=Index(p))
-      {
-        m_data.index(k) = m_data.index(k-1);
-        m_data.value(k) = m_data.value(k-1);
-        k--;
-      }
-    }
+    Index start = outerIndexPtr()[target];
+    Index end = start + innerNonZeroPtr()[target];
+    dst = start + dst_offset;
+    Index chunkSize = end - dst;
+    if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
   }
-
-  while ( (p > startId) && (m_data.index(p-1) > inner) )
-  {
-    m_data.index(p) = m_data.index(p-1);
-    m_data.value(p) = m_data.value(p-1);
-    --p;
-  }
-
-  m_data.index(p) = inner;
-  return (m_data.value(p) = Scalar(0));
+  // update nonzero counts
+  innerNonZeroPtr()[outer]++;
+  typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), outerSize() + 1);
+  outerIndexMap.segment(outer + 1, target - outer).array() += 1;
+  // initialize the coefficient
+  data().index(dst) = StorageIndex(inner);
+  // return a reference to the coefficient
+  return data().value(dst) = Scalar(0);
 }
 
 namespace internal {
 
-template<typename Scalar_, int Options_, typename StorageIndex_>
-struct evaluator<SparseMatrix<Scalar_,Options_,StorageIndex_> >
-  : evaluator<SparseCompressedBase<SparseMatrix<Scalar_,Options_,StorageIndex_> > >
-{
-  typedef evaluator<SparseCompressedBase<SparseMatrix<Scalar_,Options_,StorageIndex_> > > Base;
-  typedef SparseMatrix<Scalar_,Options_,StorageIndex_> SparseMatrixType;
-  evaluator() : Base() {}
-  explicit evaluator(const SparseMatrixType &mat) : Base(mat) {}
-};
+template <typename Scalar_, int Options_, typename StorageIndex_>
+struct evaluator<SparseMatrix<Scalar_, Options_, StorageIndex_>>
+    : evaluator<SparseCompressedBase<SparseMatrix<Scalar_, Options_, StorageIndex_>>> {
+  typedef evaluator<SparseCompressedBase<SparseMatrix<Scalar_, Options_, StorageIndex_>>> Base;
+  typedef SparseMatrix<Scalar_, Options_, StorageIndex_> SparseMatrixType;
+  evaluator() : Base() {}
+  explicit evaluator(const SparseMatrixType& mat) : Base(mat) {}
+};
 
 }
 
diff --git a/bench/spbench/spbenchsolver.cpp b/bench/spbench/spbenchsolver.cpp
index 2a73511..9d2e381 100644
--- a/bench/spbench/spbenchsolver.cpp
+++ b/bench/spbench/spbenchsolver.cpp
@@ -6,19 +6,20 @@
     cout<< " MATRIX FOLDER : \n";
     cout<< " The matrices for the benchmark should be collected in a folder specified with an environment variable EIGEN_MATRIXDIR \n";
     cout<< " The matrices are stored using the matrix market coordinate format \n";
-    cout<< " The matrix and associated right-hand side (rhs) files are named respectively \n";
-    cout<< " as MatrixName.mtx and MatrixName_b.mtx. If the rhs does not exist, a random one is generated. \n";
-    cout<< " If a matrix is SPD, the matrix should be named as MatrixName_SPD.mtx \n";
-    cout<< " If a true solution exists, it should be named as MatrixName_x.mtx; \n"     ;
-    cout<< " it will be used to compute the norm of the error relative to the computed solutions\n\n";
+    cout<< " The matrix and associated right-hand side (rhs) files are named respectively as MatrixName.mtx and MatrixName_b.mtx.\n";
+    cout<< " If the rhs does not exist, a random one is generated. \n";
+    cout<< " If a true solution exists, it should be named as MatrixName_x.mtx ; it will be used to compute the norm of the error \n";
+    cout<< " relative to the computed solutions. \n";
+    cout<< " If a matrix is SPD, the matrix should be named as MatrixName_SPD.mtx (then use MatrixName_SPD_b.mtx \n";
+    cout<< " and MatrixName_SPD_x.mtx for the rhs and the true solution).\n\n";
     cout<< " OPTIONS : \n"; 
     cout<< " -h or --help \n    print this help and return\n\n";
     cout<< " -d matrixdir \n    Use matrixdir as the matrix folder instead of the one specified in the environment variable EIGEN_MATRIXDIR\n\n"; 
     cout<< " -o outputfile.xml \n    Output the statistics to a xml file \n\n";
     cout<< " --eps <RelErr> Sets the relative tolerance for iterative solvers (default 1e-08) \n\n";
     cout<< " --maxits <MaxIts> Sets the maximum number of iterations (default 1000) \n\n";
-    
 }
+
 int main(int argc, char ** args)
 {
   
diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp
index b2a22ce..5cc44ac 100644
--- a/test/bfloat16_float.cpp
+++ b/test/bfloat16_float.cpp
@@ -11,8 +11,6 @@
 
 #include "main.h"
 
-#include <Eigen/src/Core/arch/Default/BFloat16.h>
-
 #define VERIFY_BFLOAT16_BITS_EQUAL(h, bits) \
   VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
 
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index e424a93..82f9d61 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -16,6 +16,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 
+#define EIGEN_USE_GPU
 #include "main.h"
 #include "gpu_common.h"
 
diff --git a/test/gpu_common.h b/test/gpu_common.h
index c37eaa1..05080ad 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -12,9 +12,6 @@
 
 #include <iostream>
 
-#define EIGEN_USE_GPU
-#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
-
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
 dim3 threadIdx, blockDim, blockIdx;
 #endif
diff --git a/test/gpu_example.cu b/test/gpu_example.cu
index a69f5ea..d4715cf 100644
--- a/test/gpu_example.cu
+++ b/test/gpu_example.cu
@@ -9,6 +9,7 @@
 
 // The following is an example GPU test.
 
+#define EIGEN_USE_GPU
 #include "main.h"  // Include the main test utilities.
 
 // Define a kernel functor.
diff --git a/test/gpu_test_helper.h b/test/gpu_test_helper.h
index 0942466..008e95c 100644
--- a/test/gpu_test_helper.h
+++ b/test/gpu_test_helper.h
@@ -3,10 +3,8 @@
 
 #include <Eigen/Core>
 
-#ifdef EIGEN_GPUCC
-#define EIGEN_USE_GPU
-#include "../unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h"
-#endif // EIGEN_GPUCC
+// Allow gpu** macros for generic tests.
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
 
 // std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
 // doesn't allow std::tuple to compile for host code either. In these cases,
diff --git a/test/half_float.cpp b/test/half_float.cpp
index 00a8b48..83b057e 100644
--- a/test/half_float.cpp
+++ b/test/half_float.cpp
@@ -9,8 +9,6 @@
 
 #include "main.h"
 
-#include <Eigen/src/Core/arch/Default/Half.h>
-
 #define VERIFY_HALF_BITS_EQUAL(h, bits) \
   VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
 
diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp
index ecc17f5..72316a2 100644
--- a/test/incomplete_cholesky.cpp
+++ b/test/incomplete_cholesky.cpp
@@ -10,7 +10,6 @@
 // #define EIGEN_MAX_ALIGN_BYTES 0
 #include "sparse_solver.h"
 #include <Eigen/IterativeLinearSolvers>
-#include <unsupported/Eigen/IterativeSolvers>
 
 template<typename T, typename I_> void test_incomplete_cholesky_T()
 {
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index daf24a7..401adf0 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -60,7 +60,9 @@
 }
 
 template <typename MatrixType>
-void jacobisvd_verify_assert(const MatrixType& m = MatrixType()) {
+void jacobisvd_verify_assert(const MatrixType& input = MatrixType()) {
+  MatrixType m(input.rows(), input.cols());
+  svd_fill_random(m);
   svd_verify_assert<MatrixType, 0>(m);
   svd_verify_assert<MatrixType, ColPivHouseholderQRPreconditioner>(m);
   svd_verify_assert<MatrixType, HouseholderQRPreconditioner>(m);
diff --git a/test/sparse.h b/test/sparse.h
index f3e697d..78e6619 100644
--- a/test/sparse.h
+++ b/test/sparse.h
@@ -152,6 +152,4 @@
   }
 }
 
-
-#include <unsupported/Eigen/SparseExtra>
 #endif // EIGEN_TESTSPARSE_H
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 52ff7b7..6e2661e 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -477,14 +477,45 @@
         refMat_prod(r,c) *= v;
       refMat_last(r,c) = v;
     }
+
     SparseMatrixType m(rows,cols);
     m.setFromTriplets(triplets.begin(), triplets.end());
     VERIFY_IS_APPROX(m, refMat_sum);
+    VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
 
     m.setFromTriplets(triplets.begin(), triplets.end(), std::multiplies<Scalar>());
     VERIFY_IS_APPROX(m, refMat_prod);
+    VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
     m.setFromTriplets(triplets.begin(), triplets.end(), [] (Scalar,Scalar b) { return b; });
     VERIFY_IS_APPROX(m, refMat_last);
+    VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
+
+    // test setFromSortedTriplets
+
+    struct triplet_comp {
+      inline bool operator()(const TripletType& a, const TripletType& b) {
+        return SparseMatrixType::IsRowMajor ? ((a.row() != b.row()) ? (a.row() < b.row()) : (a.col() < b.col()))
+                                            : ((a.col() != b.col()) ? (a.col() < b.col()) : (a.row() < b.row()));
+      }
+    };
+
+    // stable_sort is only necessary when the reduction functor depends on the
+    // order of the triplets, as is the case with refMat_last (last value wins).
+    // For most cases, std::sort is sufficient and preferred.
+    std::stable_sort(triplets.begin(), triplets.end(), triplet_comp());
+
+    m.setZero();
+    m.setFromSortedTriplets(triplets.begin(), triplets.end());
+    VERIFY_IS_APPROX(m, refMat_sum);
+    VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
+
+    m.setFromSortedTriplets(triplets.begin(), triplets.end(), std::multiplies<Scalar>());
+    VERIFY_IS_APPROX(m, refMat_prod);
+    VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
+
+    m.setFromSortedTriplets(triplets.begin(), triplets.end(), [](Scalar, Scalar b) { return b; });
+    VERIFY_IS_APPROX(m, refMat_last);
+    VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
   }
   
   // test Map
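
For reference, a minimal standalone sketch of the sorted-triplet path exercised above, assuming the post-overhaul SparseCore API as used in the test (setFromSortedTriplets with an optional duplicate-combining functor); sizes and values are made up:

    #include <Eigen/SparseCore>
    #include <algorithm>
    #include <vector>

    int main() {
      using T = Eigen::Triplet<double>;
      std::vector<T> triplets = {T(0, 1, 1.0), T(2, 0, 2.0), T(0, 1, 3.0)};  // (0,1) is duplicated

      // Column-major is the default, so sort by (col, row), as triplet_comp does above.
      // stable_sort keeps the relative order of duplicates, which matters for the
      // "last value wins" functor below.
      std::stable_sort(triplets.begin(), triplets.end(), [](const T& a, const T& b) {
        return a.col() != b.col() ? a.col() < b.col() : a.row() < b.row();
      });

      Eigen::SparseMatrix<double> m(3, 3);
      m.setFromSortedTriplets(triplets.begin(), triplets.end(),
                              [](double, double b) { return b; });  // m.coeff(0, 1) == 3.0
      return 0;
    }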
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index dbf549a..8b46fc8 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -477,7 +477,7 @@
     initSparse<Real>(0.1, refA, A);
     initSparse<Real>(0.1, refB, B);
 
-    SparseMatrix<Real, OrderC, std::int8_t> result;
+    SparseMatrix<Real, OrderC, std::int16_t> result;
     SparseMatrix<Real, OrderC> result_large;
     DenseMat refResult;
 
@@ -486,8 +486,8 @@
 
   // Case: Small input but large result
   {
-    SparseMatrix<Real, OrderA, std::int8_t> A(127, 8);
-    SparseMatrix<Real, OrderB, std::int8_t> B(8, 127);
+    SparseMatrix<Real, OrderA, std::int16_t> A(127, 8);
+    SparseMatrix<Real, OrderB, std::int16_t> B(8, 127);
     DenseMat refA(127, 8);
     DenseMat refB(8, 127);
 
diff --git a/test/sparselu.cpp b/test/sparselu.cpp
index 84cc6eb..0f9d25f 100644
--- a/test/sparselu.cpp
+++ b/test/sparselu.cpp
@@ -17,7 +17,6 @@
 
 #include "sparse_solver.h"
 #include <Eigen/SparseLU>
-#include <unsupported/Eigen/SparseExtra>
 
 template<typename T> void test_sparselu_T()
 {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 60748cc..4b3d423 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -10,9 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
 
-#if defined(EIGEN_USE_LIBXSMM)
-#include "third_party/libxsmm/include/libxsmm.h"
-#endif
 
 #include "./InternalHeaderCheck.h"
 
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index 4a1f938..05a9066 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -7,7 +7,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#include "sparse_product.cpp"
+#include "sparse.h"
 
 #ifdef min
 #undef min