Update Eigen to commit:6156797016164b87b3e360e02d0e4107f7f66fbc
CHANGELOG
=========
615679701 - Revert "Add template to specify QR permutation index type, Fix ColPivHouseholderQR Lapacke bindings"
be7791e09 - Add template to specify QR permutation index type, Fix ColPivHouseholderQR Lapacke bindings
9463fc95f - change insert strategy
c54785b07 - Fix error: unused parameter 'tmp' [-Werror,-Wunused-parameter] on clang/32-bit arm
f47472603 - Add missing header for GPU tests.
81172cbdc - Overhaul Sparse Core
925518189 - Modified spbenchsolver help message because it could be misunderstood
d20fe21ae - Improve performance for Power10 MMA bfloat16 GEMM
fe7f52778 - Fix guard macros for emulated FP16 operators on GPU
b8422c99c - Update file jacobisvd.cpp
262194f12 - Fix a bunch of minor build and test issues.
PiperOrigin-RevId: 501682808
Change-Id: If89a17cd8160863cb158c4cd4f0814260bd5857d
diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport
index 33c3370..d83495e 100644
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -28,7 +28,7 @@
*
*/
-#include "Eigen/CholmodSupport"
+#include "CholmodSupport"
#include "src/SPQRSupport/SuiteSparseQRSupport.h"
#endif
diff --git a/Eigen/SparseCore b/Eigen/SparseCore
index b2db46b..352f1ec 100644
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -17,6 +17,7 @@
#include <cstdlib>
#include <cstring>
#include <algorithm>
+#include <numeric>
/**
* \defgroup SparseCore_Module SparseCore module
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index e1c17fc..5e5eead 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -294,7 +294,7 @@
Packet mask = pcmp_eq(pset1<Packet>(value), p);
Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
this->res = value;
- this->row = Derived::IsRowMajor ? i : i + max_idx;;
+ this->row = Derived::IsRowMajor ? i : i + max_idx;
this->col = Derived::IsRowMajor ? j + max_idx : j;
}
}
@@ -342,7 +342,7 @@
struct functor_traits<minmax_coeff_visitor<Scalar, is_min, NaNPropagation> > {
enum {
Cost = NumTraits<Scalar>::AddCost,
- PacketAccess = true
+ PacketAccess = packet_traits<Scalar>::HasCmp
};
};
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 2429c81..61d9618 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -841,6 +841,345 @@
}
};
+#ifdef __MMA__
+// General template for lhs packing, bfloat16 specialization.
+template<typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, true>
+{
+ EIGEN_STRONG_INLINE void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+ {
+ const Index vectorSize = quad_traits<bfloat16>::vectorsize;
+ Index ri = 0, j = 0;
+
+ for(; j + 2*vectorSize <= rows; j+=2*vectorSize)
+ {
+ const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+ Index i = 0;
+
+ if(PanelMode) ri += 2*vectorSize*offset;
+
+ if(StorageOrder == ColMajor)
+ {
+ for(; i + 2 <= depth; i+=2)
+ {
+ PacketBlock<Packet8bf,4> block;
+
+ block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+ block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
+ block.packet[2] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
+ block.packet[3] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 1);
+
+ Packet8bf t0, t1;
+ t0 = vec_mergeh(block.packet[0].m_val, block.packet[2].m_val);
+ t1 = vec_mergel(block.packet[0].m_val, block.packet[2].m_val);
+ block.packet[2] = vec_mergeh(block.packet[1].m_val, block.packet[3].m_val);
+ block.packet[3] = vec_mergel(block.packet[1].m_val, block.packet[3].m_val);
+ block.packet[0] = t0;
+ block.packet[1] = t1;
+
+ storeBlock<bfloat16, Packet8bf, 4>(blockA + ri, block);
+
+ ri += 2*2*vectorSize;
+ }
+ if (depth & 1)
+ {
+ PacketBlock<Packet8bf,2> block;
+
+ block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+ block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
+
+ storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
+
+ ri += 2*vectorSize;
+ }
+ } else {
+ for(; i + vectorSize <= depth; i+=vectorSize)
+ {
+ PacketBlock<Packet8bf,8> block1, block2;
+
+ bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
+ bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block2, lhs2, 1 * vectorSize, i);
+
+ Packet2ul v1[8], v2[8];
+
+ v1[0] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
+ v1[1] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
+ v1[2] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
+ v1[3] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
+ v1[4] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val)));
+ v1[5] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val)));
+ v1[6] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val)));
+ v1[7] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val)));
+ v2[0] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val)));
+ v2[1] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val)));
+ v2[2] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val)));
+ v2[3] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val)));
+ v2[4] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val)));
+ v2[5] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val)));
+ v2[6] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val)));
+ v2[7] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val)));
+
+ block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(v1[0],v1[2]));
+ block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(v1[0],v1[2]));
+ block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(v1[1],v1[3]));
+ block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(v1[1],v1[3]));
+ block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(v1[4],v1[6]));
+ block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(v1[4],v1[6]));
+ block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(v1[5],v1[7]));
+ block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(v1[5],v1[7]));
+ block2.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(v2[0],v2[2]));
+ block2.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(v2[0],v2[2]));
+ block2.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(v2[1],v2[3]));
+ block2.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(v2[1],v2[3]));
+ block2.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(v2[4],v2[6]));
+ block2.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(v2[4],v2[6]));
+ block2.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(v2[5],v2[7]));
+ block2.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(v2[5],v2[7]));
+
+
+ for(Index M = 0; M < 8; M+=2) {
+ pstore<bfloat16>(blockA + ri + (0 * vectorSize) + (2*vectorSize * M), block1.packet[M+0]);
+ pstore<bfloat16>(blockA + ri + (1 * vectorSize) + (2*vectorSize * M), block1.packet[M+1]);
+ pstore<bfloat16>(blockA + ri + (2 * vectorSize) + (2*vectorSize * M), block2.packet[M+0]);
+ pstore<bfloat16>(blockA + ri + (3 * vectorSize) + (2*vectorSize * M), block2.packet[M+1]);
+ }
+
+ ri += 2*vectorSize*vectorSize;
+ }
+ for(; i + 2 <= depth; i+=2)
+ {
+ for(Index M = 0; M < 2*vectorSize; M++) {
+ blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
+ blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
+ }
+
+ ri += 2*2*vectorSize;
+ }
+ if (depth & 1)
+ {
+ for(Index M = 0; M < 2*vectorSize; M++) {
+ blockA[ri + M] = lhs2(M, i);
+ }
+ ri += 2*vectorSize;
+ }
+ }
+
+ if(PanelMode) ri += 2*vectorSize*(stride - offset - depth);
+ }
+ for(; j + vectorSize <= rows; j+=vectorSize)
+ {
+ const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+ Index i = 0;
+
+ if(PanelMode) ri += vectorSize*offset;
+
+ if(StorageOrder == ColMajor)
+ {
+ for(; i + 2 <= depth; i+=2)
+ {
+ PacketBlock<Packet8bf,2> block;
+
+ block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+ block.packet[1] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
+
+ Packet8bf t0;
+ t0 = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+ block.packet[1] = vec_mergel(block.packet[0].m_val, block.packet[1].m_val);
+ block.packet[0] = t0;
+
+ storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
+
+ ri += 2*vectorSize;
+ }
+ if (depth & 1)
+ {
+ Packet8bf lhsV = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+ pstore<bfloat16>(blockA + ri, lhsV);
+
+ ri += vectorSize;
+ }
+ } else {
+ for(; i + vectorSize <= depth; i+=vectorSize)
+ {
+ PacketBlock<Packet8bf,8> block1;
+
+ bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
+
+ Packet2ul v1[8];
+
+ v1[0] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
+ v1[1] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val)));
+ v1[2] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
+ v1[3] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val)));
+ v1[4] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val)));
+ v1[5] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val)));
+ v1[6] = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val)));
+ v1[7] = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val)));
+
+ block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(v1[0],v1[2]));
+ block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(v1[0],v1[2]));
+ block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(v1[1],v1[3]));
+ block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(v1[1],v1[3]));
+ block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(v1[4],v1[6]));
+ block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(v1[4],v1[6]));
+ block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(v1[5],v1[7]));
+ block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(v1[5],v1[7]));
+
+ for(Index M = 0; M < 8; M++) {
+ pstore<bfloat16>(blockA + ri + (vectorSize * M), block1.packet[M]);
+ }
+
+ ri += vectorSize*vectorSize;
+ }
+ for(; i + 2 <= depth; i+=2)
+ {
+ for(Index M = 0; M < vectorSize; M++) {
+ blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
+ blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
+ }
+
+ ri += 2*vectorSize;
+ }
+ if (depth & 1)
+ {
+ for(Index M = 0; M < vectorSize; M++) {
+ blockA[ri + M] = lhs2(M, i);
+ }
+
+ ri += vectorSize;
+ }
+ }
+
+ if(PanelMode) ri += vectorSize*(stride - offset - depth);
+ }
+
+ if(PanelMode) ri += offset;
+
+ for(; j < rows; j++)
+ {
+ const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+ for(Index i = 0; i < depth; i++)
+ {
+ blockA[ri] = lhs2(0, i);
+ ri += 1;
+ }
+
+ if(PanelMode) ri += stride - depth;
+ }
+ }
+};
+
+// General template for rhs packing, bfloat16 specialization.
+template<typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, false>
+{
+ EIGEN_STRONG_INLINE void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+ {
+ const Index vectorSize = quad_traits<bfloat16>::vectorsize;
+ Index ri = 0, j = 0;
+
+ for(; j + 4 <= cols; j+=4)
+ {
+ const DataMapper rhs2 = rhs.getSubMapper(0, j);
+ Index i = 0;
+
+ if(PanelMode) ri += 4*offset;
+
+ for(; i + vectorSize <= depth; i+=vectorSize)
+ {
+ if(StorageOrder == ColMajor)
+ {
+ PacketBlock<Packet8bf,4> block;
+
+ bload<DataMapper, Packet8bf, 4, StorageOrder, false, 4>(block, rhs2, i, 0);
+
+ Packet2ul t0, t1, t2, t3;
+ t0 = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val)));
+ t1 = reinterpret_cast<Packet2ul>(vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val)));
+ t2 = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val)));
+ t3 = reinterpret_cast<Packet2ul>(vec_mergel(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val)));
+ block.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(t0, t1));
+ block.packet[1] = reinterpret_cast<Packet8us>(vec_mergel(t0, t1));
+ block.packet[2] = reinterpret_cast<Packet8us>(vec_mergeh(t2, t3));
+ block.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(t2, t3));
+
+ storeBlock<bfloat16, Packet8bf, 4>(blockB + ri, block);
+ } else {
+ PacketBlock<Packet8bf,8> block;
+
+ for (int M = 0; M < 8; M++) {
+ block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
+ }
+
+ block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+ block.packet[1] = vec_mergeh(block.packet[2].m_val, block.packet[3].m_val);
+ block.packet[2] = vec_mergeh(block.packet[4].m_val, block.packet[5].m_val);
+ block.packet[3] = vec_mergeh(block.packet[6].m_val, block.packet[7].m_val);
+
+ const Index size = 16 / sizeof(bfloat16);
+
+ for (int M = 0; M < 4; M++) {
+ pstore<bfloat16>(blockB + ri + (M * size), block.packet[M]);
+ }
+ }
+
+ ri += 4*vectorSize;
+ }
+ for (; i + 2 <= depth; i += 2) {
+ if(StorageOrder == ColMajor)
+ {
+ blockB[ri+0] = rhs2(i + 0, 0);
+ blockB[ri+1] = rhs2(i + 1, 0);
+ blockB[ri+2] = rhs2(i + 0, 1);
+ blockB[ri+3] = rhs2(i + 1, 1);
+ blockB[ri+4] = rhs2(i + 0, 2);
+ blockB[ri+5] = rhs2(i + 1, 2);
+ blockB[ri+6] = rhs2(i + 0, 3);
+ blockB[ri+7] = rhs2(i + 1, 3);
+ } else {
+ PacketBlock<Packet8bf,2> block;
+
+ for (int M = 0; M < 2; M++) {
+ block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
+ }
+
+ block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+
+ pstore<bfloat16>(blockB + ri, block.packet[0]);
+ }
+
+ ri += 4*2;
+ }
+ if (depth & 1)
+ {
+ blockB[ri+0] = rhs2(i, 0);
+ blockB[ri+1] = rhs2(i, 1);
+ blockB[ri+2] = rhs2(i, 2);
+ blockB[ri+3] = rhs2(i, 3);
+
+ ri += 4;
+ }
+
+ if(PanelMode) ri += 4*(stride - offset - depth);
+ }
+
+ if(PanelMode) ri += offset;
+
+ for(; j < cols; j++)
+ {
+ const DataMapper rhs2 = rhs.getSubMapper(0, j);
+ for(Index i = 0; i < depth; i++)
+ {
+ blockB[ri] = rhs2(i, 0);
+ ri += 1;
+ }
+
+ if(PanelMode) ri += stride - depth;
+ }
+ }
+};
+#endif
+
// General template for lhs complex packing, float64 specialization.
template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
@@ -2322,6 +2661,64 @@
}
#endif
+#ifdef __MMA__
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+ void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+ ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+ dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, false> pack;
+ pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+ void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+ ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+ dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, false> pack;
+ pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+ void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+ ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, true> pack;
+ pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+ void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+ ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, true> pack;
+ pack(blockA, lhs, depth, rows, stride, offset);
+}
+#endif
+
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
index b3e063d..01465d3 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -1,117 +1,64 @@
- #ifndef EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
- #define EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+#ifndef EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+
+#if EIGEN_COMP_LLVM
+#define BFLOAT16_UNROLL _Pragma("unroll 8")
+#else
+#define BFLOAT16_UNROLL _Pragma("GCC unroll(8)")
+#endif
namespace Eigen {
namespace internal {
-EIGEN_STRONG_INLINE void pgerMMAbfloat16(__vector_quad* acc, const Packet8bf& a, const Packet8bf& b, int maskX, int maskY)
-{
- switch(maskX){
- case 15:
- switch(maskY){
- case 0b1111:
- __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
- break;
- case 0b0011:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b11, 0b11);
- break;
- case 0b0001:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b1, 0b11);
- break;
- case 0b0111:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1111, 0b111, 0b11);
- break;
- }
- break;
- case 3:
- switch(maskY){
- case 0b1111:
- __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
- break;
- case 0b0011:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b11, 0b11);
- break;
- case 0b0001:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b1, 0b11);
- break;
- case 0b0111:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b11, 0b111, 0b11);
- break;
- }
- break;
- case 1:
- switch(maskY){
- case 0b1111:
- __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
- break;
- case 0b0011:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b11, 0b11);
- break;
- case 0b0001:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b1, 0b11);
- break;
- case 0b0111:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b1, 0b111, 0b11);
- break;
- }
- break;
- case 0b0111:
- switch(maskY){
- case 0b1111:
- __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val));
- break;
- case 0b0011:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b11, 0b11);
- break;
- case 0b0001:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b1, 0b11);
- break;
- case 0b0111:
- __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast<Packet16uc>(a.m_val), reinterpret_cast<Packet16uc>(b.m_val), 0b111, 0b111, 0b11);
- break;
- }
- break;
- }
-}
-
-EIGEN_STRONG_INLINE void scaleAndStore(float* result, float* acc, Packet4f pAlpha)
+EIGEN_ALWAYS_INLINE void scaleAndStore(float* result, Packet4f& acc, const Packet4f& pAlpha)
{
Packet4f result_block = ploadu<Packet4f>(result);
- Packet4f packet_pmadd = pmadd(pload<Packet4f>(acc), pAlpha, result_block);
- pstoreu(result, packet_pmadd);
+ result_block = pmadd(acc, pAlpha, result_block);
+ pstoreu(result, result_block);
}
-template<int num_packets, bool zero>
-EIGEN_STRONG_INLINE Packet8bf loadLhsBfloat16(const bfloat16* indexA)
+template<Index num_packets, bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16(const bfloat16* indexA)
{
Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
- Packet8bf lhs2;
- const int packet_size = 8; //We fit 8 bfloat16 on a 128 register
if(zero){
- lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+ Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+ return vec_mergeh(lhs1.m_val, lhs2.m_val);
+ } else {
+ return lhs1;
}
- else lhs2 = ploadu<Packet8bf>(indexA + num_packets*packet_size);
- return vec_mergeh(lhs1.m_val, lhs2.m_val);
}
template<bool zero>
-EIGEN_STRONG_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, int extra_rows)
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16Extra(const bfloat16* indexA, Index strideA, Index extra_rows)
{
- EIGEN_ALIGN16 bfloat16 lhs_array[8] = {Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0)};
- int count = 0;
- const bfloat16* idxA = indexA + row*strideA;
- for(int row_count = 0; row_count < extra_rows; row_count++){
- lhs_array[count++] = *idxA;
- if(!zero) lhs_array[count] = *(idxA+1);
- count++;
- idxA += strideA;
+ Index row_count = 0;
+ if (zero) {
+ EIGEN_ALIGN16 bfloat16 lhs_array[8] = { Eigen::bfloat16(0) };
+ do{
+ lhs_array[row_count] = *indexA;
+ indexA += strideA;
+ } while ((row_count += 2) < extra_rows*2);
+ return pload_partial<Packet8bf>(lhs_array, extra_rows*2);
+ } else {
+ EIGEN_ALIGN16 int lhs_array[4];
+ do{
+ lhs_array[row_count] = *reinterpret_cast<const int *>(indexA);
+ indexA += strideA;
+ } while ((row_count += 1) < extra_rows);
+ return reinterpret_cast<Packet8us>(pload_partial<Packet4i>(lhs_array, extra_rows));
}
- return pload<Packet8bf>(lhs_array);
}
template<bool zero>
-EIGEN_STRONG_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, int i, int k)
+EIGEN_ALWAYS_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, Index extra_rows)
+{
+ return loadBfloat16Extra<zero>(indexA + row*strideA, strideA, extra_rows);
+}
+
+template<bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, Index i, Index k)
{
const bfloat16* indexB = baseB + strideB*4*i + (k*4);
Packet8bf rhs1 = ploadu<Packet8bf>(indexB);
@@ -119,28 +66,16 @@
Packet8bf rhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
return vec_mergeh(rhs1.m_val, rhs2.m_val);
}
- //r = vec_perm (a, b, c)
- //Let v be the concatenation of a and b.
- //Each byte of r selected by using the least-significant 5 bits of the corresponding byte of c as an index into v
- //We need this elements from rhs: 0, 4, 1, 5, 2, 6, 3, 7
- Packet16uc c = {0x0u, 0x1u, 0x8u, 0x9u, 0x2u, 0x3u, 0xAu, 0xB, 0x4, 0x5, 0xCu, 0xDu, 0x6u, 0x7u, 0xEu, 0xFu};
- return vec_perm(rhs1.m_val, rhs1.m_val, c);
+ return rhs1;
}
template<bool zero>
-EIGEN_STRONG_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, int i, int k, int extra_cols)
+EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, Index i, Index k, Index extra_cols)
{
- EIGEN_ALIGN16 bfloat16 rhs_vector[8] = {Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0)};
- const bfloat16* indexB = blockB + ((col+4*i)*strideB)+k+offsetB;
- for(int c = 0; c < extra_cols; c++){
- rhs_vector[2*c] = *indexB;
- if(!zero) rhs_vector[2*c+1] = *(indexB+1);
- indexB += strideB;
- }
- return pload<Packet8bf>(rhs_vector);
+ return loadBfloat16Extra<zero>(blockB + ((col+4*i)*strideB)+k+offsetB, strideB, extra_cols);
}
-template<int num_acc, int num_packets, bool zero, bool rhs_extra_cols, bool lhs_extra_rows>
+template<Index num_acc, Index num_packets, bool zero, bool rhs_extra_cols, bool lhs_extra_rows>
EIGEN_STRONG_INLINE void KLoop
(
const bfloat16* indexA,
@@ -152,107 +87,95 @@
Index k,
Index row,
Index col,
- int extra_rows,
- int extra_cols,
- int mask_rows = 0xF,
- int mask_cols = 0xF
+ Index extra_rows,
+ Index extra_cols
)
{
Packet8bf lhs;
Packet8bf rhs[num_acc];
if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows<zero>(indexA+k, strideA, row, extra_rows);
- else lhs = loadLhsBfloat16<num_packets, zero>(indexA + k*num_packets*8); //a packet of bfloat16 has 8 elements
- for(int i = 0; i < num_acc; i++){
+ else lhs = loadLhsBfloat16<num_packets, zero>(indexA + k*num_packets); //a packet of bfloat16 has 8 elements
+ BFLOAT16_UNROLL
+ for(Index i = 0; i < num_acc; i++){
if(!rhs_extra_cols)
rhs[i] = loadRhsBfloat16<zero>(indexB, strideB, i, k);
else{
rhs[i] = loadRhsBfloat16ExtraCols<zero>(indexB, strideB, offsetB, col, i, k, extra_cols);
}
- pgerMMAbfloat16(&(quad_acc[i]), rhs[i], lhs, mask_cols, mask_rows);
+ __builtin_mma_xvbf16ger2pp(&(quad_acc[i]), reinterpret_cast<Packet16uc>(rhs[i].m_val), reinterpret_cast<Packet16uc>(lhs.m_val));
}
}
-template<const int num_acc, const int standard_block_size, const int num_packets, bool rhsExtraCols = false, bool lhsExtraRows = false>
-void colLoopBody(Index* p_col, Index row, Index depth, Index cols, Index rows, int offset_row, int block_index, Packet4f pAlpha, const bfloat16* indexA, Index strideA, const bfloat16* blockB, Index strideB, Index offsetB, float* result, int extra_cols = 0, int extra_rows = 0, int mask_cols = 0xF, int mask_rows = 0xF)
+template<const Index num_acc, const Index num_packets, bool rhsExtraCols = false, bool lhsExtraRows = false>
+void colLoopBody(Index& col, Index row, Index depth, Index cols, Index rows, Index offset_row, Index block_index, const Packet4f& pAlpha, const bfloat16* indexA, Index strideA, const bfloat16* blockB, Index strideB, Index offsetB, float* result, Index extra_cols = 0, Index extra_rows = 0)
{
- int col = *p_col;
- int count;
- int max, step, bound;
- const bfloat16* indexB;
+ const Index step = rhsExtraCols ? 1 : (num_acc * 4); //each accumulator has 4 elements
+ const bfloat16* indexB = rhsExtraCols ? blockB : (blockB + 4*offsetB + strideB*col);
- if(num_acc == 1) bound = 0;
- else bound = 1;
-
- if(rhsExtraCols){
- count = 0;
- max = 1;
- step = 1;
- indexB = blockB;
- }
- else{
- count = col;
- step = num_acc * 4; //each accumulator has 4 elements
- max = cols/step;
- indexB = blockB + 4*offsetB + strideB*col;
- }
-
- while(count/step + bound < max){
+ while(col + step <= cols){
Index k = 0;
- EIGEN_ALIGN32 float acc[num_acc][4][4];
+ Packet4f acc[num_acc][4];
__vector_quad quad_acc[num_acc];
- for(int i = 0; i < num_acc; i++)
+ BFLOAT16_UNROLL
+ for(Index i = 0; i < num_acc; i++)
__builtin_mma_xxsetaccz(&(quad_acc[i]));
- if(depth%2 != 0){
- KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols, mask_rows, mask_cols);
- k = 1;
+ for(; k + 2 <= depth; k += 2){
+ KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
}
- for(; k/2 < depth/2; k += 2){
- KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows>(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols, mask_rows, mask_cols);
+ if(depth&1){
+ KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows>(indexA-offset_row, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols);
}
- for(int i = 0; i < num_acc; i++){
+
+ BFLOAT16_UNROLL
+ for(Index i = 0; i < num_acc; i++)
__builtin_mma_disassemble_acc((void*)acc[i], &(quad_acc[i]));
+
+ for(Index i = 0; i < num_acc; i++){
if(lhsExtraRows){
- for(int x = 0; x < extra_cols; x++){
- for(int y = 0; y < extra_rows; y++){
- result[((col+i*4)+x)*rows + row + y] += acc[i][x][y]*(pAlpha[0]);
- }
+ float *r = result + (col+i*4)*rows + row;
+ for(Index x = 0; x < extra_cols; x++, r += rows){
+ Packet4f result_block = ploadu_partial<Packet4f>(r, extra_rows);
+ result_block = pmadd(acc[i][x], pAlpha, result_block);
+ pstoreu_partial<float>(r, result_block, extra_rows);
}
}
else{
if(rhsExtraCols){
- for(int x = 0; x < cols-col; x++){
- scaleAndStore(result + ((col+i*4)+x)*rows + row + offset_row,acc[i][x], pAlpha);
+ float *r = result + (col+i*4)*rows + row + offset_row;
+ for(Index x = 0; x < cols-col; x++, r += rows){
+ scaleAndStore(r,acc[i][x], pAlpha);
}
}
else{
- for(int x = 0; x < 4; x++){
- scaleAndStore(result + ((col+i*4)+x)*rows + (block_index*16) + offset_row,acc[i][x], pAlpha);
+ float *r = result + (col+i*4)*rows + (block_index*16) + offset_row;
+ for(Index x = 0; x < 4; x++, r += rows){
+ scaleAndStore(r,acc[i][x], pAlpha);
}
}
}
}
- count += step;
- if(!rhsExtraCols) {
- indexB += strideB*step;
- col += step;
- }
+ if(rhsExtraCols) return;
+ indexB += strideB*step;
+ col += step;
}
- *p_col = col;
}
template<typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
void gemmMMAbfloat16(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
-
if(rows == 0 || cols == 0 || depth == 0) return;
- const Packet4f pAlpha = pset1<Packet4f>(Eigen::bfloat16_impl::bfloat16_to_float(alpha));
+ float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
+ if (falpha == float(0)) return;
+ const Packet4f pAlpha = pset1<Packet4f>(falpha);
ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
- for(int j = 0; j < cols; j++){
- for(int i = 0; i < rows; i++){
- result[j*rows + i] = res(i,j);
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ for(Index j = 0; j < cols; j++){
+ const LinearMapper res2 = res.getLinearMapper(0, j);
+ for(Index i = 0; i < rows; i++){
+ result[j*rows + i] = res2(i);
}
}
@@ -268,26 +191,27 @@
//Blocks of 8 columns with <8 elements as row major. This happens when there's less than 8 remaining rows
//Loop for LHS standard block (8x16)
- int standard_block_size = 16;
- const int standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks
- int bigSuffix = (2*8) * (strideA-offsetA-depth);
+ const Index standard_block_size = 16;
+ const Index standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks
+ Index bigSuffix = (2*8) * (strideA-offsetA-depth);
const bfloat16* indexA = blockA;
- int block_index;
+ const Index offset_factor = 2;
+ Index block_index;
for(block_index = 0; block_index < standard_blocks_quantity; block_index++){
indexA += 2*8*offsetA;
- for(int offset_row = 0; offset_row < standard_block_size; offset_row += 4){ //This block size has 16 rows maximum
+ for(Index offset_row = 0; offset_row < standard_block_size; offset_row += 4){ //This block size has 16 rows maximum
col = 0;
- colLoopBody<5, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
- colLoopBody<4, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
- colLoopBody<3, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
- colLoopBody<2, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
- colLoopBody<1, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<7, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<6, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<5, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<4, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<3, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<2, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<1, 16>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
if(cols > col){
- int extra_cols= cols-col;
- int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
- int mask_cols= 0xF >> shift;
+ Index extra_cols= cols-col;
//Remember: It doesnt make sense use multiple acc to extra_cols as we are unrolling col loop
- colLoopBody<1, 16, 2, true>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result, extra_cols, 4, mask_cols, 0xF);
+ colLoopBody<1, 16, true>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
}
}
row += 16;
@@ -296,75 +220,66 @@
//LHS (8x8) block
if(rows - standard_blocks_quantity*16 >= 8){
indexA += 1*8*offsetA + 2*8*offsetA;
- for(int offset_row = 0; offset_row < 8; offset_row += 4){
+ for(Index offset_row = 0; offset_row < 8; offset_row += 4){
col = 0;
- colLoopBody<5, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
- colLoopBody<4, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
- colLoopBody<3, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
- colLoopBody<2, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
- colLoopBody<1, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<7, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<6, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<5, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<4, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<3, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<2, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
+ colLoopBody<1, 8>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result);
}
if(cols > col){
- int extra_cols= cols-col;
- int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
- int mask_cols= 0xF >> shift;
+ Index extra_cols= cols-col;
- for(int offset_row = 0; offset_row < 8; offset_row += 4){
- colLoopBody<1, 8, 1, true>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result, extra_cols, 4, mask_cols, 0xF);
+ for(Index offset_row = 0; offset_row < 8; offset_row += 4){
+ colLoopBody<1, 8, true>(col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row*offset_factor, strideA, blockB, strideB, offsetB, result, extra_cols, 4);
}
} //end extra cols
row += 8;
}
//extra rows
while(row < rows){
- int extra_rows = rows-row;
- int shift = (4-extra_rows >= 0) ? 4-extra_rows : 0;
- int mask_rows = 0xF >> shift;
- int extra_rows_or_four = (extra_rows <= 4) ? extra_rows : 4;
+ Index extra_rows = rows-row;
+ Index extra_rows_or_four = (extra_rows <= 4) ? extra_rows : 4;
//This index is the beginning of remaining block.
//This last block for LHS is organized as RowMajor
col = 0;
- colLoopBody<5, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
- colLoopBody<4, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
- colLoopBody<3, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
- colLoopBody<2, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
- colLoopBody<1, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows);
+ colLoopBody<7, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+ colLoopBody<6, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+ colLoopBody<5, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+ colLoopBody<4, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+ colLoopBody<3, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+ colLoopBody<2, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
+ colLoopBody<1, 8, false, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four);
if(cols > col){
- int extra_cols= cols-col;
- int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0;
- int mask_cols= 0xF >> shift;
- int extra_cols_or_four = (extra_cols <= 4) ? extra_cols : 4;
+ Index extra_cols= cols-col;
- colLoopBody<1, 8, 1, true, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, extra_cols_or_four, extra_rows_or_four, mask_cols, mask_rows);
+ colLoopBody<1, 8, true, true>(col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, extra_cols, extra_rows_or_four);
}
row += extra_rows_or_four;
}
//Convert back to bfloat16
- for(col = 0; col/4 < cols/4; col += 4){
- int row;
- for(row = 0; row/8 < rows/8; row += 8){
+ for(col = 0; col + 4 <= cols; col += 4){
+ const DataMapper res2 = res.getSubMapper(0, col);
+ for(row = 0; row + 8 <= rows; row += 8){
//get and save block
PacketBlock<Packet8bf,4> block;
- for(int j = 0; j < 4; j++){
- Packet4f temp_even, temp_odd;
- EIGEN_ALIGN32 float even[4], odd[4];
- for(int i = 0; i < 4; i++){
- even[i] = result[(col + j)*rows + row + i*2];
- odd[i] = result[(col + j)*rows + row + i*2+1];
- }
- temp_even = pload<Packet4f>(even);
- temp_odd = pload<Packet4f>(odd);
- block.packet[j] = F32ToBf16(temp_even, temp_odd);
+ for(Index j = 0; j < 4; j++){
+ Packet16uc fp16_0 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(pload<Packet4f>(result + (col + j)*rows + row)));
+ Packet16uc fp16_1 = __builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(pload<Packet4f>(result + (col + j)*rows + row + 4)));
+ block.packet[j].m_val = vec_pack(reinterpret_cast<Packet4ui>(fp16_0), reinterpret_cast<Packet4ui>(fp16_1));
}
- res.template storePacketBlock<Packet8bf,4>(row, col, block);
+ res2.template storePacketBlock<Packet8bf,4>(row, 0, block);
}
//extra rows
while(row < rows){
- for(int col_off = 0; col_off < 4; col_off++){
- res(row, col+col_off) = Eigen::bfloat16(result[(col+col_off)*rows+row]);
+ for(Index col_off = 0; col_off < 4; col_off++){
+ res2(row, col_off) = Eigen::bfloat16(result[(col+col_off)*rows+row]);
}
row++;
}
@@ -372,8 +287,9 @@
}
//extra cols
while(col < cols){
- for(int r = 0; r < rows; r++){
- res(r, col) = Eigen::bfloat16(result[col*rows + r]);
+ const LinearMapper res2 = res.getLinearMapper(0, col);
+ for(Index r= 0; r< rows; r++){
+ res2(r) = Eigen::bfloat16(result[col*rows + r]);
}
col++;
}
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 75d6228..6ff3e20 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -445,7 +445,7 @@
// of the functions, while the latter can only deal with one of them.
#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
-#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
// We need to provide emulated *host-side* FP16 operators for clang.
#pragma push_macro("EIGEN_DEVICE_FUNC")
#undef EIGEN_DEVICE_FUNC
@@ -510,7 +510,7 @@
return float(a) >= float(b);
}
-#if defined(__clang__) && defined(__CUDA__)
+#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
#pragma pop_macro("EIGEN_DEVICE_FUNC")
#endif
#endif // Emulate support for half floats
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
index 5dc5ef8..c892d39 100644
--- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -26,7 +26,7 @@
template <typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
- Packet4f& c, Packet4f& tmp,
+ Packet4f& c, Packet4f&,
const LaneIdType&) const {
acc(a, b, c);
}
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 3f7638e..a2486af 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -380,11 +380,6 @@
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
spbh.store(sup, i,j,block);
sup->template storePacket<SubPacket>(i, j+idx, block.packet[idx]);
- //for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
- //{
- // Scalar_ *v = &sup->operator()(i+l, j+idx);
- // *v = *reinterpret_cast<Scalar_ *>(&block.packet[idx][l]);
- //}
}
};
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index 733b1aa..d89d99e 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -132,15 +132,7 @@
/** \returns the largest \c k in [start,end) such that for all \c j in [start,k) index[\c j]\<\a key */
inline Index searchLowerIndex(Index start, Index end, Index key) const
{
- while(end>start)
- {
- Index mid = (end+start)>>1;
- if (m_indices[mid]<key)
- start = mid+1;
- else
- end = mid;
- }
- return start;
+ return static_cast<Index>(std::distance(m_indices, std::lower_bound(m_indices + start, m_indices + end, key)));
}
/** \returns the stored value at index \a key
@@ -198,19 +190,12 @@
return m_values[id];
}
- void moveChunk(Index from, Index to, Index chunkSize)
+ inline void moveChunk(Index from, Index to, Index chunkSize)
{
- eigen_internal_assert(to+chunkSize <= m_size);
- if(to>from && from+chunkSize>to)
- {
- // move backward
- internal::smart_memmove(m_values+from, m_values+from+chunkSize, m_values+to);
- internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to);
- }
- else
- {
- internal::smart_copy(m_values+from, m_values+from+chunkSize, m_values+to);
- internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+ eigen_internal_assert(chunkSize >= 0 && to+chunkSize <= m_size);
+ if (chunkSize > 0) {
+ internal::smart_memmove(m_values + from, m_values + from + chunkSize, m_values + to);
+ internal::smart_memmove(m_indices + from, m_indices + from + chunkSize, m_indices + to);
}
}
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 6806812..eaefcef 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -92,6 +92,31 @@
};
};
+template <typename StorageIndex>
+struct sparse_reserve_op {
+ EIGEN_DEVICE_FUNC sparse_reserve_op(Index begin, Index end, Index size) {
+ Index range = numext::mini(end - begin, size);
+ m_begin = begin;
+ m_end = begin + range;
+ m_val = StorageIndex(size / range);
+ m_remainder = StorageIndex(size % range);
+ }
+ template <typename IndexType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex operator()(IndexType i) const {
+ if ((i >= m_begin) && (i < m_end))
+ return m_val + ((i - m_begin) < m_remainder ? 1 : 0);
+ else
+ return 0;
+ }
+ StorageIndex m_val, m_remainder;
+ Index m_begin, m_end;
+};
+
+template <typename Scalar>
+struct functor_traits<sparse_reserve_op<Scalar>> {
+ enum { Cost = 1, PacketAccess = false, IsRepeatable = true };
+};
+
} // end namespace internal
template<typename Scalar_, int Options_, typename StorageIndex_>
@@ -196,7 +221,7 @@
const Index outer = IsRowMajor ? row : col;
const Index inner = IsRowMajor ? col : row;
Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
- return m_data.atInRange(m_outerIndex[outer], end, StorageIndex(inner));
+ return m_data.atInRange(m_outerIndex[outer], end, inner);
}
/** \returns a non-const reference to the value of the matrix at position \a i, \a j
@@ -210,20 +235,17 @@
inline Scalar& coeffRef(Index row, Index col)
{
eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-
const Index outer = IsRowMajor ? row : col;
const Index inner = IsRowMajor ? col : row;
-
Index start = m_outerIndex[outer];
- Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
- eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
- if(end<=start)
- return insert(row,col);
- const Index p = m_data.searchLowerIndex(start,end-1,StorageIndex(inner));
- if((p<end) && (m_data.index(p)==inner))
- return m_data.value(p);
+ Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer + 1];
+ eigen_assert(end >= start && "you probably called coeffRef on a non finalized matrix");
+ if (end <= start) return insertAtByOuterInner(outer, inner, start);
+ Index dst = m_data.searchLowerIndex(start, end, inner);
+ if ((dst < end) && (m_data.index(dst) == inner))
+ return m_data.value(dst);
else
- return insert(row,col);
+ return insertAtByOuterInner(outer, inner, dst);
}
/** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
@@ -241,7 +263,7 @@
* if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
*
*/
- Scalar& insert(Index row, Index col);
+ inline Scalar& insert(Index row, Index col);
public:
@@ -301,6 +323,11 @@
if(isCompressed())
{
Index totalReserveSize = 0;
+ for (Index j = 0; j < m_outerSize; ++j) totalReserveSize += reserveSizes[j];
+
+ // if reserveSizes is empty, don't do anything!
+ if (totalReserveSize == 0) return;
+
// turn the matrix into non-compressed mode
m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
@@ -312,18 +339,18 @@
{
newOuterIndex[j] = count;
count += reserveSizes[j] + (m_outerIndex[j+1]-m_outerIndex[j]);
- totalReserveSize += reserveSizes[j];
}
+
m_data.reserve(totalReserveSize);
StorageIndex previousOuterIndex = m_outerIndex[m_outerSize];
for(Index j=m_outerSize-1; j>=0; --j)
{
StorageIndex innerNNZ = previousOuterIndex - m_outerIndex[j];
- for(Index i=innerNNZ-1; i>=0; --i)
- {
- m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
- m_data.value(newOuterIndex[j]+i) = m_data.value(m_outerIndex[j]+i);
- }
+ StorageIndex begin = m_outerIndex[j];
+ StorageIndex end = begin + innerNNZ;
+ StorageIndex target = newOuterIndex[j];
+ internal::smart_memmove(innerIndexPtr() + begin, innerIndexPtr() + end, innerIndexPtr() + target);
+ internal::smart_memmove(valuePtr() + begin, valuePtr() + end, valuePtr() + target);
previousOuterIndex = m_outerIndex[j];
m_outerIndex[j] = newOuterIndex[j];
m_innerNonZeros[j] = innerNNZ;
@@ -350,15 +377,15 @@
m_data.resize(count);
for(Index j=m_outerSize-1; j>=0; --j)
{
- Index offset = newOuterIndex[j] - m_outerIndex[j];
+ StorageIndex offset = newOuterIndex[j] - m_outerIndex[j];
if(offset>0)
{
StorageIndex innerNNZ = m_innerNonZeros[j];
- for(Index i=innerNNZ-1; i>=0; --i)
- {
- m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
- m_data.value(newOuterIndex[j]+i) = m_data.value(m_outerIndex[j]+i);
- }
+ StorageIndex begin = m_outerIndex[j];
+ StorageIndex end = begin + innerNNZ;
+ StorageIndex target = newOuterIndex[j];
+ internal::smart_memmove(innerIndexPtr() + begin, innerIndexPtr() + end, innerIndexPtr() + target);
+ internal::smart_memmove(valuePtr() + begin, valuePtr() + end, valuePtr() + target);
}
}
@@ -392,7 +419,7 @@
{
eigen_assert(Index(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
eigen_assert( (m_outerIndex[outer+1]-m_outerIndex[outer]==0 || m_data.index(m_data.size()-1)<inner) && "Invalid ordered insertion (invalid inner index)");
- Index p = m_outerIndex[outer+1];
+ StorageIndex p = m_outerIndex[outer+1];
++m_outerIndex[outer+1];
m_data.append(Scalar(0), inner);
return m_data.value(p);
@@ -402,7 +429,7 @@
* \warning use it only if you know what you are doing */
inline Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner)
{
- Index p = m_outerIndex[outer+1];
+ StorageIndex p = m_outerIndex[outer+1];
++m_outerIndex[outer+1];
m_data.append(Scalar(0), inner);
return m_data.value(p);
@@ -446,10 +473,14 @@
template<typename InputIterators,typename DupFunctor>
void setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
- void sumupDuplicates() { collapseDuplicates(internal::scalar_sum_op<Scalar,Scalar>()); }
+ template<typename Derived, typename DupFunctor>
+ void collapseDuplicates(DenseBase<Derived>& wi, DupFunctor dup_func = DupFunctor());
- template<typename DupFunctor>
- void collapseDuplicates(DupFunctor dup_func = DupFunctor());
+ template<typename InputIterators>
+ void setFromSortedTriplets(const InputIterators& begin, const InputIterators& end);
+
+ template<typename InputIterators, typename DupFunctor>
+ void setFromSortedTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
//---
@@ -464,27 +495,23 @@
*/
void makeCompressed()
{
- if(isCompressed())
- return;
+ if (isCompressed()) return;
eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0);
- Index oldStart = m_outerIndex[1];
+ StorageIndex start = m_outerIndex[1];
m_outerIndex[1] = m_innerNonZeros[0];
for(Index j=1; j<m_outerSize; ++j)
{
- Index nextOldStart = m_outerIndex[j+1];
- Index offset = oldStart - m_outerIndex[j];
- if(offset>0)
+ StorageIndex end = start + m_innerNonZeros[j];
+ StorageIndex target = m_outerIndex[j];
+ if (start != target)
{
- for(Index k=0; k<m_innerNonZeros[j]; ++k)
- {
- m_data.index(m_outerIndex[j]+k) = m_data.index(oldStart+k);
- m_data.value(m_outerIndex[j]+k) = m_data.value(oldStart+k);
- }
+ internal::smart_memmove(innerIndexPtr() + start, innerIndexPtr() + end, innerIndexPtr() + target);
+ internal::smart_memmove(valuePtr() + start, valuePtr() + end, valuePtr() + target);
}
- m_outerIndex[j+1] = m_outerIndex[j] + m_innerNonZeros[j];
- oldStart = nextOldStart;
+ start = m_outerIndex[j + 1];
+ m_outerIndex[j + 1] = m_outerIndex[j] + m_innerNonZeros[j];
}
internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
m_innerNonZeros = 0;
@@ -495,13 +522,12 @@
/** Turns the matrix into the uncompressed mode */
void uncompress()
{
- if(m_innerNonZeros != 0)
- return;
+ if(m_innerNonZeros != 0) return;
m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
- for (Index i = 0; i < m_outerSize; i++)
- {
- m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
- }
+ typename IndexVector::AlignedMapType innerNonZeroMap(m_innerNonZeros, m_outerSize);
+ typename IndexVector::ConstAlignedMapType outerIndexMap(m_outerIndex, m_outerSize);
+ typename IndexVector::ConstMapType nextOuterIndexMap(m_outerIndex + 1, m_outerSize);
+ innerNonZeroMap = nextOuterIndexMap - outerIndexMap;
}
/** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
@@ -520,27 +546,32 @@
template<typename KeepFunc>
void prune(const KeepFunc& keep = KeepFunc())
{
- // TODO optimize the uncompressed mode to avoid moving and allocating the data twice
- makeCompressed();
-
StorageIndex k = 0;
for(Index j=0; j<m_outerSize; ++j)
{
- Index previousStart = m_outerIndex[j];
- m_outerIndex[j] = k;
- Index end = m_outerIndex[j+1];
- for(Index i=previousStart; i<end; ++i)
+ StorageIndex previousStart = m_outerIndex[j];
+ if (isCompressed())
+ m_outerIndex[j] = k;
+ else
+ k = m_outerIndex[j];
+ StorageIndex end = isCompressed() ? m_outerIndex[j+1] : previousStart + m_innerNonZeros[j];
+ for(StorageIndex i=previousStart; i<end; ++i)
{
- if(keep(IsRowMajor?j:m_data.index(i), IsRowMajor?m_data.index(i):j, m_data.value(i)))
- {
+ StorageIndex row = IsRowMajor ? StorageIndex(j) : m_data.index(i);
+ StorageIndex col = IsRowMajor ? m_data.index(i) : StorageIndex(j);
+ bool keepEntry = keep(row, col, m_data.value(i));
+ if (keepEntry) {
m_data.value(k) = m_data.value(i);
m_data.index(k) = m_data.index(i);
++k;
- }
+ } else if (!isCompressed())
+ m_innerNonZeros[j]--;
}
}
- m_outerIndex[m_outerSize] = k;
- m_data.resize(k,0);
+ if (isCompressed()) {
+ m_outerIndex[m_outerSize] = k;
+ m_data.resize(k, 0);
+ }
}
/** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
@@ -551,64 +582,50 @@
*
* \sa reserve(), setZero(), makeCompressed()
*/
- void conservativeResize(Index rows, Index cols)
- {
- // No change
- if (this->rows() == rows && this->cols() == cols) return;
-
+ void conservativeResize(Index rows, Index cols) {
+
// If one dimension is null, then there is nothing to be preserved
- if(rows==0 || cols==0) return resize(rows,cols);
+ if (rows == 0 || cols == 0) return resize(rows, cols);
- Index innerChange = IsRowMajor ? cols - this->cols() : rows - this->rows();
- Index outerChange = IsRowMajor ? rows - this->rows() : cols - this->cols();
- StorageIndex newInnerSize = convert_index(IsRowMajor ? cols : rows);
+ Index newOuterSize = IsRowMajor ? rows : cols;
+ Index newInnerSize = IsRowMajor ? cols : rows;
- // Deals with inner non zeros
- if (m_innerNonZeros)
- {
- // Resize m_innerNonZeros
- m_innerNonZeros = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
- m_innerNonZeros, m_outerSize + outerChange, m_outerSize);
-
- for(Index i=m_outerSize; i<m_outerSize+outerChange; i++)
- m_innerNonZeros[i] = 0;
- }
- else if (innerChange < 0)
- {
- // Inner size decreased: allocate a new m_innerNonZeros
- m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize + outerChange);
- for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
- m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
- for(Index i = m_outerSize; i < m_outerSize + outerChange; i++)
- m_innerNonZeros[i] = 0;
- }
-
- // Change the m_innerNonZeros in case of a decrease of inner size
- if (m_innerNonZeros && innerChange < 0)
- {
- for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
- {
- StorageIndex &n = m_innerNonZeros[i];
- StorageIndex start = m_outerIndex[i];
- while (n > 0 && m_data.index(start+n-1) >= newInnerSize) --n;
+ Index innerChange = newInnerSize - innerSize();
+ Index outerChange = newOuterSize - outerSize();
+
+ if (outerChange != 0) {
+ m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
+ outerIndexPtr(), newOuterSize + 1, outerSize() + 1);
+
+ if (!isCompressed())
+ m_innerNonZeros = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
+ innerNonZeroPtr(), newOuterSize, outerSize());
+
+ if (outerChange > 0) {
+ StorageIndex lastIdx = outerSize() == 0 ? StorageIndex(0) : outerIndexPtr()[outerSize()];
+ std::fill_n(outerIndexPtr() + outerSize(), outerChange + 1, lastIdx);
+
+ if (!isCompressed()) std::fill_n(innerNonZeroPtr() + outerSize(), outerChange, StorageIndex(0));
}
}
-
+ m_outerSize = newOuterSize;
+
+ if (innerChange < 0) {
+ for (Index j = 0; j < outerSize(); j++) {
+ Index start = outerIndexPtr()[j];
+ Index end = isCompressed() ? outerIndexPtr()[j + 1] : start + innerNonZeroPtr()[j];
+ Index lb = data().searchLowerIndex(start, end, newInnerSize);
+ if (lb != end) {
+ uncompress();
+ innerNonZeroPtr()[j] = StorageIndex(lb - start);
+ }
+ }
+ }
m_innerSize = newInnerSize;
- // Re-allocate outer index structure if necessary
- if (outerChange == 0)
- return;
-
- m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(
- m_outerIndex, m_outerSize + outerChange + 1, m_outerSize + 1);
- if (outerChange > 0)
- {
- StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
- for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)
- m_outerIndex[i] = lastIdx;
- }
- m_outerSize += outerChange;
+ Index newSize = outerIndexPtr()[outerSize()];
+ eigen_assert(newSize <= m_data.size());
+ m_data.resize(newSize);
}
/** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero.
@@ -740,12 +757,19 @@
inline void setIdentity()
{
eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
- this->m_data.resize(rows());
- Eigen::Map<IndexVector>(this->m_data.indexPtr(), rows()).setLinSpaced(0, StorageIndex(rows()-1));
- Eigen::Map<ScalarVector>(this->m_data.valuePtr(), rows()).setOnes();
- Eigen::Map<IndexVector>(this->m_outerIndex, rows()+1).setLinSpaced(0, StorageIndex(rows()));
- internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
- m_innerNonZeros = 0;
+ if (m_innerNonZeros) {
+ internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+ m_innerNonZeros = 0;
+ }
+ m_data.resize(rows());
+ // is it necessary to squeeze?
+ m_data.squeeze();
+ typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), rows() + 1);
+ typename IndexVector::AlignedMapType innerIndexMap(innerIndexPtr(), rows());
+ typename ScalarVector::AlignedMapType valueMap(valuePtr(), rows());
+ outerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+ innerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+ valueMap.setOnes();
}
inline SparseMatrix& operator=(const SparseMatrix& other)
@@ -865,7 +889,7 @@
/** \internal
* \sa insert(Index,Index) */
- EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col);
+ EIGEN_DEPRECATED EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col);
/** \internal
* A vector object that is equal to 0 everywhere but v at the position i */
@@ -884,7 +908,7 @@
/** \internal
* \sa insert(Index,Index) */
- EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col);
+ EIGEN_DEPRECATED EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col);
public:
/** \internal
@@ -913,17 +937,18 @@
* 1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression.
* 2 - otherwise, for each diagonal coeff,
* 2.a - if it already exists, then we update it,
- * 2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion.
- * 2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector.
- * 3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements.
- *
- * TODO: some piece of code could be isolated and reused for a general in-place update strategy.
- * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once),
- * then it *might* be better to disable case 2.b since they will have to be copied anyway.
+ * 2.b - if the correct position is at the end of the vector, and there is capacity, push to back
+ * 2.b - otherwise, the insertion requires a data move, record insertion locations and handle in a second pass
+ * 3 - at the end, if some entries failed to be updated in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements.
*/
template<typename DiagXpr, typename Func>
void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc)
{
+
+ constexpr StorageIndex kEmptyIndexVal(-1);
+ typedef typename IndexVector::AlignedMapType IndexMap;
+ typedef typename ScalarVector::AlignedMapType ValueMap;
+
Index n = diagXpr.size();
const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
@@ -933,82 +958,106 @@
this->resize(n, n);
}
- if(m_data.size()==0 || overwrite)
+ if(data().size()==0 || overwrite)
{
- typedef Array<StorageIndex,Dynamic,1> ArrayXI;
- this->makeCompressed();
- this->resizeNonZeros(n);
- Eigen::Map<ArrayXI>(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1);
- Eigen::Map<ArrayXI>(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n));
- Eigen::Map<Array<Scalar,Dynamic,1> > values = this->coeffs();
- values.setZero();
- internal::call_assignment_no_alias(values, diagXpr, assignFunc);
+ if (!isCompressed()) {
+ internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+ m_innerNonZeros = 0;
+ }
+ resizeNonZeros(n);
+ IndexMap outerIndexMap(outerIndexPtr(), n + 1);
+ IndexMap innerIndexMap(innerIndexPtr(), n);
+ ValueMap valueMap(valuePtr(), n);
+ outerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+ innerIndexMap.setEqualSpaced(StorageIndex(0), StorageIndex(1));
+ valueMap.setZero();
+ internal::call_assignment_no_alias(valueMap, diagXpr, assignFunc);
}
else
{
- bool isComp = isCompressed();
- internal::evaluator<DiagXpr> diaEval(diagXpr);
- std::vector<IndexPosPair> newEntries;
+ internal::evaluator<DiagXpr> diaEval(diagXpr);
- // 1 - try in-place update and record insertion failures
- for(Index i = 0; i<n; ++i)
- {
- internal::LowerBoundIndex lb = this->lower_bound(i,i);
- Index p = lb.value;
- if(lb.found)
- {
- // the coeff already exists
- assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+ ei_declare_aligned_stack_constructed_variable(StorageIndex, tmp, n, 0);
+ typename IndexVector::AlignedMapType insertionLocations(tmp, n);
+ insertionLocations.setConstant(kEmptyIndexVal);
+
+ Index deferredInsertions = 0;
+ Index shift = 0;
+
+ for (Index j = 0; j < n; j++) {
+ Index begin = outerIndexPtr()[j];
+ Index end = isCompressed() ? outerIndexPtr()[j + 1] : begin + innerNonZeroPtr()[j];
+ Index capacity = outerIndexPtr()[j + 1] - end;
+ Index dst = data().searchLowerIndex(begin, end, j);
+ // the entry exists: update it now
+ if (dst != end && data().index(dst) == j) assignFunc.assignCoeff(data().value(dst), diaEval.coeff(j));
+ // the entry belongs at the back of the vector: push to back
+ else if (dst == end && capacity > 0)
+ assignFunc.assignCoeff(insertBackUncompressed(j, j), diaEval.coeff(j));
+ // the insertion requires a data move, record insertion location and handle in second pass
+ else {
+ insertionLocations.coeffRef(j) = StorageIndex(dst);
+ deferredInsertions++;
+ // if there is no capacity, all vectors to the right of this are shifted
+ if (capacity == 0) shift++;
+ }
}
- else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i]))
- {
- // non compressed mode with local room for inserting one element
- m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p);
- m_innerNonZeros[i]++;
- m_data.value(p) = Scalar(0);
- m_data.index(p) = StorageIndex(i);
- assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
- }
- else
- {
- // defer insertion
- newEntries.push_back(IndexPosPair(i,p));
- }
- }
- // 2 - insert deferred entries
- Index n_entries = Index(newEntries.size());
- if(n_entries>0)
- {
- Storage newData(m_data.size()+n_entries);
- Index prev_p = 0;
- Index prev_i = 0;
- for(Index k=0; k<n_entries;++k)
- {
- Index i = newEntries[k].i;
- Index p = newEntries[k].p;
- internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+p, newData.valuePtr()+prev_p+k);
- internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+p, newData.indexPtr()+prev_p+k);
- for(Index j=prev_i;j<i;++j)
- m_outerIndex[j+1] += k;
- if(!isComp)
- m_innerNonZeros[i]++;
- prev_p = p;
- prev_i = i;
- newData.value(p+k) = Scalar(0);
- newData.index(p+k) = StorageIndex(i);
- assignFunc.assignCoeff(newData.value(p+k), diaEval.coeff(i));
- }
- {
- internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+m_data.size(), newData.valuePtr()+prev_p+n_entries);
- internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+m_data.size(), newData.indexPtr()+prev_p+n_entries);
- for(Index j=prev_i+1;j<=m_outerSize;++j)
- m_outerIndex[j] += n_entries;
- }
- m_data.swap(newData);
- }
+
+ if (deferredInsertions > 0) {
+
+ data().resize(data().size() + shift);
+ Index copyEnd = isCompressed() ? outerIndexPtr()[outerSize()]
+ : outerIndexPtr()[outerSize() - 1] + innerNonZeroPtr()[outerSize() - 1];
+ for (Index j = outerSize() - 1; deferredInsertions > 0; j--) {
+ Index begin = outerIndexPtr()[j];
+ Index end = isCompressed() ? outerIndexPtr()[j + 1] : begin + innerNonZeroPtr()[j];
+ Index capacity = outerIndexPtr()[j + 1] - end;
+
+ bool doInsertion = insertionLocations(j) >= 0;
+ bool breakUpCopy = doInsertion && (capacity > 0);
+ // break up copy for sorted insertion into inactive nonzeros
+ // optionally, add another criterion, i.e. 'breakUpCopy || (capacity > threshold)'
+ // where `threshold >= 0` to skip inactive nonzeros in each vector
+ // this reduces the total number of copied elements, but requires more moveChunk calls
+ if (breakUpCopy) {
+ Index copyBegin = outerIndexPtr()[j + 1];
+ Index to = copyBegin + shift;
+ Index chunkSize = copyEnd - copyBegin;
+ if (chunkSize > 0) data().moveChunk(copyBegin, to, chunkSize);
+ copyEnd = end;
+ }
+
+ outerIndexPtr()[j + 1] += shift;
+
+ if (doInsertion) {
+ // if there is capacity, shift into the inactive nonzeros
+ if (capacity > 0) shift++;
+ Index copyBegin = insertionLocations(j);
+ Index to = copyBegin + shift;
+ Index chunkSize = copyEnd - copyBegin;
+ if (chunkSize > 0) data().moveChunk(copyBegin, to, chunkSize);
+ Index dst = to - 1;
+ data().index(dst) = StorageIndex(j);
+ assignFunc.assignCoeff(data().value(dst) = Scalar(0), diaEval.coeff(j));
+ if (!isCompressed()) innerNonZeroPtr()[j]++;
+ shift--;
+ deferredInsertions--;
+ copyEnd = copyBegin;
+ }
+ }
+ }
+ eigen_assert((shift == 0) && (deferredInsertions == 0));
}
}
+ /* Provides a consistent reserve and reallocation strategy for insertCompressed and insertUncompressed
+ If there is insufficient space for one insertion, the size of the memory buffer is doubled */
+ inline void checkAllocatedSpaceAndMaybeExpand();
+ /* These functions are used to avoid a redundant binary search operation in functions such as coeffRef() and assume `dst` is the appropriate sorted insertion point */
+ EIGEN_STRONG_INLINE Scalar& insertAtByOuterInner(Index outer, Index inner, Index dst);
+ Scalar& insertCompressedAtByOuterInner(Index outer, Index inner, Index dst);
+ Scalar& insertUncompressedAtByOuterInner(Index outer, Index inner, Index dst);
+
private:
EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS)
@@ -1026,36 +1075,106 @@
namespace internal {
-template<typename InputIterator, typename SparseMatrixType, typename DupFunctor>
-void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, DupFunctor dup_func)
-{
- enum { IsRowMajor = SparseMatrixType::IsRowMajor };
- typedef typename SparseMatrixType::Scalar Scalar;
+// Creates a compressed sparse matrix from a range of unsorted triplets
+// Requires temporary storage to handle duplicate entries
+template <typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat,
+ DupFunctor dup_func) {
+
+ constexpr bool IsRowMajor = SparseMatrixType::IsRowMajor;
typedef typename SparseMatrixType::StorageIndex StorageIndex;
- SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor,StorageIndex> trMat(mat.rows(),mat.cols());
+ typedef typename VectorX<StorageIndex>::AlignedMapType IndexMap;
+ if (begin == end) return;
- if(begin!=end)
- {
- // pass 1: count the nnz per inner-vector
- typename SparseMatrixType::IndexVector wi(trMat.outerSize());
- wi.setZero();
- for(InputIterator it(begin); it!=end; ++it)
- {
- eigen_assert(it->row()>=0 && it->row()<mat.rows() && it->col()>=0 && it->col()<mat.cols());
- wi(IsRowMajor ? it->col() : it->row())++;
- }
-
- // pass 2: insert all the elements into trMat
- trMat.reserve(wi);
- for(InputIterator it(begin); it!=end; ++it)
- trMat.insertBackUncompressed(it->row(),it->col()) = it->value();
-
- // pass 3:
- trMat.collapseDuplicates(dup_func);
+ // free innerNonZeroPtr (if present) and zero outerIndexPtr
+ mat.resize(mat.rows(), mat.cols());
+ // allocate temporary storage for nonzero insertion (outer size) and duplicate removal (inner size)
+ ei_declare_aligned_stack_constructed_variable(StorageIndex, tmp, numext::maxi(mat.innerSize(), mat.outerSize()), 0);
+ // scan triplets to determine allocation size before constructing matrix
+ IndexMap outerIndexMap(mat.outerIndexPtr(), mat.outerSize() + 1);
+ for (InputIterator it(begin); it != end; ++it) {
+ eigen_assert(it->row() >= 0 && it->row() < mat.rows() && it->col() >= 0 && it->col() < mat.cols());
+ StorageIndex j = IsRowMajor ? it->row() : it->col();
+ outerIndexMap.coeffRef(j + 1)++;
}
- // pass 4: transposed copy -> implicit sorting
- mat = trMat;
+ // finalize outer indices and allocate memory
+ std::partial_sum(outerIndexMap.begin(), outerIndexMap.end(), outerIndexMap.begin());
+ Index nonZeros = mat.outerIndexPtr()[mat.outerSize()];
+ mat.resizeNonZeros(nonZeros);
+
+ // use tmp to track nonzero insertions
+ IndexMap back(tmp, mat.outerSize());
+ back = outerIndexMap.head(mat.outerSize());
+
+ // push triplets to back of each inner vector
+ for (InputIterator it(begin); it != end; ++it) {
+ StorageIndex j = IsRowMajor ? it->row() : it->col();
+ StorageIndex i = IsRowMajor ? it->col() : it->row();
+ mat.data().index(back.coeff(j)) = i;
+ mat.data().value(back.coeff(j)) = it->value();
+ back.coeffRef(j)++;
+ }
+
+ // use tmp to collapse duplicates
+ IndexMap wi(tmp, mat.innerSize());
+ mat.collapseDuplicates(wi, dup_func);
+ mat.sortInnerIndices();
+}
+
+// Creates a compressed sparse matrix from a sorted range of triplets
+template <typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets_sorted(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat,
+ DupFunctor dup_func) {
+ constexpr bool IsRowMajor = SparseMatrixType::IsRowMajor;
+ typedef typename SparseMatrixType::StorageIndex StorageIndex;
+ typedef typename VectorX<StorageIndex>::AlignedMapType IndexMap;
+ if (begin == end) return;
+
+ constexpr StorageIndex kEmptyIndexValue(-1);
+ // deallocate inner nonzeros if present and zero outerIndexPtr
+ mat.resize(mat.rows(), mat.cols());
+ // use outer indices to count non zero entries (excluding duplicate entries)
+ StorageIndex previous_j = kEmptyIndexValue;
+ StorageIndex previous_i = kEmptyIndexValue;
+ // scan triplets to determine allocation size before constructing matrix
+ IndexMap outerIndexMap(mat.outerIndexPtr(), mat.outerSize() + 1);
+ for (InputIterator it(begin); it != end; ++it) {
+ eigen_assert(it->row() >= 0 && it->row() < mat.rows() && it->col() >= 0 && it->col() < mat.cols());
+ StorageIndex j = IsRowMajor ? it->row() : it->col();
+ StorageIndex i = IsRowMajor ? it->col() : it->row();
+ eigen_assert(j > previous_j || (j == previous_j && i >= previous_i));
+ // identify duplicates by examining previous location
+ bool duplicate = (previous_j == j) && (previous_i == i);
+ if (!duplicate) outerIndexMap.coeffRef(j + 1)++;
+ previous_j = j;
+ previous_i = i;
+ }
+
+ // finalize outer indices and allocate memory
+ std::partial_sum(outerIndexMap.begin(), outerIndexMap.end(), outerIndexMap.begin());
+ Index nonZeros = mat.outerIndexPtr()[mat.outerSize()];
+ mat.resizeNonZeros(nonZeros);
+
+ previous_i = kEmptyIndexValue;
+ previous_j = kEmptyIndexValue;
+ Index back = 0;
+ for (InputIterator it(begin); it != end; ++it) {
+ StorageIndex j = IsRowMajor ? it->row() : it->col();
+ StorageIndex i = IsRowMajor ? it->col() : it->row();
+ bool duplicate = (previous_j == j) && (previous_i == i);
+ if (duplicate) {
+ mat.data().value(back - 1) = dup_func(mat.data().value(back - 1), it->value());
+ } else {
+ // push triplets to back
+ mat.data().index(back) = i;
+ mat.data().value(back) = it->value();
+ previous_j = j;
+ previous_i = i;
+ back++;
+ }
+ }
+ // matrix is finalized
}
}
@@ -1105,47 +1224,71 @@
internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,Options_,StorageIndex_> >(begin, end, *this, internal::scalar_sum_op<Scalar,Scalar>());
}
+/** The same as setFromTriplets but triplets are assumed to be pre-sorted. This is faster and requires less temporary storage.
+ * Two triplets `a` and `b` are appropriately ordered if:
+ * \code
+ * ColMajor: (a.col() != b.col()) ? (a.col() < b.col()) : (a.row() < b.row())
+ * RowMajor: (a.row() != b.row()) ? (a.row() < b.row()) : (a.col() < b.col())
+ * \endcode
+ */
+template<typename Scalar, int Options_, typename StorageIndex_>
+template<typename InputIterators>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromSortedTriplets(const InputIterators& begin, const InputIterators& end)
+{
+ internal::set_from_triplets_sorted<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_> >(begin, end, *this, internal::scalar_sum_op<Scalar, Scalar>());
+}
+
/** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied:
* \code
* value = dup_func(OldValue, NewValue)
- * \endcode
+ * \endcode
* Here is a C++11 example keeping the latest entry only:
* \code
* mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
* \endcode
*/
template<typename Scalar, int Options_, typename StorageIndex_>
-template<typename InputIterators,typename DupFunctor>
-void SparseMatrix<Scalar,Options_,StorageIndex_>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
+template<typename InputIterators, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
{
- internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,Options_,StorageIndex_>, DupFunctor>(begin, end, *this, dup_func);
+ internal::set_from_triplets<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>, DupFunctor>(begin, end, *this, dup_func);
+}
+
+/** The same as setFromSortedTriplets but when duplicates are met the functor \a dup_func is applied:
+ * \code
+ * value = dup_func(OldValue, NewValue)
+ * \endcode
+ * Here is a C++11 example keeping the latest entry only:
+ * \code
+ * mat.setFromSortedTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+ * \endcode
+ */
+template<typename Scalar, int Options_, typename StorageIndex_>
+template<typename InputIterators, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromSortedTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
+{
+ internal::set_from_triplets_sorted<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>, DupFunctor>(begin, end, *this, dup_func);
}
/** \internal */
template<typename Scalar, int Options_, typename StorageIndex_>
-template<typename DupFunctor>
-void SparseMatrix<Scalar,Options_,StorageIndex_>::collapseDuplicates(DupFunctor dup_func)
+template<typename Derived, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::collapseDuplicates(DenseBase<Derived>& wi, DupFunctor dup_func)
{
- eigen_assert(!isCompressed());
- // TODO, in practice we should be able to use m_innerNonZeros for that task
- IndexVector wi(innerSize());
- wi.fill(-1);
+ eigen_assert(wi.size() >= m_innerSize);
+ constexpr StorageIndex kEmptyIndexValue(-1);
+ wi.setConstant(kEmptyIndexValue);
StorageIndex count = 0;
// for each inner-vector, wi[inner_index] will hold the position of first element into the index/value buffers
- for(Index j=0; j<outerSize(); ++j)
- {
- StorageIndex start = count;
- Index oldEnd = m_outerIndex[j]+m_innerNonZeros[j];
- for(Index k=m_outerIndex[j]; k<oldEnd; ++k)
- {
- Index i = m_data.index(k);
- if(wi(i)>=start)
- {
+ for (Index j = 0; j < m_outerSize; ++j) {
+ StorageIndex start = count;
+ StorageIndex oldEnd = isCompressed() ? m_outerIndex[j + 1] : m_outerIndex[j] + m_innerNonZeros[j];
+ for (StorageIndex k = m_outerIndex[j]; k < oldEnd; ++k) {
+ StorageIndex i = m_data.index(k);
+ if (wi(i) >= start) {
// we already meet this entry => accumulate it
m_data.value(wi(i)) = dup_func(m_data.value(wi(i)), m_data.value(k));
- }
- else
- {
+ } else {
m_data.value(count) = m_data.value(k);
m_data.index(count) = m_data.index(k);
wi(i) = count;
@@ -1155,13 +1298,17 @@
m_outerIndex[j] = start;
}
m_outerIndex[m_outerSize] = count;
-
// turn the matrix into compressed form
- internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
- m_innerNonZeros = 0;
+ if (m_innerNonZeros) {
+ internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+ m_innerNonZeros = 0;
+ }
m_data.resize(m_outerIndex[m_outerSize]);
}
+
+
+/** \internal */
template<typename Scalar, int Options_, typename StorageIndex_>
template<typename OtherDerived>
EIGEN_DONT_INLINE SparseMatrix<Scalar,Options_,StorageIndex_>& SparseMatrix<Scalar,Options_,StorageIndex_>::operator=(const SparseMatrixBase<OtherDerived>& other)
@@ -1235,274 +1382,149 @@
}
}
-template<typename Scalar_, int Options_, typename StorageIndex_>
-typename SparseMatrix<Scalar_,Options_,StorageIndex_>::Scalar& SparseMatrix<Scalar_,Options_,StorageIndex_>::insert(Index row, Index col)
-{
- eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-
- const Index outer = IsRowMajor ? row : col;
- const Index inner = IsRowMajor ? col : row;
-
- if(isCompressed())
- {
- if(nonZeros()==0)
- {
- // reserve space if not already done
- if(m_data.allocatedSize()==0)
- m_data.reserve(2*m_innerSize);
-
- // turn the matrix into non-compressed mode
- m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
-
- std::fill(m_innerNonZeros, m_innerNonZeros + m_outerSize, StorageIndex(0));
-
- // pack all inner-vectors to the end of the pre-allocated space
- // and allocate the entire free-space to the first inner-vector
- StorageIndex end = convert_index(m_data.allocatedSize());
- for(Index j=1; j<=m_outerSize; ++j)
- m_outerIndex[j] = end;
- }
- else
- {
- // turn the matrix into non-compressed mode
- m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
- for(Index j=0; j<m_outerSize; ++j)
- m_innerNonZeros[j] = m_outerIndex[j+1]-m_outerIndex[j];
- }
- }
-
- // check whether we can do a fast "push back" insertion
- Index data_end = m_data.allocatedSize();
-
- // First case: we are filling a new inner vector which is packed at the end.
- // We assume that all remaining inner-vectors are also empty and packed to the end.
- if(m_outerIndex[outer]==data_end)
- {
- eigen_internal_assert(m_innerNonZeros[outer]==0);
-
- // pack previous empty inner-vectors to end of the used-space
- // and allocate the entire free-space to the current inner-vector.
- StorageIndex p = convert_index(m_data.size());
- Index j = outer;
- while(j>=0 && m_innerNonZeros[j]==0)
- m_outerIndex[j--] = p;
-
- // push back the new element
- ++m_innerNonZeros[outer];
- m_data.append(Scalar(0), inner);
-
- // check for reallocation
- if(data_end != m_data.allocatedSize())
- {
- // m_data has been reallocated
- // -> move remaining inner-vectors back to the end of the free-space
- // so that the entire free-space is allocated to the current inner-vector.
- eigen_internal_assert(data_end < m_data.allocatedSize());
- StorageIndex new_end = convert_index(m_data.allocatedSize());
- for(Index k=outer+1; k<=m_outerSize; ++k)
- if(m_outerIndex[k]==data_end)
- m_outerIndex[k] = new_end;
- }
- return m_data.value(p);
- }
-
- // Second case: the next inner-vector is packed to the end
- // and the current inner-vector end match the used-space.
- if(m_outerIndex[outer+1]==data_end && m_outerIndex[outer]+m_innerNonZeros[outer]==m_data.size())
- {
- eigen_internal_assert(outer+1==m_outerSize || m_innerNonZeros[outer+1]==0);
-
- // add space for the new element
- ++m_innerNonZeros[outer];
- m_data.resize(m_data.size()+1);
-
- // check for reallocation
- if(data_end != m_data.allocatedSize())
- {
- // m_data has been reallocated
- // -> move remaining inner-vectors back to the end of the free-space
- // so that the entire free-space is allocated to the current inner-vector.
- eigen_internal_assert(data_end < m_data.allocatedSize());
- StorageIndex new_end = convert_index(m_data.allocatedSize());
- for(Index k=outer+1; k<=m_outerSize; ++k)
- if(m_outerIndex[k]==data_end)
- m_outerIndex[k] = new_end;
- }
-
- // and insert it at the right position (sorted insertion)
- Index startId = m_outerIndex[outer];
- Index p = m_outerIndex[outer]+m_innerNonZeros[outer]-1;
- while ( (p > startId) && (m_data.index(p-1) > inner) )
- {
- m_data.index(p) = m_data.index(p-1);
- m_data.value(p) = m_data.value(p-1);
- --p;
- }
-
- m_data.index(p) = convert_index(inner);
- return (m_data.value(p) = Scalar(0));
- }
-
- if(m_data.size() != m_data.allocatedSize())
- {
- // make sure the matrix is compatible to random un-compressed insertion:
- m_data.resize(m_data.allocatedSize());
- this->reserveInnerVectors(Array<StorageIndex,Dynamic,1>::Constant(m_outerSize, 2));
- }
-
- return insertUncompressed(row,col);
+template <typename Scalar_, int Options_, typename StorageIndex_>
+inline typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar& SparseMatrix<Scalar_, Options_, StorageIndex_>::insert(
+ Index row, Index col) {
+ Index outer = IsRowMajor ? row : col;
+ Index inner = IsRowMajor ? col : row;
+ Index start = outerIndexPtr()[outer];
+ Index end = isCompressed() ? outerIndexPtr()[outer + 1] : start + innerNonZeroPtr()[outer];
+ Index dst = data().searchLowerIndex(start, end, inner);
+ eigen_assert((dst == end || data().index(dst) != inner) &&
+ "you cannot insert an element that already exists, you must call coeffRef to this end");
+ return insertAtByOuterInner(outer, inner, dst);
}
-
-template<typename Scalar_, int Options_, typename StorageIndex_>
-EIGEN_DONT_INLINE typename SparseMatrix<Scalar_,Options_,StorageIndex_>::Scalar& SparseMatrix<Scalar_,Options_,StorageIndex_>::insertUncompressed(Index row, Index col)
-{
+
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_STRONG_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertAtByOuterInner(Index outer, Index inner, Index dst) {
+ uncompress();
+ return insertUncompressedAtByOuterInner(outer, inner, dst);
+}
+
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_DEPRECATED EIGEN_DONT_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertUncompressed(Index row, Index col) {
eigen_assert(!isCompressed());
-
- const Index outer = IsRowMajor ? row : col;
- const StorageIndex inner = convert_index(IsRowMajor ? col : row);
-
- Index room = m_outerIndex[outer+1] - m_outerIndex[outer];
- StorageIndex innerNNZ = m_innerNonZeros[outer];
- if(innerNNZ>=room)
- {
- // this inner vector is full, we need to reallocate the whole buffer :(
- reserve(SingletonVector(outer,std::max<StorageIndex>(2,innerNNZ)));
- }
-
- Index startId = m_outerIndex[outer];
- Index p = startId + m_innerNonZeros[outer];
- while ( (p > startId) && (m_data.index(p-1) > inner) )
- {
- m_data.index(p) = m_data.index(p-1);
- m_data.value(p) = m_data.value(p-1);
- --p;
- }
- eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end");
-
- m_innerNonZeros[outer]++;
-
- m_data.index(p) = inner;
- return (m_data.value(p) = Scalar(0));
+ Index outer = IsRowMajor ? row : col;
+ Index inner = IsRowMajor ? col : row;
+ Index start = outerIndexPtr()[outer];
+ Index end = start + innerNonZeroPtr()[outer];
+ Index dst = data().searchLowerIndex(start, end, inner);
+ eigen_assert((dst == end || data().index(dst) != inner) &&
+ "you cannot insert an element that already exists, you must call coeffRef to this end");
+ return insertUncompressedAtByOuterInner(outer, inner, dst);
}
-template<typename Scalar_, int Options_, typename StorageIndex_>
-EIGEN_DONT_INLINE typename SparseMatrix<Scalar_,Options_,StorageIndex_>::Scalar& SparseMatrix<Scalar_,Options_,StorageIndex_>::insertCompressed(Index row, Index col)
-{
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_DEPRECATED EIGEN_DONT_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressed(Index row, Index col) {
eigen_assert(isCompressed());
+ Index outer = IsRowMajor ? row : col;
+ Index inner = IsRowMajor ? col : row;
+ Index start = outerIndexPtr()[outer];
+ Index end = outerIndexPtr()[outer + 1];
+ Index dst = data().searchLowerIndex(start, end, inner);
+ eigen_assert((dst == end || data().index(dst) != inner) &&
+ "you cannot insert an element that already exists, you must call coeffRef to this end");
+ return insertCompressedAtByOuterInner(outer, inner, dst);
+}
- const Index outer = IsRowMajor ? row : col;
- const Index inner = IsRowMajor ? col : row;
-
- Index previousOuter = outer;
- if (m_outerIndex[outer+1]==0)
- {
- // we start a new inner vector
- while (previousOuter>=0 && m_outerIndex[previousOuter]==0)
- {
- m_outerIndex[previousOuter] = convert_index(m_data.size());
- --previousOuter;
- }
- m_outerIndex[outer+1] = m_outerIndex[outer];
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_STRONG_INLINE void SparseMatrix<Scalar_, Options_, StorageIndex_>::checkAllocatedSpaceAndMaybeExpand() {
+ // if there is no capacity for a single insertion, double the capacity
+ if (data().allocatedSize() <= data().size()) {
+ // increase capacity by a minimum of 32
+ Index minReserve = 32;
+ Index reserveSize = numext::maxi(minReserve, data().allocatedSize());
+ data().reserve(reserveSize);
}
+}
- // here we have to handle the tricky case where the outerIndex array
- // starts with: [ 0 0 0 0 0 1 ...] and we are inserted in, e.g.,
- // the 2nd inner vector...
- bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0))
- && (std::size_t(m_outerIndex[outer+1]) == m_data.size());
+template <typename Scalar_, int Options_, typename StorageIndex_>
+typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressedAtByOuterInner(Index outer, Index inner, Index dst) {
+ eigen_assert(isCompressed());
+ // compressed insertion always requires expanding the buffer
+ checkAllocatedSpaceAndMaybeExpand();
+ data().resize(data().size() + 1);
+ Index chunkSize = outerIndexPtr()[outerSize()] - dst;
+ // shift the existing data to the right if necessary
+ if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
+ // update nonzero counts
+ typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), outerSize() + 1);
+ outerIndexMap.segment(outer + 1, outerSize() - outer).array() += 1;
+ // initialize the coefficient
+ data().index(dst) = StorageIndex(inner);
+ // return a reference to the coefficient
+ return data().value(dst) = Scalar(0);
+}
- std::size_t startId = m_outerIndex[outer];
- // FIXME let's make sure sizeof(long int) == sizeof(std::size_t)
- std::size_t p = m_outerIndex[outer+1];
- ++m_outerIndex[outer+1];
-
- double reallocRatio = 1;
- if (m_data.allocatedSize()<=m_data.size())
- {
- // if there is no preallocated memory, let's reserve a minimum of 32 elements
- if (m_data.size()==0)
- {
- m_data.reserve(32);
- }
- else
- {
- // we need to reallocate the data, to reduce multiple reallocations
- // we use a smart resize algorithm based on the current filling ratio
- // in addition, we use double to avoid integers overflows
- double nnzEstimate = double(m_outerIndex[outer])*double(m_outerSize)/double(outer+1);
- reallocRatio = (nnzEstimate-double(m_data.size()))/double(m_data.size());
- // furthermore we bound the realloc ratio to:
- // 1) reduce multiple minor realloc when the matrix is almost filled
- // 2) avoid to allocate too much memory when the matrix is almost empty
- reallocRatio = (std::min)((std::max)(reallocRatio,1.5),8.);
+template <typename Scalar_, int Options_, typename StorageIndex_>
+typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertUncompressedAtByOuterInner(Index outer, Index inner, Index dst) {
+ eigen_assert(!isCompressed());
+ // find nearest outer vector to the right with capacity (if any) to minimize copy size
+ Index target = outer;
+ for (; target < outerSize(); target++) {
+ Index start = outerIndexPtr()[target];
+ Index end = start + innerNonZeroPtr()[target];
+ Index capacity = outerIndexPtr()[target + 1] - end;
+ if (capacity > 0) {
+ // `target` has room for interior insertion
+ Index chunkSize = end - dst;
+ // shift the existing data to the right if necessary
+ if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
+ break;
}
}
- m_data.resize(m_data.size()+1,reallocRatio);
-
- if (!isLastVec)
- {
- if (previousOuter==-1)
- {
- // oops wrong guess.
- // let's correct the outer offsets
- for (Index k=0; k<=(outer+1); ++k)
- m_outerIndex[k] = 0;
- Index k=outer+1;
- while(m_outerIndex[k]==0)
- m_outerIndex[k++] = 1;
- while (k<=m_outerSize && m_outerIndex[k]!=0)
- m_outerIndex[k++]++;
- p = 0;
- --k;
- k = m_outerIndex[k]-1;
- while (k>0)
- {
- m_data.index(k) = m_data.index(k-1);
- m_data.value(k) = m_data.value(k-1);
- k--;
- }
+ if (target == outerSize()) {
+ // no room for interior insertion (to the right of `outer`)
+ target = outer;
+ Index dst_offset = dst - outerIndexPtr()[target];
+ Index totalCapacity = data().allocatedSize() - data().size();
+ eigen_assert(totalCapacity >= 0);
+ if (totalCapacity == 0) {
+ // there is no room left. we must reallocate. reserve space in each vector
+ constexpr StorageIndex kReserveSizePerVector(2);
+ reserveInnerVectors(IndexVector::Constant(outerSize(), kReserveSizePerVector));
+ } else {
+ // dont reallocate, but re-distribute the remaining capacity to the right of `outer`
+ // each vector in the range [outer,outerSize) will receive totalCapacity / (outerSize - outer) nonzero
+ // reservations each vector in the range [outer,remainder) will receive an additional nonzero reservation where
+ // remainder = totalCapacity % (outerSize - outer)
+ typedef internal::sparse_reserve_op<StorageIndex> ReserveSizesOp;
+ typedef CwiseNullaryOp<ReserveSizesOp, IndexVector> ReserveSizesXpr;
+ ReserveSizesXpr reserveSizesXpr(outerSize(), 1, ReserveSizesOp(target, outerSize(), totalCapacity));
+ eigen_assert(reserveSizesXpr.sum() == totalCapacity);
+ reserveInnerVectors(reserveSizesXpr);
}
- else
- {
- // we are not inserting into the last inner vec
- // update outer indices:
- Index j = outer+2;
- while (j<=m_outerSize && m_outerIndex[j]!=0)
- m_outerIndex[j++]++;
- --j;
- // shift data of last vecs:
- Index k = m_outerIndex[j]-1;
- while (k>=Index(p))
- {
- m_data.index(k) = m_data.index(k-1);
- m_data.value(k) = m_data.value(k-1);
- k--;
- }
- }
+ Index start = outerIndexPtr()[target];
+ Index end = start + innerNonZeroPtr()[target];
+ dst = start + dst_offset;
+ Index chunkSize = end - dst;
+ if (chunkSize > 0) data().moveChunk(dst, dst + 1, chunkSize);
}
-
- while ( (p > startId) && (m_data.index(p-1) > inner) )
- {
- m_data.index(p) = m_data.index(p-1);
- m_data.value(p) = m_data.value(p-1);
- --p;
- }
-
- m_data.index(p) = inner;
- return (m_data.value(p) = Scalar(0));
+ // update nonzero counts
+ innerNonZeroPtr()[outer]++;
+ typename IndexVector::AlignedMapType outerIndexMap(outerIndexPtr(), outerSize() + 1);
+ outerIndexMap.segment(outer + 1, target - outer).array() += 1;
+ // initialize the coefficient
+ data().index(dst) = StorageIndex(inner);
+ // return a reference to the coefficient
+ return data().value(dst) = Scalar(0);
}
namespace internal {
-template<typename Scalar_, int Options_, typename StorageIndex_>
-struct evaluator<SparseMatrix<Scalar_,Options_,StorageIndex_> >
- : evaluator<SparseCompressedBase<SparseMatrix<Scalar_,Options_,StorageIndex_> > >
-{
- typedef evaluator<SparseCompressedBase<SparseMatrix<Scalar_,Options_,StorageIndex_> > > Base;
- typedef SparseMatrix<Scalar_,Options_,StorageIndex_> SparseMatrixType;
- evaluator() : Base() {}
- explicit evaluator(const SparseMatrixType &mat) : Base(mat) {}
-};
+ template <typename Scalar_, int Options_, typename StorageIndex_>
+ struct evaluator<SparseMatrix<Scalar_, Options_, StorageIndex_>>
+ : evaluator<SparseCompressedBase<SparseMatrix<Scalar_, Options_, StorageIndex_>>> {
+ typedef evaluator<SparseCompressedBase<SparseMatrix<Scalar_, Options_, StorageIndex_>>> Base;
+ typedef SparseMatrix<Scalar_, Options_, StorageIndex_> SparseMatrixType;
+ evaluator() : Base() {}
+ explicit evaluator(const SparseMatrixType& mat) : Base(mat) {}
+ };
}
diff --git a/bench/spbench/spbenchsolver.cpp b/bench/spbench/spbenchsolver.cpp
index 2a73511..9d2e381 100644
--- a/bench/spbench/spbenchsolver.cpp
+++ b/bench/spbench/spbenchsolver.cpp
@@ -6,19 +6,20 @@
cout<< " MATRIX FOLDER : \n";
cout<< " The matrices for the benchmark should be collected in a folder specified with an environment variable EIGEN_MATRIXDIR \n";
cout<< " The matrices are stored using the matrix market coordinate format \n";
- cout<< " The matrix and associated right-hand side (rhs) files are named respectively \n";
- cout<< " as MatrixName.mtx and MatrixName_b.mtx. If the rhs does not exist, a random one is generated. \n";
- cout<< " If a matrix is SPD, the matrix should be named as MatrixName_SPD.mtx \n";
- cout<< " If a true solution exists, it should be named as MatrixName_x.mtx; \n" ;
- cout<< " it will be used to compute the norm of the error relative to the computed solutions\n\n";
+ cout<< " The matrix and associated right-hand side (rhs) files are named respectively as MatrixName.mtx and MatrixName_b.mtx.\n";
+ cout<< " If the rhs does not exist, a random one is generated. \n";
+ cout<< " If a true solution exists, it should be named as MatrixName_x.mtx ; it will be used to compute the norm of the error \n";
+ cout<< " relative to the computed solutions. \n";
+ cout<< " If a matrix is SPD, the matrix should be named as MatrixName_SPD.mtx (then use MatrixName_SPD_b.mtx \n";
+ cout<< " and MatrixName_SPD_x.mtx for the rhs and the true solution).\n\n";
cout<< " OPTIONS : \n";
cout<< " -h or --help \n print this help and return\n\n";
cout<< " -d matrixdir \n Use matrixdir as the matrix folder instead of the one specified in the environment variable EIGEN_MATRIXDIR\n\n";
cout<< " -o outputfile.xml \n Output the statistics to a xml file \n\n";
cout<< " --eps <RelErr> Sets the relative tolerance for iterative solvers (default 1e-08) \n\n";
cout<< " --maxits <MaxIts> Sets the maximum number of iterations (default 1000) \n\n";
-
}
+
int main(int argc, char ** args)
{
diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp
index b2a22ce..5cc44ac 100644
--- a/test/bfloat16_float.cpp
+++ b/test/bfloat16_float.cpp
@@ -11,8 +11,6 @@
#include "main.h"
-#include <Eigen/src/Core/arch/Default/BFloat16.h>
-
#define VERIFY_BFLOAT16_BITS_EQUAL(h, bits) \
VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index e424a93..82f9d61 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -16,6 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
#include "main.h"
#include "gpu_common.h"
diff --git a/test/gpu_common.h b/test/gpu_common.h
index c37eaa1..05080ad 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -12,9 +12,6 @@
#include <iostream>
-#define EIGEN_USE_GPU
-#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
-
#if !defined(__CUDACC__) && !defined(__HIPCC__)
dim3 threadIdx, blockDim, blockIdx;
#endif
diff --git a/test/gpu_example.cu b/test/gpu_example.cu
index a69f5ea..d4715cf 100644
--- a/test/gpu_example.cu
+++ b/test/gpu_example.cu
@@ -9,6 +9,7 @@
// The following is an example GPU test.
+#define EIGEN_USE_GPU
#include "main.h" // Include the main test utilities.
// Define a kernel functor.
diff --git a/test/gpu_test_helper.h b/test/gpu_test_helper.h
index 0942466..008e95c 100644
--- a/test/gpu_test_helper.h
+++ b/test/gpu_test_helper.h
@@ -3,10 +3,8 @@
#include <Eigen/Core>
-#ifdef EIGEN_GPUCC
-#define EIGEN_USE_GPU
-#include "../unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h"
-#endif // EIGEN_GPUCC
+// Allow gpu** macros for generic tests.
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
// doesn't allow std::tuple to compile for host code either. In these cases,
diff --git a/test/half_float.cpp b/test/half_float.cpp
index 00a8b48..83b057e 100644
--- a/test/half_float.cpp
+++ b/test/half_float.cpp
@@ -9,8 +9,6 @@
#include "main.h"
-#include <Eigen/src/Core/arch/Default/Half.h>
-
#define VERIFY_HALF_BITS_EQUAL(h, bits) \
VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp
index ecc17f5..72316a2 100644
--- a/test/incomplete_cholesky.cpp
+++ b/test/incomplete_cholesky.cpp
@@ -10,7 +10,6 @@
// #define EIGEN_MAX_ALIGN_BYTES 0
#include "sparse_solver.h"
#include <Eigen/IterativeLinearSolvers>
-#include <unsupported/Eigen/IterativeSolvers>
template<typename T, typename I_> void test_incomplete_cholesky_T()
{
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index daf24a7..401adf0 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -60,7 +60,9 @@
}
template <typename MatrixType>
-void jacobisvd_verify_assert(const MatrixType& m = MatrixType()) {
+void jacobisvd_verify_assert(const MatrixType& input = MatrixType()) {
+ MatrixType m(input.rows(), input.cols());
+ svd_fill_random(m);
svd_verify_assert<MatrixType, 0>(m);
svd_verify_assert<MatrixType, ColPivHouseholderQRPreconditioner>(m);
svd_verify_assert<MatrixType, HouseholderQRPreconditioner>(m);
diff --git a/test/sparse.h b/test/sparse.h
index f3e697d..78e6619 100644
--- a/test/sparse.h
+++ b/test/sparse.h
@@ -152,6 +152,4 @@
}
}
-
-#include <unsupported/Eigen/SparseExtra>
#endif // EIGEN_TESTSPARSE_H
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 52ff7b7..6e2661e 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -477,14 +477,45 @@
refMat_prod(r,c) *= v;
refMat_last(r,c) = v;
}
+
SparseMatrixType m(rows,cols);
m.setFromTriplets(triplets.begin(), triplets.end());
VERIFY_IS_APPROX(m, refMat_sum);
+ VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
m.setFromTriplets(triplets.begin(), triplets.end(), std::multiplies<Scalar>());
VERIFY_IS_APPROX(m, refMat_prod);
+ VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
m.setFromTriplets(triplets.begin(), triplets.end(), [] (Scalar,Scalar b) { return b; });
VERIFY_IS_APPROX(m, refMat_last);
+ VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
+
+ // test setFromSortedTriplets
+
+ struct triplet_comp {
+ inline bool operator()(const TripletType& a, const TripletType& b) {
+ return SparseMatrixType::IsRowMajor ? ((a.row() != b.row()) ? (a.row() < b.row()) : (a.col() < b.col()))
+ : ((a.col() != b.col()) ? (a.col() < b.col()) : (a.row() < b.row()));
+ }
+ };
+
+ // stable_sort is only necessary when the reduction functor is dependent on the order of the triplets
+ // this is the case with refMat_last
+ // for most cases, std::sort is sufficient and preferred
+ std::stable_sort(triplets.begin(), triplets.end(), triplet_comp());
+
+ m.setZero();
+ m.setFromSortedTriplets(triplets.begin(), triplets.end());
+ VERIFY_IS_APPROX(m, refMat_sum);
+ VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
+
+ m.setFromSortedTriplets(triplets.begin(), triplets.end(), std::multiplies<Scalar>());
+ VERIFY_IS_APPROX(m, refMat_prod);
+ VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
+
+ m.setFromSortedTriplets(triplets.begin(), triplets.end(), [](Scalar, Scalar b) { return b; });
+ VERIFY_IS_APPROX(m, refMat_last);
+ VERIFY_IS_EQUAL(m.innerIndicesAreSorted(), m.outerSize());
}
// test Map
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index dbf549a..8b46fc8 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -477,7 +477,7 @@
initSparse<Real>(0.1, refA, A);
initSparse<Real>(0.1, refB, B);
- SparseMatrix<Real, OrderC, std::int8_t> result;
+ SparseMatrix<Real, OrderC, std::int16_t> result;
SparseMatrix<Real, OrderC> result_large;
DenseMat refResult;
@@ -486,8 +486,8 @@
// Case: Small input but large result
{
- SparseMatrix<Real, OrderA, std::int8_t> A(127, 8);
- SparseMatrix<Real, OrderB, std::int8_t> B(8, 127);
+ SparseMatrix<Real, OrderA, std::int16_t> A(127, 8);
+ SparseMatrix<Real, OrderB, std::int16_t> B(8, 127);
DenseMat refA(127, 8);
DenseMat refB(8, 127);
diff --git a/test/sparselu.cpp b/test/sparselu.cpp
index 84cc6eb..0f9d25f 100644
--- a/test/sparselu.cpp
+++ b/test/sparselu.cpp
@@ -17,7 +17,6 @@
#include "sparse_solver.h"
#include <Eigen/SparseLU>
-#include <unsupported/Eigen/SparseExtra>
template<typename T> void test_sparselu_T()
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 60748cc..4b3d423 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -10,9 +10,6 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
-#if defined(EIGEN_USE_LIBXSMM)
-#include "third_party/libxsmm/include/libxsmm.h"
-#endif
#include "./InternalHeaderCheck.h"
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index 4a1f938..05a9066 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -7,7 +7,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#include "sparse_product.cpp"
+#include "sparse.h"
#ifdef min
#undef min