Update Eigen to commit:aa6964bf3a34fd607837dd8123bc42465185c4f8
CHANGELOG
=========
aa6964bf3 - Work around MSVC issue in Block XprType.
877c2d1e9 - fix typo in comment
0c9526912 - Pass div_ceil arguments by value.
d9839718a - [ROCm] Replace HIP_PATH with ROCM_PATH for rocm 6.0
5bdf58b8d - Eliminate use of _res.
a96545777 - Consolidate multiple implementations of divup/div_up/div_ceil.
PiperOrigin-RevId: 575157496
Change-Id: I3154dd9ac5d8713a1630caf4f502e3c082bbbbc0
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 248d297..31cd094 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -20,40 +20,41 @@
template<typename XprType_, int BlockRows, int BlockCols, bool InnerPanel_>
struct traits<Block<XprType_, BlockRows, BlockCols, InnerPanel_> > : traits<XprType_>
{
- typedef typename traits<XprType_>::Scalar Scalar;
- typedef typename traits<XprType_>::StorageKind StorageKind;
- typedef typename traits<XprType_>::XprKind XprKind;
- typedef typename ref_selector<XprType_>::type XprTypeNested;
+ typedef XprType_ XprType;
+ typedef typename traits<XprType>::Scalar Scalar;
+ typedef typename traits<XprType>::StorageKind StorageKind;
+ typedef typename traits<XprType>::XprKind XprKind;
+ typedef typename ref_selector<XprType>::type XprTypeNested;
typedef std::remove_reference_t<XprTypeNested> XprTypeNested_;
enum{
- MatrixRows = traits<XprType_>::RowsAtCompileTime,
- MatrixCols = traits<XprType_>::ColsAtCompileTime,
+ MatrixRows = traits<XprType>::RowsAtCompileTime,
+ MatrixCols = traits<XprType>::ColsAtCompileTime,
RowsAtCompileTime = MatrixRows == 0 ? 0 : BlockRows,
ColsAtCompileTime = MatrixCols == 0 ? 0 : BlockCols,
MaxRowsAtCompileTime = BlockRows==0 ? 0
: RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime)
- : int(traits<XprType_>::MaxRowsAtCompileTime),
+ : int(traits<XprType>::MaxRowsAtCompileTime),
MaxColsAtCompileTime = BlockCols==0 ? 0
: ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
- : int(traits<XprType_>::MaxColsAtCompileTime),
+ : int(traits<XprType>::MaxColsAtCompileTime),
- XprTypeIsRowMajor = (int(traits<XprType_>::Flags)&RowMajorBit) != 0,
+ XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
: XprTypeIsRowMajor,
HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
- ? int(inner_stride_at_compile_time<XprType_>::ret)
- : int(outer_stride_at_compile_time<XprType_>::ret),
+ ? int(inner_stride_at_compile_time<XprType>::ret)
+ : int(outer_stride_at_compile_time<XprType>::ret),
OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
- ? int(outer_stride_at_compile_time<XprType_>::ret)
- : int(inner_stride_at_compile_time<XprType_>::ret),
+ ? int(outer_stride_at_compile_time<XprType>::ret)
+ : int(inner_stride_at_compile_time<XprType>::ret),
// FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
- FlagsLvalueBit = is_lvalue<XprType_>::value ? LvalueBit : 0,
+ FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
- Flags = (traits<XprType_>::Flags & (DirectAccessBit | (InnerPanel_?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
+ Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel_?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
// FIXME DirectAccessBit should not be handled by expressions
//
// Alignment is needed by MapBase's assertions
@@ -86,7 +87,7 @@
* type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
* most of the time this is the only way it is used.
*
- * However, if you want to directly manipulate block expressions,
+ * However, if you want to directly maniputate block expressions,
* for instance if you want to write a function returning such an expression, you
* will need to use this class.
*
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 0f5a0fd..3d801e9 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1341,19 +1341,6 @@
#endif
-// Integer division with rounding up.
-// T is assumed to be an integer type with a>=0, and b>0
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR
-T div_ceil(T a, T b)
-{
- EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
- eigen_assert(a >= 0);
- eigen_assert(b > 0);
- // Note: This form is used because it cannot overflow.
- return a == 0 ? 0 : (a - 1) / b + 1;
-}
-
/** Log base 2 for 32 bits positive integers.
* Conveniently returns 0 for x==0. */
inline int log2(int x)
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
index 2df1704..7220bfa 100644
--- a/Eigen/src/Core/arch/AVX512/GemmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -90,6 +90,8 @@
const Index a_stride, b_stride;
const Index a_off, b_off;
+ static EIGEN_ALWAYS_INLINE constexpr int div_up(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; }
+
EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
_mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
}
@@ -477,7 +479,7 @@
*
* const Scalar *cox = (idx == 0) ? co1 : co2;
*
- * const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
+ * const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
* scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
* write_c<0, um_vecs, idx, a_unroll>(cox);
*
@@ -496,7 +498,7 @@
EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
if (pow >= 4) cox += ldc;
- const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
+ const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
auto &alpha_reg = zmm[alpha_load_reg];
scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
@@ -642,7 +644,7 @@
template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch, bool no_a_preload = false>
EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
- const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
+ const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
if (max_b_unroll >= 1)
innerkernel_1pow<uk, 1, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
@@ -727,7 +729,7 @@
template <int a_unroll, int b_unroll, int max_b_unroll>
EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
- const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
+ const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
if (!use_less_a_regs && k > 1)
a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
else
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index e8bb821..d7632e6 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -62,9 +62,9 @@
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static void run(Index rows, Index cols, Index depth,
- const LhsScalar* lhs_, Index lhsStride,
- const RhsScalar* rhs_, Index rhsStride,
- ResScalar* res_, Index resIncr, Index resStride,
+ const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride,
+ ResScalar* _res, Index resIncr, Index resStride,
ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
@@ -72,9 +72,9 @@
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
- LhsMapper lhs(lhs_, lhsStride);
- RhsMapper rhs(rhs_, rhsStride);
- ResMapper res(res_, resStride, resIncr);
+ LhsMapper lhs(_lhs, lhsStride);
+ RhsMapper rhs(_rhs, rhsStride);
+ ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 55b637b..03a4107 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -63,9 +63,9 @@
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
{
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
- static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs_, Index lhsStride,
- const RhsScalar* rhs_, Index rhsStride,
- ResScalar* res_, Index resIncr, Index resStride,
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride,
+ ResScalar* _res, Index resIncr, Index resStride,
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
{
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
@@ -73,9 +73,9 @@
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
- LhsMapper lhs(lhs_,lhsStride);
- RhsMapper rhs(rhs_,rhsStride);
- ResMapper res(res_, resStride, resIncr);
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc();
Index mc = (std::min)(size,blocking.mc());
@@ -116,7 +116,7 @@
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0);
- sybb(res_+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+ sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper)
{
@@ -147,11 +147,11 @@
enum {
BlockSize = meta_least_common_multiple<plain_enum_max(mr, nr), plain_enum_min(mr,nr)>::ret
};
- void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+ void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{
typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
- ResMapper res(res_, resStride, resIncr);
+ ResMapper res(_res, resStride, resIncr);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 8133880..aafc1cd 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -46,7 +46,7 @@
for(Index w=0; w<BlockRows; w++)
blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
}
- void operator()(Scalar* blockA, const Scalar* lhs_, Index lhsStride, Index cols, Index rows)
+ void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{
typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
@@ -56,7 +56,7 @@
HasHalf = (int)HalfPacketSize < (int)PacketSize,
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
- const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(lhs_,lhsStride);
+ const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
//Index peeled_mc3 = (rows/Pack1)*Pack1;
@@ -104,11 +104,11 @@
struct symm_pack_rhs
{
enum { PacketSize = packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2)
+ void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
{
Index end_k = k2 + rows;
Index count = 0;
- const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(rhs_,rhsStride);
+ const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
@@ -333,8 +333,8 @@
static EIGEN_DONT_INLINE void run(
Index rows, Index cols,
- const Scalar* lhs_, Index lhsStride,
- const Scalar* rhs_, Index rhsStride,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};
@@ -345,9 +345,9 @@
int ResInnerStride>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
Index rows, Index cols,
- const Scalar* lhs_, Index lhsStride,
- const Scalar* rhs_, Index rhsStride,
- Scalar* res_, Index resIncr, Index resStride,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
Index size = rows;
@@ -358,10 +358,10 @@
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
- LhsMapper lhs(lhs_,lhsStride);
- LhsTransposeMapper lhs_transpose(lhs_,lhsStride);
- RhsMapper rhs(rhs_,rhsStride);
- ResMapper res(res_, resStride, resIncr);
+ LhsMapper lhs(_lhs,lhsStride);
+ LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -428,8 +428,8 @@
static EIGEN_DONT_INLINE void run(
Index rows, Index cols,
- const Scalar* lhs_, Index lhsStride,
- const Scalar* rhs_, Index rhsStride,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};
@@ -440,9 +440,9 @@
int ResInnerStride>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
Index rows, Index cols,
- const Scalar* lhs_, Index lhsStride,
- const Scalar* rhs_, Index rhsStride,
- Scalar* res_, Index resIncr, Index resStride,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
Index size = cols;
@@ -451,8 +451,8 @@
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
- LhsMapper lhs(lhs_,lhsStride);
- ResMapper res(res_,resStride, resIncr);
+ LhsMapper lhs(_lhs,lhsStride);
+ ResMapper res(_res,resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -469,7 +469,7 @@
{
const Index actual_kc = (std::min)(k2+kc,size)-k2;
- pack_rhs(blockB, rhs_, rhsStride, actual_kc, cols, k2);
+ pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
// => GEPP
for(Index i2=0; i2<rows; i2+=mc)
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index bd30dc3..d07d367 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -27,15 +27,15 @@
static constexpr bool IsLower = ((Mode & Lower) == Lower);
static constexpr bool HasUnitDiag = (Mode & UnitDiag) == UnitDiag;
static constexpr bool HasZeroDiag = (Mode & ZeroDiag) == ZeroDiag;
- static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride,
- const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr,
+ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr,
const RhsScalar& alpha);
};
template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
- ::run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride,
- const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const RhsScalar& alpha)
+ ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
{
static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
Index size = (std::min)(_rows,_cols);
@@ -43,15 +43,15 @@
Index cols = IsLower ? (std::min)(_rows,_cols) : _cols;
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
- const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride));
+ const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
- const RhsMap rhs(rhs_,cols,InnerStride<>(rhsIncr));
+ const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
- ResMap res(res_,rows);
+ ResMap res(_res,rows);
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
@@ -86,7 +86,7 @@
rows, cols-size,
LhsMapper(&lhs.coeffRef(0,size), lhsStride),
RhsMapper(&rhs.coeffRef(size), rhsIncr),
- res_, resIncr, alpha);
+ _res, resIncr, alpha);
}
}
@@ -97,15 +97,15 @@
static constexpr bool IsLower = ((Mode & Lower) == Lower);
static constexpr bool HasUnitDiag = (Mode & UnitDiag) == UnitDiag;
static constexpr bool HasZeroDiag = (Mode & ZeroDiag) == ZeroDiag;
- static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride,
- const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr,
+ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr,
const ResScalar& alpha);
};
template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
- ::run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride,
- const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const ResScalar& alpha)
+ ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
{
static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
Index diagSize = (std::min)(_rows,_cols);
@@ -113,15 +113,15 @@
Index cols = IsLower ? diagSize : _cols;
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
- const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride));
+ const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
- const RhsMap rhs(rhs_,cols);
+ const RhsMap rhs(_rhs,cols);
typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
- ResMap res(res_,rows,InnerStride<>(resIncr));
+ ResMap res(_res,rows,InnerStride<>(resIncr));
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
index f62a28a..8b4a636 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -53,18 +53,18 @@
#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
- static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \
- const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
- rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
} \
}; \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
- static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \
- const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
- rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
} \
};
@@ -84,23 +84,23 @@
IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
LowUp = IsLower ? Lower : Upper \
}; \
- static void run(Index rows_, Index cols_, const EIGTYPE* lhs_, Index lhsStride, \
- const EIGTYPE* rhs_, Index rhsIncr, EIGTYPE* res_, Index resIncr, EIGTYPE alpha) \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
{ \
if (ConjLhs || IsZeroDiag) { \
triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
- rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
return; \
}\
- Index size = (std::min)(rows_,cols_); \
- Index rows = IsLower ? rows_ : size; \
- Index cols = IsLower ? size : cols_; \
+ Index size = (std::min)(_rows,_cols); \
+ Index rows = IsLower ? _rows : size; \
+ Index cols = IsLower ? size : _cols; \
\
typedef VectorX##EIGPREFIX VectorRhs; \
EIGTYPE *x, *y;\
\
/* Set x*/ \
- Map<const VectorRhs, 0, InnerStride<> > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
VectorRhs x_tmp; \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
x = x_tmp.data(); \
@@ -124,24 +124,24 @@
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
- BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)lhs_, &lda, (BLASTYPE*)x, &incx); \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
- BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)res_, &incy); \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
x = x_tmp.data(); \
if (size<rows) { \
- y = res_ + size*resIncr; \
- a = lhs_ + size; \
+ y = _res + size*resIncr; \
+ a = _lhs + size; \
m = convert_index<BlasIndex>(rows-size); \
n = convert_index<BlasIndex>(size); \
} \
else { \
x += size; \
- y = res_; \
- a = lhs_ + size*lda; \
+ y = _res; \
+ a = _lhs + size*lda; \
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
@@ -173,23 +173,23 @@
IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
LowUp = IsLower ? Lower : Upper \
}; \
- static void run(Index rows_, Index cols_, const EIGTYPE* lhs_, Index lhsStride, \
- const EIGTYPE* rhs_, Index rhsIncr, EIGTYPE* res_, Index resIncr, EIGTYPE alpha) \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
{ \
if (IsZeroDiag) { \
triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
- rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
return; \
}\
- Index size = (std::min)(rows_,cols_); \
- Index rows = IsLower ? rows_ : size; \
- Index cols = IsLower ? size : cols_; \
+ Index size = (std::min)(_rows,_cols); \
+ Index rows = IsLower ? _rows : size; \
+ Index cols = IsLower ? size : _cols; \
\
typedef VectorX##EIGPREFIX VectorRhs; \
EIGTYPE *x, *y;\
\
/* Set x*/ \
- Map<const VectorRhs, 0, InnerStride<> > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
VectorRhs x_tmp; \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
x = x_tmp.data(); \
@@ -213,24 +213,24 @@
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
- BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)lhs_, &lda, (BLASTYPE*)x, &incx); \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
- BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)res_, &incy); \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
x = x_tmp.data(); \
if (size<rows) { \
- y = res_ + size*resIncr; \
- a = lhs_ + size*lda; \
+ y = _res + size*resIncr; \
+ a = _lhs + size*lda; \
m = convert_index<BlasIndex>(rows-size); \
n = convert_index<BlasIndex>(size); \
} \
else { \
x += size; \
- y = res_; \
- a = lhs_ + size; \
+ y = _res; \
+ a = _lhs + size; \
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 8e4c278..fe6e5de 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -422,6 +422,15 @@
using std::numeric_limits;
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+EIGEN_DEVICE_FUNC
+T div_ceil(const T &a, const T &b)
+{
+ return (a+b-1) / b;
+}
+
// Handle integer comparisons of different signedness.
template <typename X, typename Y, bool XIsInteger = NumTraits<X>::IsInteger, bool XIsSigned = NumTraits<X>::IsSigned,
bool YIsInteger = NumTraits<Y>::IsInteger, bool YIsSigned = NumTraits<Y>::IsSigned>
diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index 9ee9292..96f73cc 100644
--- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -93,9 +93,9 @@
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- remove_all_t<ResultType> res_(res.rows(), res.cols());
- internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,ResultType>(lhs, rhs, res_, tolerance);
- res.swap(res_);
+ remove_all_t<ResultType> _res(res.rows(), res.cols());
+ internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,ResultType>(lhs, rhs, _res, tolerance);
+ res.swap(_res);
}
};
@@ -107,9 +107,9 @@
{
// we need a col-major matrix to hold the result
typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> SparseTemporaryType;
- SparseTemporaryType res_(res.rows(), res.cols());
- internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, res_, tolerance);
- res = res_;
+ SparseTemporaryType _res(res.rows(), res.cols());
+ internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, _res, tolerance);
+ res = _res;
}
};
@@ -120,9 +120,9 @@
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
// let's transpose the product to get a column x column product
- remove_all_t<ResultType> res_(res.rows(), res.cols());
- internal::sparse_sparse_product_with_pruning_impl<Rhs,Lhs,ResultType>(rhs, lhs, res_, tolerance);
- res.swap(res_);
+ remove_all_t<ResultType> _res(res.rows(), res.cols());
+ internal::sparse_sparse_product_with_pruning_impl<Rhs,Lhs,ResultType>(rhs, lhs, _res, tolerance);
+ res.swap(_res);
}
};
@@ -140,9 +140,9 @@
// let's transpose the product to get a column x column product
// typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
-// SparseTemporaryType res_(res.cols(), res.rows());
-// sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, res_);
-// res = res_.transpose();
+// SparseTemporaryType _res(res.cols(), res.rows());
+// sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, _res);
+// res = _res.transpose();
}
};
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e0b2c83..e1a056f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -446,36 +446,34 @@
option(EIGEN_TEST_HIP "Add HIP support." OFF)
if (EIGEN_TEST_HIP)
- set(ROCM_PATH "/opt/rocm" CACHE STRING "Path to the ROCm installation.")
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
- if (EXISTS ${ROCM_PATH}/hip)
- set(HIP_PATH ${ROCM_PATH}/hip)
- list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
- elseif (EXISTS ${ROCM_PATH}/lib/cmake/hip)
- set(HIP_PATH ${ROCM_PATH})
- list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/lib/cmake/hip)
+ if (EXISTS ${HIP_PATH})
+
+ list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+ if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+ include_directories(${HIP_PATH}/include)
+
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+ ei_add_test(gpu_basic)
+ ei_add_test(gpu_example)
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+ elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
+ endif()
else ()
- message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but could not find the ROCm installation under ${ROCM_PATH}")
- endif()
-
- find_package(HIP REQUIRED)
- if (HIP_FOUND)
- execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
-
- if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
-
- include_directories(${HIP_PATH}/include)
-
- set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
- ei_add_test(gpu_basic)
- ei_add_test(gpu_example)
- unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
-
- elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
- message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
- else ()
- message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
- endif()
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
endif()
endif()
diff --git a/test/main.h b/test/main.h
index be6fe14..99149ca 100644
--- a/test/main.h
+++ b/test/main.h
@@ -125,10 +125,6 @@
// B0 is defined in POSIX header termios.h
#define B0 FORBIDDEN_IDENTIFIER
#define I FORBIDDEN_IDENTIFIER
-
-// _res is defined by resolv.h
-#define _res FORBIDDEN_IDENTIFIER
-
// Unit tests calling Eigen's blas library must preserve the default blocking size
// to avoid troubles.
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 4087a8b..aa460ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -443,7 +443,7 @@
const int dim = isColMajor ? i : NumDims - i - 1;
m_block_dimensions[dim] =
numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
- coeff_to_allocate = numext::div_ceil(
+ coeff_to_allocate = divup(
coeff_to_allocate,
numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
}
@@ -474,7 +474,7 @@
const IndexType total_size_other_dims =
total_size / m_block_dimensions[dim];
const IndexType alloc_avail =
- numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);
+ divup<IndexType>(target_block_size, total_size_other_dims);
if (alloc_avail == m_block_dimensions[dim]) {
// Insufficient excess coefficients to allocate.
break;
@@ -496,7 +496,7 @@
// Calculate block counts by dimension and total block count.
DSizes<IndexType, NumDims> block_count;
for (int i = 0; i < NumDims; ++i) {
- block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
+ block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
}
m_total_block_count = array_prod(block_count);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 59e75e5..1cf5035 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -898,7 +898,7 @@
// First multiple after a. This is b when <= bcast_dim_left_index +
// bcast_dim_size.
const Index first_multiple =
- numext::div_ceil<Index>(bcast_dim_left_index, input_bcast_dim_size) *
+ divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
input_bcast_dim_size;
if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index ec34885..f0520e8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -144,8 +144,8 @@
const Index bn) {
Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
BlockSizes sz;
- sz.lhs_size = numext::div_ceil<Index>(bm * bk * sizeof(LhsScalar), align) * align;
- sz.rhs_size = numext::div_ceil<Index>(bn * bk * sizeof(RhsScalar), align) * align;
+ sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
+ sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
return sz;
}
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 7576808..308c23b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -206,9 +206,9 @@
}
// Number of kernels for each dimension.
- Index nm0 = numext::div_ceil(m, bm);
- Index nn0 = numext::div_ceil(n, bn);
- Index nk = numext::div_ceil(k, bk);
+ Index nm0 = divup(m, bm);
+ Index nn0 = divup(n, bn);
+ Index nk = divup(k, bk);
// Calculate task grain size (number of kernels executed per task).
// This task size coarsening serves two purposes:
@@ -226,8 +226,8 @@
gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
}
// Number of tasks in each dimension.
- Index nm = numext::div_ceil(nm0, gm);
- Index nn = numext::div_ceil(nn0, gn);
+ Index nm = divup(nm0, gm);
+ Index nn = divup(nn0, gn);
// If there is enough concurrency in the sharding dimension, we choose not
// to paralellize by the other dimension, and execute all kernels in sync
@@ -1130,9 +1130,9 @@
done(std::move(done_callback)),
buffer_size_bytes(m * n * sizeof(Scalar)),
block_size(blockSize(k, num_threads)),
- num_blocks(numext::div_ceil<Index>(k, block_size)),
+ num_blocks(divup<Index>(k, block_size)),
num_pending_blocks(internal::convert_index<int>(num_blocks)),
- l0_ranges(numext::div_ceil<Index>(num_blocks, l0_size)),
+ l0_ranges(divup<Index>(num_blocks, l0_size)),
l0_state(l0_ranges),
block_buffers(num_blocks) {
// Keep count of pending gemm tasks for each l0 range.
@@ -1434,10 +1434,10 @@
static Index blockSize(Index k, int num_threads) {
const auto round_up = [=](Index index) -> Index {
const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
- return numext::div_ceil<Index>(index, kmultiple) * kmultiple;
+ return divup<Index>(index, kmultiple) * kmultiple;
};
- const Index target_block_size = round_up(numext::div_ceil<Index>(k, num_threads));
+ const Index target_block_size = round_up(divup<Index>(k, num_threads));
const Index desired_min_block_size = 12 * packet_size;
return numext::mini<Index>(
@@ -1485,19 +1485,19 @@
int num_threads, bool shard_by_col) const {
Index gm = 1;
Index gm1 = 1;
- Index nm0 = numext::div_ceil(m, bm);
+ Index nm0 = divup(m, bm);
Index nm1 = nm0;
for (;;) {
// Find the next candidate for m grain size. It needs to result in
// different number of blocks. E.g. if we have 10 kernels, we want to try
// 5 and 10, but not 6, 7, 8 and 9.
- while (gm1 <= nm0 && nm1 == numext::div_ceil(nm0, gm1)) gm1++;
+ while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
if (gm1 > nm0) break;
// Check the candidate.
int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
shard_by_col);
if (res < 0) break;
- nm1 = numext::div_ceil(nm0, gm1);
+ nm1 = divup(nm0, gm1);
if (res == 0) continue;
// Commit new grain size.
gm = gm1;
@@ -1509,15 +1509,15 @@
int num_threads, bool shard_by_col) const {
Index gn = 1;
Index gn1 = 1;
- Index nn0 = numext::div_ceil(n, bn);
+ Index nn0 = divup(n, bn);
Index nn1 = nn0;
for (;;) {
- while (gn1 <= nn0 && nn1 == numext::div_ceil(nn0, gn1)) gn1++;
+ while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
if (gn1 > nn0) break;
int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
shard_by_col);
if (res < 0) break;
- nn1 = numext::div_ceil(nn0, gn1);
+ nn1 = divup(nn0, gn1);
if (res == 0) continue;
gn = gn1;
}
@@ -1544,14 +1544,14 @@
// But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
// of cores will be busy). While grain size 3 gives us 4 tasks, which gives
// us parallelism of 1 (we can load all cores).
- Index nm0 = numext::div_ceil(m, bm);
- Index nn0 = numext::div_ceil(n, bn);
- Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn);
+ Index nm0 = divup(m, bm);
+ Index nn0 = divup(n, bn);
+ Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
double new_parallelism = static_cast<double>(new_tasks) /
- (numext::div_ceil<int>(new_tasks, num_threads) * num_threads);
- Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn);
+ (divup<int>(new_tasks, num_threads) * num_threads);
+ Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
double old_parallelism = static_cast<double>(old_tasks) /
- (numext::div_ceil<int>(old_tasks, num_threads) * num_threads);
+ (divup<int>(old_tasks, num_threads) * num_threads);
if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
return 0;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 53b66c0..6c7ad67 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -223,7 +223,7 @@
Index lastIdx) {
while (lastIdx - firstIdx > block.size) {
// Split into halves and schedule the second half on a different thread.
- const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
+ const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
lastIdx = midIdx;
}
@@ -282,7 +282,7 @@
ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
while (lastIdx - firstIdx > block.size) {
// Split into halves and schedule the second half on a different thread.
- const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
+ const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
pool_->Schedule(
[ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
lastIdx = midIdx;
@@ -357,7 +357,7 @@
const Index max_oversharding_factor = 4;
Index block_size = numext::mini(
n, numext::maxi<Index>(
- numext::div_ceil<Index>(n, max_oversharding_factor * numThreads()),
+ divup<Index>(n, max_oversharding_factor * numThreads()),
block_size_f));
const Index max_block_size = numext::mini(n, 2 * block_size);
@@ -367,13 +367,13 @@
block_size = numext::mini(n, new_block_size);
}
- Index block_count = numext::div_ceil(n, block_size);
+ Index block_count = divup(n, block_size);
// Calculate parallel efficiency as fraction of total CPU time used for
// computations:
double max_efficiency =
static_cast<double>(block_count) /
- (numext::div_ceil<int>(block_count, numThreads()) * numThreads());
+ (divup<int>(block_count, numThreads()) * numThreads());
// Now try to increase block size up to max_block_size as long as it
// doesn't decrease parallel efficiency.
@@ -381,7 +381,7 @@
max_efficiency < 1.0 && prev_block_count > 1;) {
// This is the next block size that divides size into a smaller number
// of blocks than the current block_size.
- Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1);
+ Index coarser_block_size = divup(n, prev_block_count - 1);
if (block_align) {
Index new_block_size = block_align(coarser_block_size);
eigen_assert(new_block_size >= coarser_block_size);
@@ -391,12 +391,12 @@
break; // Reached max block size. Stop.
}
// Recalculate parallel efficiency.
- const Index coarser_block_count = numext::div_ceil(n, coarser_block_size);
+ const Index coarser_block_count = divup(n, coarser_block_size);
eigen_assert(coarser_block_count < prev_block_count);
prev_block_count = coarser_block_count;
const double coarser_efficiency =
static_cast<double>(coarser_block_count) /
- (numext::div_ceil<int>(coarser_block_count, numThreads()) * numThreads());
+ (divup<int>(coarser_block_count, numThreads()) * numThreads());
if (coarser_efficiency + 0.01 >= max_efficiency) {
// Taking it.
block_size = coarser_block_size;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 4eebbe7..461abe4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -261,7 +261,7 @@
const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
const size_t aligned_blocksize =
align *
- numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
+ divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
return {block_mapper, requirements.cost_per_coeff * block_size,
aligned_blocksize};
@@ -661,7 +661,7 @@
block_size;
const StorageIndex size = array_prod(evaluator.dimensions());
// Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
- const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, numext::div_ceil<int>(size, block_size)), 1);
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
LAUNCH_GPU_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 524432e..b7c2cb8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -27,6 +27,21 @@
return second;
}
+
+template <typename T, typename X, typename Y>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const X x, const Y y) {
+ // Note: This form is used because it cannot overflow.
+ return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const T x, const T y) {
+ // Note: This form is used because it cannot overflow.
+ return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
+}
+
template <size_t n> struct max_n_1 {
static const size_t size = n;
};
@@ -34,11 +49,6 @@
static const size_t size = 1;
};
-template <typename T>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-constexpr T divup(const T x, const T y) {
- return Eigen::numext::div_ceil(x, y);
-}
// Default packet types
template <typename Scalar, typename Device>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 7348a71..aee86fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -414,7 +414,7 @@
typedef typename Self::Index Index;
const int block_size = 256;
const int num_per_thread = 128;
- const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
unsigned int* semaphore = NULL;
if (num_blocks > 1) {
@@ -441,7 +441,7 @@
const int block_size = 256;
const int num_per_thread = 128;
- const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
half* scratch = static_cast<half*>(device.scratchpad());
if (num_blocks > 1) {
@@ -507,7 +507,7 @@
const int unroll_times = 16;
eigen_assert(NumPerThread % unroll_times == 0);
- const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
const Index num_threads = blockDim.x * gridDim.x;
@@ -593,8 +593,8 @@
eigen_assert(NumPerThread % unroll_times == 0);
eigen_assert(unroll_times % 2 == 0);
- const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
- const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+ const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -785,7 +785,7 @@
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 128;
- const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -793,7 +793,7 @@
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be a race conditions between multiple thread blocks.
- const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
+ const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
const int max_blocks2 = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
@@ -831,7 +831,7 @@
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = /*256*/128;
const int num_per_thread = /*128*/64;
- const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -900,7 +900,7 @@
}
// Do the reduction.
- const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread);
+ const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
for (Index i = thread_id; i < max_iter; i += num_threads) {
const Index input_col = i % num_preserved_coeffs;
const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
@@ -953,7 +953,7 @@
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
- const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -961,7 +961,7 @@
if (num_blocks > 1) {
// We initialize the outputs in the reduction kernel itself when we don't have to worry
// about race conditions between multiple thread blocks.
- const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
+ const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
const int max_blocks2 = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 4f4a93e..169a7a2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -215,7 +215,7 @@
EIGEN_CONSTEXPR Index kBlockAlignment = 128;
const Index items_per_cacheline =
numext::maxi<Index>(1, kBlockAlignment / item_size);
- return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
+ return items_per_cacheline * divup(block_size, items_per_cacheline);
}
template <typename Self>
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 3a985cf..d41baf2 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -287,52 +287,49 @@
# Add HIP specific tests
if (EIGEN_TEST_HIP)
- set(ROCM_PATH "/opt/rocm" CACHE STRING "Path to the ROCm installation.")
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
- if (EXISTS ${ROCM_PATH}/hip)
- set(HIP_PATH ${ROCM_PATH}/hip)
+ if (EXISTS ${HIP_PATH})
list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
- elseif (EXISTS ${ROCM_PATH}/lib/cmake/hip)
- set(HIP_PATH ${ROCM_PATH})
- list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/lib/cmake/hip)
- else ()
- message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but could not find the ROCm installation under ${ROCM_PATH}")
- endif()
- find_package(HIP REQUIRED)
- if (HIP_FOUND)
- execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
- if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
- include_directories(${CMAKE_CURRENT_BINARY_DIR})
- include_directories(${HIP_PATH}/include)
+ if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
+ include_directories(${HIP_PATH}/include)
- set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
- #
- # complex datatype is not yet supported by HIP
- # so leaving out those tests for now
- #
- # ei_add_test(cxx11_tensor_complex_gpu)
- # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
- #
- ei_add_test(cxx11_tensor_reduction_gpu)
- ei_add_test(cxx11_tensor_argmax_gpu)
- ei_add_test(cxx11_tensor_cast_float16_gpu)
- ei_add_test(cxx11_tensor_scan_gpu)
- ei_add_test(cxx11_tensor_device)
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+ #
+ # complex datatype is not yet supported by HIP
+ # so leaving out those tests for now
+ #
+ # ei_add_test(cxx11_tensor_complex_gpu)
+ # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+ #
+ ei_add_test(cxx11_tensor_reduction_gpu)
+ ei_add_test(cxx11_tensor_argmax_gpu)
+ ei_add_test(cxx11_tensor_cast_float16_gpu)
+ ei_add_test(cxx11_tensor_scan_gpu)
+ ei_add_test(cxx11_tensor_device)
- ei_add_test(cxx11_tensor_gpu)
- ei_add_test(cxx11_tensor_contract_gpu)
- ei_add_test(cxx11_tensor_of_float16_gpu)
- ei_add_test(cxx11_tensor_of_bfloat16_gpu)
- ei_add_test(cxx11_tensor_random_gpu)
+ ei_add_test(cxx11_tensor_gpu)
+ ei_add_test(cxx11_tensor_contract_gpu)
+ ei_add_test(cxx11_tensor_of_float16_gpu)
+ ei_add_test(cxx11_tensor_of_bfloat16_gpu)
+ ei_add_test(cxx11_tensor_random_gpu)
- unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
- elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
- message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
- else ()
- message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
endif()
+ else ()
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
endif()
+
endif()