Update Eigen to commit:aa6964bf3a34fd607837dd8123bc42465185c4f8 CHANGELOG ========= aa6964bf3 - Work around MSVC issue in Block XprType. 877c2d1e9 - fix typo in comment 0c9526912 - Pass div_ceil arguments by value. d9839718a - [ROCm] Replace HIP_PATH with ROCM_PATH for rocm 6.0 5bdf58b8d - Eliminate use of _res. a96545777 - Consolidate multiple implementations of divup/div_up/div_ceil. PiperOrigin-RevId: 575157496 Change-Id: I3154dd9ac5d8713a1630caf4f502e3c082bbbbc0
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 248d297..31cd094 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h
@@ -20,40 +20,41 @@ template<typename XprType_, int BlockRows, int BlockCols, bool InnerPanel_> struct traits<Block<XprType_, BlockRows, BlockCols, InnerPanel_> > : traits<XprType_> { - typedef typename traits<XprType_>::Scalar Scalar; - typedef typename traits<XprType_>::StorageKind StorageKind; - typedef typename traits<XprType_>::XprKind XprKind; - typedef typename ref_selector<XprType_>::type XprTypeNested; + typedef XprType_ XprType; + typedef typename traits<XprType>::Scalar Scalar; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::XprKind XprKind; + typedef typename ref_selector<XprType>::type XprTypeNested; typedef std::remove_reference_t<XprTypeNested> XprTypeNested_; enum{ - MatrixRows = traits<XprType_>::RowsAtCompileTime, - MatrixCols = traits<XprType_>::ColsAtCompileTime, + MatrixRows = traits<XprType>::RowsAtCompileTime, + MatrixCols = traits<XprType>::ColsAtCompileTime, RowsAtCompileTime = MatrixRows == 0 ? 0 : BlockRows, ColsAtCompileTime = MatrixCols == 0 ? 0 : BlockCols, MaxRowsAtCompileTime = BlockRows==0 ? 0 : RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) - : int(traits<XprType_>::MaxRowsAtCompileTime), + : int(traits<XprType>::MaxRowsAtCompileTime), MaxColsAtCompileTime = BlockCols==0 ? 0 : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) - : int(traits<XprType_>::MaxColsAtCompileTime), + : int(traits<XprType>::MaxColsAtCompileTime), - XprTypeIsRowMajor = (int(traits<XprType_>::Flags)&RowMajorBit) != 0, + XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0, IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 : XprTypeIsRowMajor, HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor), InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime), InnerStrideAtCompileTime = HasSameStorageOrderAsXprType - ? int(inner_stride_at_compile_time<XprType_>::ret) - : int(outer_stride_at_compile_time<XprType_>::ret), + ? int(inner_stride_at_compile_time<XprType>::ret) + : int(outer_stride_at_compile_time<XprType>::ret), OuterStrideAtCompileTime = HasSameStorageOrderAsXprType - ? int(outer_stride_at_compile_time<XprType_>::ret) - : int(inner_stride_at_compile_time<XprType_>::ret), + ? int(outer_stride_at_compile_time<XprType>::ret) + : int(inner_stride_at_compile_time<XprType>::ret), // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further - FlagsLvalueBit = is_lvalue<XprType_>::value ? LvalueBit : 0, + FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, - Flags = (traits<XprType_>::Flags & (DirectAccessBit | (InnerPanel_?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit, + Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel_?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit, // FIXME DirectAccessBit should not be handled by expressions // // Alignment is needed by MapBase's assertions @@ -86,7 +87,7 @@ * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and * most of the time this is the only way it is used. * - * However, if you want to directly manipulate block expressions, + * However, if you want to directly maniputate block expressions, * for instance if you want to write a function returning such an expression, you * will need to use this class. *
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 0f5a0fd..3d801e9 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h
@@ -1341,19 +1341,6 @@ #endif -// Integer division with rounding up. -// T is assumed to be an integer type with a>=0, and b>0 -template<typename T> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR -T div_ceil(T a, T b) -{ - EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES) - eigen_assert(a >= 0); - eigen_assert(b > 0); - // Note: This form is used because it cannot overflow. - return a == 0 ? 0 : (a - 1) / b + 1; -} - /** Log base 2 for 32 bits positive integers. * Conveniently returns 0 for x==0. */ inline int log2(int x)
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h index 2df1704..7220bfa 100644 --- a/Eigen/src/Core/arch/AVX512/GemmKernel.h +++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -90,6 +90,8 @@ const Index a_stride, b_stride; const Index a_off, b_off; + static EIGEN_ALWAYS_INLINE constexpr int div_up(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; } + EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) { _mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0); } @@ -477,7 +479,7 @@ * * const Scalar *cox = (idx == 0) ? co1 : co2; * - * const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line); + * const int um_vecs = div_up(a_unroll, nelems_in_cache_line); * scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg); * write_c<0, um_vecs, idx, a_unroll>(cox); * @@ -496,7 +498,7 @@ EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) { if (pow >= 4) cox += ldc; - const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line); + const int um_vecs = div_up(a_unroll, nelems_in_cache_line); auto &alpha_reg = zmm[alpha_load_reg]; scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg); @@ -642,7 +644,7 @@ template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch, bool no_a_preload = false> EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo, Scalar *&co2, int &fetchA_idx, int &fetchB_idx) { - const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line); + const int um_vecs = div_up(a_unroll, nelems_in_cache_line); if (max_b_unroll >= 1) innerkernel_1pow<uk, 1, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx); @@ -727,7 +729,7 @@ template <int a_unroll, int b_unroll, int max_b_unroll> EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) { - const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line); + const int um_vecs = div_up(a_unroll, nelems_in_cache_line); if (!use_less_a_regs && k > 1) a_loads<0, 2, 0, um_vecs, a_unroll>(ao); else
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index e8bb821..d7632e6 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -62,9 +62,9 @@ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, - const LhsScalar* lhs_, Index lhsStride, - const RhsScalar* rhs_, Index rhsStride, - ResScalar* res_, Index resIncr, Index resStride, + const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* _res, Index resIncr, Index resStride, ResScalar alpha, level3_blocking<LhsScalar,RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) @@ -72,9 +72,9 @@ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper; - LhsMapper lhs(lhs_, lhsStride); - RhsMapper rhs(rhs_, rhsStride); - ResMapper res(res_, resStride, resIncr); + LhsMapper lhs(_lhs, lhsStride); + RhsMapper rhs(_rhs, rhsStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 55b637b..03a4107 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -63,9 +63,9 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version> { typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; - static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs_, Index lhsStride, - const RhsScalar* rhs_, Index rhsStride, - ResScalar* res_, Index resIncr, Index resStride, + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* _res, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking) { typedef gebp_traits<LhsScalar,RhsScalar> Traits; @@ -73,9 +73,9 @@ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; - LhsMapper lhs(lhs_,lhsStride); - RhsMapper rhs(rhs_,rhsStride); - ResMapper res(res_, resStride, resIncr); + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); Index mc = (std::min)(size,blocking.mc()); @@ -116,7 +116,7 @@ gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, -1, -1, 0, 0); - sybb(res_+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { @@ -147,11 +147,11 @@ enum { BlockSize = meta_least_common_multiple<plain_enum_max(mr, nr), plain_enum_min(mr,nr)>::ret }; - void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper; - ResMapper res(res_, resStride, resIncr); + ResMapper res(_res, resStride, resIncr); gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1; gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 8133880..aafc1cd 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -46,7 +46,7 @@ for(Index w=0; w<BlockRows; w++) blockA[count++] = numext::conj(lhs(k, i+w)); // transposed } - void operator()(Scalar* blockA, const Scalar* lhs_, Index lhsStride, Index cols, Index rows) + void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket; typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket; @@ -56,7 +56,7 @@ HasHalf = (int)HalfPacketSize < (int)PacketSize, HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; - const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(lhs_,lhsStride); + const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; @@ -104,11 +104,11 @@ struct symm_pack_rhs { enum { PacketSize = packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2) + void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; - const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(rhs_,rhsStride); + const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride); Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; @@ -333,8 +333,8 @@ static EIGEN_DONT_INLINE void run( Index rows, Index cols, - const Scalar* lhs_, Index lhsStride, - const Scalar* rhs_, Index rhsStride, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); }; @@ -345,9 +345,9 @@ int ResInnerStride> EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run( Index rows, Index cols, - const Scalar* lhs_, Index lhsStride, - const Scalar* rhs_, Index rhsStride, - Scalar* res_, Index resIncr, Index resStride, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { Index size = rows; @@ -358,10 +358,10 @@ typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper; typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; - LhsMapper lhs(lhs_,lhsStride); - LhsTransposeMapper lhs_transpose(lhs_,lhsStride); - RhsMapper rhs(rhs_,rhsStride); - ResMapper res(res_, resStride, resIncr); + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -428,8 +428,8 @@ static EIGEN_DONT_INLINE void run( Index rows, Index cols, - const Scalar* lhs_, Index lhsStride, - const Scalar* rhs_, Index rhsStride, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); }; @@ -440,9 +440,9 @@ int ResInnerStride> EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run( Index rows, Index cols, - const Scalar* lhs_, Index lhsStride, - const Scalar* rhs_, Index rhsStride, - Scalar* res_, Index resIncr, Index resStride, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { Index size = cols; @@ -451,8 +451,8 @@ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; - LhsMapper lhs(lhs_,lhsStride); - ResMapper res(res_,resStride, resIncr); + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -469,7 +469,7 @@ { const Index actual_kc = (std::min)(k2+kc,size)-k2; - pack_rhs(blockB, rhs_, rhsStride, actual_kc, cols, k2); + pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2); // => GEPP for(Index i2=0; i2<rows; i2+=mc)
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index bd30dc3..d07d367 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -27,15 +27,15 @@ static constexpr bool IsLower = ((Mode & Lower) == Lower); static constexpr bool HasUnitDiag = (Mode & UnitDiag) == UnitDiag; static constexpr bool HasZeroDiag = (Mode & ZeroDiag) == ZeroDiag; - static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride, - const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, + static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha); }; template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version> EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version> - ::run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride, - const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const RhsScalar& alpha) + ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index size = (std::min)(_rows,_cols); @@ -43,15 +43,15 @@ Index cols = IsLower ? (std::min)(_rows,_cols) : _cols; typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; - const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap; - const RhsMap rhs(rhs_,cols,InnerStride<>(rhsIncr)); + const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap; - ResMap res(res_,rows); + ResMap res(_res,rows); typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; @@ -86,7 +86,7 @@ rows, cols-size, LhsMapper(&lhs.coeffRef(0,size), lhsStride), RhsMapper(&rhs.coeffRef(size), rhsIncr), - res_, resIncr, alpha); + _res, resIncr, alpha); } } @@ -97,15 +97,15 @@ static constexpr bool IsLower = ((Mode & Lower) == Lower); static constexpr bool HasUnitDiag = (Mode & UnitDiag) == UnitDiag; static constexpr bool HasZeroDiag = (Mode & ZeroDiag) == ZeroDiag; - static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride, - const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, + static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha); }; template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version> EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version> - ::run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride, - const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const ResScalar& alpha) + ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index diagSize = (std::min)(_rows,_cols); @@ -113,15 +113,15 @@ Index cols = IsLower ? diagSize : _cols; typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap; - const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap; - const RhsMap rhs(rhs_,cols); + const RhsMap rhs(_rhs,cols); typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap; - ResMap res(res_,rows,InnerStride<>(resIncr)); + ResMap res(_res,rows,InnerStride<>(resIncr)); typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index f62a28a..8b4a636 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -53,18 +53,18 @@ #define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \ - static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ - const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ + static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ + const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \ - rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ } \ }; \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \ - static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ - const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ + static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ + const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \ - rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ } \ }; @@ -84,23 +84,23 @@ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ LowUp = IsLower ? Lower : Upper \ }; \ - static void run(Index rows_, Index cols_, const EIGTYPE* lhs_, Index lhsStride, \ - const EIGTYPE* rhs_, Index rhsIncr, EIGTYPE* res_, Index resIncr, EIGTYPE alpha) \ + static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ { \ if (ConjLhs || IsZeroDiag) { \ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \ - rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(rows_,cols_); \ - Index rows = IsLower ? rows_ : size; \ - Index cols = IsLower ? size : cols_; \ + Index size = (std::min)(_rows,_cols); \ + Index rows = IsLower ? _rows : size; \ + Index cols = IsLower ? size : _cols; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map<const VectorRhs, 0, InnerStride<> > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ + Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -124,24 +124,24 @@ diag = IsUnitDiag ? 'U' : 'N'; \ \ /* call ?TRMV*/ \ - BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)lhs_, &lda, (BLASTYPE*)x, &incx); \ + BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ \ /* Add op(a_tr)rhs into res*/ \ - BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)res_, &incy); \ + BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ /* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size<rows) { \ - y = res_ + size*resIncr; \ - a = lhs_ + size; \ + y = _res + size*resIncr; \ + a = _lhs + size; \ m = convert_index<BlasIndex>(rows-size); \ n = convert_index<BlasIndex>(size); \ } \ else { \ x += size; \ - y = res_; \ - a = lhs_ + size*lda; \ + y = _res; \ + a = _lhs + size*lda; \ m = convert_index<BlasIndex>(size); \ n = convert_index<BlasIndex>(cols-size); \ } \ @@ -173,23 +173,23 @@ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ LowUp = IsLower ? Lower : Upper \ }; \ - static void run(Index rows_, Index cols_, const EIGTYPE* lhs_, Index lhsStride, \ - const EIGTYPE* rhs_, Index rhsIncr, EIGTYPE* res_, Index resIncr, EIGTYPE alpha) \ + static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ { \ if (IsZeroDiag) { \ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \ - rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(rows_,cols_); \ - Index rows = IsLower ? rows_ : size; \ - Index cols = IsLower ? size : cols_; \ + Index size = (std::min)(_rows,_cols); \ + Index rows = IsLower ? _rows : size; \ + Index cols = IsLower ? size : _cols; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map<const VectorRhs, 0, InnerStride<> > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ + Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -213,24 +213,24 @@ diag = IsUnitDiag ? 'U' : 'N'; \ \ /* call ?TRMV*/ \ - BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)lhs_, &lda, (BLASTYPE*)x, &incx); \ + BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ \ /* Add op(a_tr)rhs into res*/ \ - BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)res_, &incy); \ + BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ /* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size<rows) { \ - y = res_ + size*resIncr; \ - a = lhs_ + size*lda; \ + y = _res + size*resIncr; \ + a = _lhs + size*lda; \ m = convert_index<BlasIndex>(rows-size); \ n = convert_index<BlasIndex>(size); \ } \ else { \ x += size; \ - y = res_; \ - a = lhs_ + size; \ + y = _res; \ + a = _lhs + size; \ m = convert_index<BlasIndex>(size); \ n = convert_index<BlasIndex>(cols-size); \ } \
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 8e4c278..fe6e5de 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h
@@ -422,6 +422,15 @@ using std::numeric_limits; +// Integer division with rounding up. +// T is assumed to be an integer type with a>=0, and b>0 +template<typename T> +EIGEN_DEVICE_FUNC +T div_ceil(const T &a, const T &b) +{ + return (a+b-1) / b; +} + // Handle integer comparisons of different signedness. template <typename X, typename Y, bool XIsInteger = NumTraits<X>::IsInteger, bool XIsSigned = NumTraits<X>::IsSigned, bool YIsInteger = NumTraits<Y>::IsInteger, bool YIsSigned = NumTraits<Y>::IsSigned>
diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 9ee9292..96f73cc 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -93,9 +93,9 @@ static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) { - remove_all_t<ResultType> res_(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,ResultType>(lhs, rhs, res_, tolerance); - res.swap(res_); + remove_all_t<ResultType> _res(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,ResultType>(lhs, rhs, _res, tolerance); + res.swap(_res); } }; @@ -107,9 +107,9 @@ { // we need a col-major matrix to hold the result typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> SparseTemporaryType; - SparseTemporaryType res_(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, res_, tolerance); - res = res_; + SparseTemporaryType _res(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, _res, tolerance); + res = _res; } }; @@ -120,9 +120,9 @@ static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) { // let's transpose the product to get a column x column product - remove_all_t<ResultType> res_(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl<Rhs,Lhs,ResultType>(rhs, lhs, res_, tolerance); - res.swap(res_); + remove_all_t<ResultType> _res(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl<Rhs,Lhs,ResultType>(rhs, lhs, _res, tolerance); + res.swap(_res); } }; @@ -140,9 +140,9 @@ // let's transpose the product to get a column x column product // typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType; -// SparseTemporaryType res_(res.cols(), res.rows()); -// sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, res_); -// res = res_.transpose(); +// SparseTemporaryType _res(res.cols(), res.rows()); +// sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, _res); +// res = _res.transpose(); } };
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e0b2c83..e1a056f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt
@@ -446,36 +446,34 @@ option(EIGEN_TEST_HIP "Add HIP support." OFF) if (EIGEN_TEST_HIP) - set(ROCM_PATH "/opt/rocm" CACHE STRING "Path to the ROCm installation.") + set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") - if (EXISTS ${ROCM_PATH}/hip) - set(HIP_PATH ${ROCM_PATH}/hip) - list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) - elseif (EXISTS ${ROCM_PATH}/lib/cmake/hip) - set(HIP_PATH ${ROCM_PATH}) - list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/lib/cmake/hip) + if (EXISTS ${HIP_PATH}) + + list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) + + find_package(HIP REQUIRED) + if (HIP_FOUND) + + execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) + + if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd")) + + include_directories(${HIP_PATH}/include) + + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + ei_add_test(gpu_basic) + ei_add_test(gpu_example) + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) + + elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia")) + message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") + else () + message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") + endif() + endif() else () - message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but could not find the ROCm installation under ${ROCM_PATH}") - endif() - - find_package(HIP REQUIRED) - if (HIP_FOUND) - execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) - - if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd")) - - include_directories(${HIP_PATH}/include) - - set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - ei_add_test(gpu_basic) - ei_add_test(gpu_example) - unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) - - elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia")) - message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") - else () - message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") - endif() + message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") endif() endif()
diff --git a/test/main.h b/test/main.h index be6fe14..99149ca 100644 --- a/test/main.h +++ b/test/main.h
@@ -125,10 +125,6 @@ // B0 is defined in POSIX header termios.h #define B0 FORBIDDEN_IDENTIFIER #define I FORBIDDEN_IDENTIFIER - -// _res is defined by resolv.h -#define _res FORBIDDEN_IDENTIFIER - // Unit tests calling Eigen's blas library must preserve the default blocking size // to avoid troubles. #ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 4087a8b..aa460ba 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -443,7 +443,7 @@ const int dim = isColMajor ? i : NumDims - i - 1; m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); - coeff_to_allocate = numext::div_ceil( + coeff_to_allocate = divup( coeff_to_allocate, numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim])); } @@ -474,7 +474,7 @@ const IndexType total_size_other_dims = total_size / m_block_dimensions[dim]; const IndexType alloc_avail = - numext::div_ceil<IndexType>(target_block_size, total_size_other_dims); + divup<IndexType>(target_block_size, total_size_other_dims); if (alloc_avail == m_block_dimensions[dim]) { // Insufficient excess coefficients to allocate. break; @@ -496,7 +496,7 @@ // Calculate block counts by dimension and total block count. DSizes<IndexType, NumDims> block_count; for (int i = 0; i < NumDims; ++i) { - block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]); + block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); } m_total_block_count = array_prod(block_count);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 59e75e5..1cf5035 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -898,7 +898,7 @@ // First multiple after a. This is b when <= bcast_dim_left_index + // bcast_dim_size. const Index first_multiple = - numext::div_ceil<Index>(bcast_dim_left_index, input_bcast_dim_size) * + divup<Index>(bcast_dim_left_index, input_bcast_dim_size) * input_bcast_dim_size; if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index ec34885..f0520e8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -144,8 +144,8 @@ const Index bn) { Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); BlockSizes sz; - sz.lhs_size = numext::div_ceil<Index>(bm * bk * sizeof(LhsScalar), align) * align; - sz.rhs_size = numext::div_ceil<Index>(bn * bk * sizeof(RhsScalar), align) * align; + sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align; + sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align; return sz; } };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 7576808..308c23b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -206,9 +206,9 @@ } // Number of kernels for each dimension. - Index nm0 = numext::div_ceil(m, bm); - Index nn0 = numext::div_ceil(n, bn); - Index nk = numext::div_ceil(k, bk); + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index nk = divup(k, bk); // Calculate task grain size (number of kernels executed per task). // This task size coarsening serves two purposes: @@ -226,8 +226,8 @@ gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); } // Number of tasks in each dimension. - Index nm = numext::div_ceil(nm0, gm); - Index nn = numext::div_ceil(nn0, gn); + Index nm = divup(nm0, gm); + Index nn = divup(nn0, gn); // If there is enough concurrency in the sharding dimension, we choose not // to paralellize by the other dimension, and execute all kernels in sync @@ -1130,9 +1130,9 @@ done(std::move(done_callback)), buffer_size_bytes(m * n * sizeof(Scalar)), block_size(blockSize(k, num_threads)), - num_blocks(numext::div_ceil<Index>(k, block_size)), + num_blocks(divup<Index>(k, block_size)), num_pending_blocks(internal::convert_index<int>(num_blocks)), - l0_ranges(numext::div_ceil<Index>(num_blocks, l0_size)), + l0_ranges(divup<Index>(num_blocks, l0_size)), l0_state(l0_ranges), block_buffers(num_blocks) { // Keep count of pending gemm tasks for each l0 range. @@ -1434,10 +1434,10 @@ static Index blockSize(Index k, int num_threads) { const auto round_up = [=](Index index) -> Index { const Index kmultiple = packet_size <= 8 ? 8 : packet_size; - return numext::div_ceil<Index>(index, kmultiple) * kmultiple; + return divup<Index>(index, kmultiple) * kmultiple; }; - const Index target_block_size = round_up(numext::div_ceil<Index>(k, num_threads)); + const Index target_block_size = round_up(divup<Index>(k, num_threads)); const Index desired_min_block_size = 12 * packet_size; return numext::mini<Index>( @@ -1485,19 +1485,19 @@ int num_threads, bool shard_by_col) const { Index gm = 1; Index gm1 = 1; - Index nm0 = numext::div_ceil(m, bm); + Index nm0 = divup(m, bm); Index nm1 = nm0; for (;;) { // Find the next candidate for m grain size. It needs to result in // different number of blocks. E.g. if we have 10 kernels, we want to try // 5 and 10, but not 6, 7, 8 and 9. - while (gm1 <= nm0 && nm1 == numext::div_ceil(nm0, gm1)) gm1++; + while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; if (gm1 > nm0) break; // Check the candidate. int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, shard_by_col); if (res < 0) break; - nm1 = numext::div_ceil(nm0, gm1); + nm1 = divup(nm0, gm1); if (res == 0) continue; // Commit new grain size. gm = gm1; @@ -1509,15 +1509,15 @@ int num_threads, bool shard_by_col) const { Index gn = 1; Index gn1 = 1; - Index nn0 = numext::div_ceil(n, bn); + Index nn0 = divup(n, bn); Index nn1 = nn0; for (;;) { - while (gn1 <= nn0 && nn1 == numext::div_ceil(nn0, gn1)) gn1++; + while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; if (gn1 > nn0) break; int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, shard_by_col); if (res < 0) break; - nn1 = numext::div_ceil(nn0, gn1); + nn1 = divup(nn0, gn1); if (res == 0) continue; gn = gn1; } @@ -1544,14 +1544,14 @@ // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 // of cores will be busy). While grain size 3 gives us 4 tasks, which gives // us parallelism of 1 (we can load all cores). - Index nm0 = numext::div_ceil(m, bm); - Index nn0 = numext::div_ceil(n, bn); - Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn); + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index new_tasks = divup(nm0, gm) * divup(nn0, gn); double new_parallelism = static_cast<double>(new_tasks) / - (numext::div_ceil<int>(new_tasks, num_threads) * num_threads); - Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn); + (divup<int>(new_tasks, num_threads) * num_threads); + Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); double old_parallelism = static_cast<double>(old_tasks) / - (numext::div_ceil<int>(old_tasks, num_threads) * num_threads); + (divup<int>(old_tasks, num_threads) * num_threads); if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; return 0; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 53b66c0..6c7ad67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -223,7 +223,7 @@ Index lastIdx) { while (lastIdx - firstIdx > block.size) { // Split into halves and schedule the second half on a different thread. - const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size; + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); lastIdx = midIdx; } @@ -282,7 +282,7 @@ ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { while (lastIdx - firstIdx > block.size) { // Split into halves and schedule the second half on a different thread. - const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size; + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; pool_->Schedule( [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); lastIdx = midIdx; @@ -357,7 +357,7 @@ const Index max_oversharding_factor = 4; Index block_size = numext::mini( n, numext::maxi<Index>( - numext::div_ceil<Index>(n, max_oversharding_factor * numThreads()), + divup<Index>(n, max_oversharding_factor * numThreads()), block_size_f)); const Index max_block_size = numext::mini(n, 2 * block_size); @@ -367,13 +367,13 @@ block_size = numext::mini(n, new_block_size); } - Index block_count = numext::div_ceil(n, block_size); + Index block_count = divup(n, block_size); // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = static_cast<double>(block_count) / - (numext::div_ceil<int>(block_count, numThreads()) * numThreads()); + (divup<int>(block_count, numThreads()) * numThreads()); // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. @@ -381,7 +381,7 @@ max_efficiency < 1.0 && prev_block_count > 1;) { // This is the next block size that divides size into a smaller number // of blocks than the current block_size. - Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1); + Index coarser_block_size = divup(n, prev_block_count - 1); if (block_align) { Index new_block_size = block_align(coarser_block_size); eigen_assert(new_block_size >= coarser_block_size); @@ -391,12 +391,12 @@ break; // Reached max block size. Stop. } // Recalculate parallel efficiency. - const Index coarser_block_count = numext::div_ceil(n, coarser_block_size); + const Index coarser_block_count = divup(n, coarser_block_size); eigen_assert(coarser_block_count < prev_block_count); prev_block_count = coarser_block_count; const double coarser_efficiency = static_cast<double>(coarser_block_count) / - (numext::div_ceil<int>(coarser_block_count, numThreads()) * numThreads()); + (divup<int>(coarser_block_count, numThreads()) * numThreads()); if (coarser_efficiency + 0.01 >= max_efficiency) { // Taking it. block_size = coarser_block_size;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 4eebbe7..461abe4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -261,7 +261,7 @@ const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); const size_t aligned_blocksize = align * - numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align); + divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align); return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize}; @@ -661,7 +661,7 @@ block_size; const StorageIndex size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. - const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, numext::div_ceil<int>(size, block_size)), 1); + const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); LAUNCH_GPU_KERNEL( (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 524432e..b7c2cb8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -27,6 +27,21 @@ return second; } + +template <typename T, typename X, typename Y> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const X x, const Y y) { + // Note: This form is used because it cannot overflow. + return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1); +} + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const T x, const T y) { + // Note: This form is used because it cannot overflow. + return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1); +} + template <size_t n> struct max_n_1 { static const size_t size = n; }; @@ -34,11 +49,6 @@ static const size_t size = 1; }; -template <typename T> -EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -constexpr T divup(const T x, const T y) { - return Eigen::numext::div_ceil(x, y); -} // Default packet types template <typename Scalar, typename Device>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index 7348a71..aee86fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -414,7 +414,7 @@ typedef typename Self::Index Index; const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread); + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); unsigned int* semaphore = NULL; if (num_blocks > 1) { @@ -441,7 +441,7 @@ const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread); + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); half* scratch = static_cast<half*>(device.scratchpad()); if (num_blocks > 1) { @@ -507,7 +507,7 @@ const int unroll_times = 16; eigen_assert(NumPerThread % unroll_times == 0); - const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread); + const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread); const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; const Index num_threads = blockDim.x * gridDim.x; @@ -593,8 +593,8 @@ eigen_assert(NumPerThread % unroll_times == 0); eigen_assert(unroll_times % 2 == 0); - const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); - const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2); + const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); + const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2); const Index num_threads = blockDim.x * gridDim.x; const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -785,7 +785,7 @@ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; - const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread); + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); @@ -793,7 +793,7 @@ if (num_blocks > 1) { // We initialize the outputs outside the reduction kernel when we can't be sure that there // won't be a race conditions between multiple thread blocks. - const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024); + const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024); const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024; const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2); @@ -831,7 +831,7 @@ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = /*256*/128; const int num_per_thread = /*128*/64; - const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread); + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); @@ -900,7 +900,7 @@ } // Do the reduction. - const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread); + const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread); for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; @@ -953,7 +953,7 @@ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; - const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread); + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); @@ -961,7 +961,7 @@ if (num_blocks > 1) { // We initialize the outputs in the reduction kernel itself when we don't have to worry // about race conditions between multiple thread blocks. - const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024); + const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024); const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024; const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 4f4a93e..169a7a2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -215,7 +215,7 @@ EIGEN_CONSTEXPR Index kBlockAlignment = 128; const Index items_per_cacheline = numext::maxi<Index>(1, kBlockAlignment / item_size); - return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline); + return items_per_cacheline * divup(block_size, items_per_cacheline); } template <typename Self>
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 3a985cf..d41baf2 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt
@@ -287,52 +287,49 @@ # Add HIP specific tests if (EIGEN_TEST_HIP) - set(ROCM_PATH "/opt/rocm" CACHE STRING "Path to the ROCm installation.") + set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") - if (EXISTS ${ROCM_PATH}/hip) - set(HIP_PATH ${ROCM_PATH}/hip) + if (EXISTS ${HIP_PATH}) list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) - elseif (EXISTS ${ROCM_PATH}/lib/cmake/hip) - set(HIP_PATH ${ROCM_PATH}) - list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/lib/cmake/hip) - else () - message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but could not find the ROCm installation under ${ROCM_PATH}") - endif() - find_package(HIP REQUIRED) - if (HIP_FOUND) - execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) + find_package(HIP REQUIRED) + if (HIP_FOUND) + execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) - if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd")) - include_directories(${CMAKE_CURRENT_BINARY_DIR}) - include_directories(${HIP_PATH}/include) + if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd")) + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${HIP_PATH}/include) - set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - # - # complex datatype is not yet supported by HIP - # so leaving out those tests for now - # - # ei_add_test(cxx11_tensor_complex_gpu) - # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu) - # - ei_add_test(cxx11_tensor_reduction_gpu) - ei_add_test(cxx11_tensor_argmax_gpu) - ei_add_test(cxx11_tensor_cast_float16_gpu) - ei_add_test(cxx11_tensor_scan_gpu) - ei_add_test(cxx11_tensor_device) + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + # + # complex datatype is not yet supported by HIP + # so leaving out those tests for now + # + # ei_add_test(cxx11_tensor_complex_gpu) + # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu) + # + ei_add_test(cxx11_tensor_reduction_gpu) + ei_add_test(cxx11_tensor_argmax_gpu) + ei_add_test(cxx11_tensor_cast_float16_gpu) + ei_add_test(cxx11_tensor_scan_gpu) + ei_add_test(cxx11_tensor_device) - ei_add_test(cxx11_tensor_gpu) - ei_add_test(cxx11_tensor_contract_gpu) - ei_add_test(cxx11_tensor_of_float16_gpu) - ei_add_test(cxx11_tensor_of_bfloat16_gpu) - ei_add_test(cxx11_tensor_random_gpu) + ei_add_test(cxx11_tensor_gpu) + ei_add_test(cxx11_tensor_contract_gpu) + ei_add_test(cxx11_tensor_of_float16_gpu) + ei_add_test(cxx11_tensor_of_bfloat16_gpu) + ei_add_test(cxx11_tensor_random_gpu) - unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) - elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia")) - message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") - else () - message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") + elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia")) + message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") + else () + message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") + endif() endif() + else () + message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") endif() + endif()