// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
// IWYU pragma: private
#include "../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
/* Optimized selfadjoint matrix * vector product:
 * This algorithm processes two columns at once, which both halves the
 * number of loads/stores of the result and reduces instruction
 * dependencies.
 */
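//
// The kernel computes res += alpha * A * rhs, where A is self-adjoint and
// only its triangular part selected by UpLo is referenced. At the
// expression level it is typically reached through something like
//   res.noalias() += alpha * mat.selfadjointView<Lower>() * vec;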
template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
int Version = Specialized>
struct selfadjoint_matrix_vector_product;
template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
int Version>
struct selfadjoint_matrix_vector_product
{
static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC void run(Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs,
Scalar* res, Scalar alpha);
};
template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
int Version>
EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC void
selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs, Version>::run(
Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Scalar* res, Scalar alpha) {
typedef typename packet_traits<Scalar>::type Packet;
typedef typename NumTraits<Scalar>::Real RealScalar;
const Index PacketSize = sizeof(Packet) / sizeof(Scalar);
enum {
IsRowMajor = StorageOrder == RowMajor ? 1 : 0,
IsLower = UpLo == Lower ? 1 : 0,
FirstTriangular = IsRowMajor == IsLower
};
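// FirstTriangular is set when, for the given storage order, the stored
// off-diagonal part of each column (or row, if row-major) precedes the
// diagonal entry, i.e. for column-major Upper and row-major Lower storage;
// all the loop bounds below are mirrored accordingly.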
conj_helper<Scalar, Scalar, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0;
conj_helper<Scalar, Scalar, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
conj_helper<RealScalar, Scalar, false, ConjugateRhs> cjd;
conj_helper<Packet, Packet, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0;
conj_helper<Packet, Packet, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, !IsRowMajor), ConjugateRhs>
pcj1;
Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
Index bound = numext::maxi(Index(0), size - 8) & 0xfffffffe;
if (FirstTriangular) bound = size - bound;
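// 'bound' splits the columns between the vectorized two-column loop below
// and the scalar tail loop at the end: it is rounded down to an even count
// and reserves at least 8 columns (those whose stored off-diagonal part is
// shortest) for the scalar loop, where vectorization would not pay off.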
for (Index j = FirstTriangular ? bound : 0; j < (FirstTriangular ? size : bound); j += 2) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
Scalar t0 = cjAlpha * rhs[j];
Packet ptmp0 = pset1<Packet>(t0);
Scalar t1 = cjAlpha * rhs[j + 1];
Packet ptmp1 = pset1<Packet>(t1);
Scalar t2(0);
Packet ptmp2 = pset1<Packet>(t2);
Scalar t3(0);
Packet ptmp3 = pset1<Packet>(t3);
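// t0/t1 hold cjAlpha*rhs[j] and cjAlpha*rhs[j+1] and scale the two stored
// columns; t2/t3 (with their packet counterparts ptmp2/ptmp3) accumulate
// the dot products contributed by the implicit (conjugated) transposed
// part of A.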
Index starti = FirstTriangular ? 0 : j + 2;
Index endi = FirstTriangular ? j : size;
Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi - starti);
Index alignedEnd = alignedStart + ((endi - alignedStart) / (PacketSize)) * (PacketSize);
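// [alignedStart, alignedEnd) is the largest sub-range of [starti, endi)
// that starts at a packet-aligned address of 'res' and spans a whole
// number of packets, so the vectorized loop can use aligned loads/stores
// on the result while lhs and rhs are accessed unaligned.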
res[j] += cjd.pmul(numext::real(A0[j]), t0);
res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
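// Only the real part of the stored diagonal entries is used: for a complex
// self-adjoint matrix the diagonal is real by definition. The single entry
// coupling columns j and j+1 is stored on one side of the diagonal only,
// so it is picked from whichever of the two columns holds it: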
if (FirstTriangular) {
res[j] += cj0.pmul(A1[j], t1);
t3 += cj1.pmul(A1[j], rhs[j]);
} else {
res[j + 1] += cj0.pmul(A0[j + 1], t0);
t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
}
for (Index i = starti; i < alignedStart; ++i) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
// Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed-up);
// gcc 4.2 performs this optimization automatically.
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
Packet A0i = ploadu<Packet>(a0It);
a0It += PacketSize;
Packet A1i = ploadu<Packet>(a1It);
a1It += PacketSize;
Packet Bi = ploadu<Packet>(rhsIt);
rhsIt += PacketSize; // FIXME should be aligned in most cases
Packet Xi = pload<Packet>(resIt);
Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
pstore(resIt, Xi);
resIt += PacketSize;
}
for (Index i = alignedEnd; i < endi; i++) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
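// Reduce the packet accumulators into the scalar ones and apply alpha to
// the contributions of the transposed part.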
res[j] += alpha * (t2 + predux(ptmp2));
res[j + 1] += alpha * (t3 + predux(ptmp3));
}
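// Process the remaining columns (those whose stored part is shortest) one
// at a time, without vectorization.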
for (Index j = FirstTriangular ? 0 : bound; j < (FirstTriangular ? bound : size); j++) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
Scalar t1 = cjAlpha * rhs[j];
Scalar t2(0);
res[j] += cjd.pmul(numext::real(A0[j]), t1);
for (Index i = FirstTriangular ? 0 : j + 1; i < (FirstTriangular ? j : size); i++) {
res[i] += cj0.pmul(A0[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
}
res[j] += alpha * t2;
}
}
} // end namespace internal
/***************************************************************************
* Wrapper to product_selfadjoint_vector
***************************************************************************/
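// This wrapper extracts the scalar factors and conjugation flags from the
// wrapped expressions, stages the destination and the rhs into unit-stride
// aligned buffers when needed, and then dispatches to the kernel above.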
namespace internal {
template <typename Lhs, int LhsMode, typename Rhs>
struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
typedef typename Product<Lhs, Rhs>::Scalar Scalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef internal::remove_all_t<ActualLhsType> ActualLhsTypeCleaned;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;
enum { LhsUpLo = LhsMode & (Upper | Lower) };
template <typename Dest>
static EIGEN_DEVICE_FUNC void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
typedef typename Dest::Scalar ResScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef Map<Matrix<ResScalar, Dynamic, 1>, plain_enum_min(AlignedMax, internal::packet_traits<ResScalar>::size)>
MappedDest;
eigen_assert(dest.rows() == a_lhs.rows() && dest.cols() == a_rhs.cols());
add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs);
enum {
EvalToDest = (Dest::InnerStrideAtCompileTime == 1),
UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime == 1)
};
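// The kernel requires unit inner stride on both the result and the rhs;
// whenever that does not hold, the data is staged through a temporary
// buffer declared below (a fixed-size one on the stack when the size is
// known at compile time, see gemv_static_vector_if).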
internal::gemv_static_vector_if<ResScalar, Dest::SizeAtCompileTime, Dest::MaxSizeAtCompileTime, !EvalToDest>
static_dest;
internal::gemv_static_vector_if<RhsScalar, ActualRhsTypeCleaned::SizeAtCompileTime,
ActualRhsTypeCleaned::MaxSizeAtCompileTime, !UseRhs>
static_rhs;
ei_declare_aligned_stack_constructed_variable(ResScalar, actualDestPtr, dest.size(),
EvalToDest ? dest.data() : static_dest.data());
ei_declare_aligned_stack_constructed_variable(RhsScalar, actualRhsPtr, rhs.size(),
UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
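// When the pointers above alias dest/rhs directly (third macro argument
// non-null) no allocation takes place; otherwise the storage comes from
// the stack for small sizes and from the heap beyond that limit.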
if (!EvalToDest) {
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
Index size = dest.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
MappedDest(actualDestPtr, dest.size()) = dest;
}
if (!UseRhs) {
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
Index size = rhs.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
}
internal::selfadjoint_matrix_vector_product<
Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags & RowMajorBit) ? RowMajor : ColMajor,
int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate),
bool(RhsBlasTraits::NeedToConjugate)>::run(lhs.rows(), // size
&lhs.coeffRef(0, 0), lhs.outerStride(), // lhs info
actualRhsPtr, // rhs info
actualDestPtr, // result info
actualAlpha // scale factor
);
if (!EvalToDest) dest = MappedDest(actualDestPtr, dest.size());
}
};
template <typename Lhs, typename Rhs, int RhsMode>
struct selfadjoint_product_impl<Lhs, 0, true, Rhs, RhsMode, false> {
typedef typename Product<Lhs, Rhs>::Scalar Scalar;
enum { RhsUpLo = RhsMode & (Upper | Lower) };
template <typename Dest>
static void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
// let's simply transpose the product
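// destT = a_rhs^T * a_lhs^T, and transposing a self-adjoint view swaps
// which triangle is stored, hence the Upper/Lower flip below.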
Transpose<Dest> destT(dest);
selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo) == Upper ? Lower : Upper, false, Transpose<const Lhs>,
0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H