// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
// IWYU pragma: private
#include "../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
/* Optimized selfadjoint matrix * vector product:
 * This algorithm processes two columns at once, which both halves the
 * number of loads/stores of the result and reduces instruction
 * dependencies.
 */
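//
// The kernel computes res += alpha * A * rhs, where A is self-adjoint and
// only its triangular part selected by UpLo is referenced. At the
// expression level it is typically reached through something like
//   res.noalias() += alpha * mat.selfadjointView<Lower>() * vec;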
template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
int Version = Specialized>
struct selfadjoint_matrix_vector_product;
template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
int Version>
struct selfadjoint_matrix_vector_product
{
static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC void run(Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs,
Scalar* res, Scalar alpha);
};
template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
int Version>
EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC void
selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs, Version>::run(
Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Scalar* res, Scalar alpha) {
typedef typename packet_traits<Scalar>::type Packet;
typedef typename NumTraits<Scalar>::Real RealScalar;
const Index PacketSize = sizeof(Packet) / sizeof(Scalar);
enum {
IsRowMajor = StorageOrder == RowMajor ? 1 : 0,
IsLower = UpLo == Lower ? 1 : 0,
FirstTriangular = IsRowMajor == IsLower
};
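// FirstTriangular is set when, for the given storage order, the stored
// off-diagonal part of each column (or row, if row-major) precedes the
// diagonal entry, i.e. for column-major Upper and row-major Lower storage;
// all the loop bounds below are mirrored accordingly.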
conj_helper<Scalar, Scalar, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0;
conj_helper<Scalar, Scalar, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
conj_helper<RealScalar, Scalar, false, ConjugateRhs> cjd;
conj_helper<Packet, Packet, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0;
conj_helper<Packet, Packet, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, !IsRowMajor), ConjugateRhs>
pcj1;
Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
Index bound = numext::maxi(Index(0), size - 8) & 0xfffffffe;
if (FirstTriangular) bound = size - bound;
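// 'bound' splits the columns between the vectorized two-column loop below
// and the scalar tail loop at the end: it is rounded down to an even count
// and reserves at least 8 columns (those whose stored off-diagonal part is
// shortest) for the scalar loop, where vectorization would not pay off.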
for (Index j = FirstTriangular ? bound : 0; j < (FirstTriangular ? size : bound); j += 2) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
Scalar t0 = cjAlpha * rhs[j];
Packet ptmp0 = pset1<Packet>(t0);
Scalar t1 = cjAlpha * rhs[j + 1];
Packet ptmp1 = pset1<Packet>(t1);
Scalar t2(0);
Packet ptmp2 = pset1<Packet>(t2);
Scalar t3(0);
Packet ptmp3 = pset1<Packet>(t3);
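// t0/t1 hold cjAlpha*rhs[j] and cjAlpha*rhs[j+1] and scale the two stored
// columns; t2/t3 (with their packet counterparts ptmp2/ptmp3) accumulate
// the dot products contributed by the implicit (conjugated) transposed
// part of A.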
Index starti = FirstTriangular ? 0 : j + 2;
Index endi = FirstTriangular ? j : size;
Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi - starti);
Index alignedEnd = alignedStart + ((endi - alignedStart) / (PacketSize)) * (PacketSize);
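// [alignedStart, alignedEnd) is the largest sub-range of [starti, endi)
// that starts at a packet-aligned address of 'res' and spans a whole
// number of packets, so the vectorized loop can use aligned loads/stores
// on the result while lhs and rhs are accessed unaligned.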
res[j] += cjd.pmul(numext::real(A0[j]), t0);
res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
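// Only the real part of the stored diagonal entries is used: for a complex
// self-adjoint matrix the diagonal is real by definition. The single entry
// coupling columns j and j+1 is stored on one side of the diagonal only,
// so it is picked from whichever of the two columns holds it: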
if (FirstTriangular) {
res[j] += cj0.pmul(A1[j], t1);
t3 += cj1.pmul(A1[j], rhs[j]);
} else {
res[j + 1] += cj0.pmul(A0[j + 1], t0);
t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
}
for (Index i = starti; i < alignedStart; ++i) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
// Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed-up);
// gcc 4.2 performs this optimization automatically.
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
Packet A0i = ploadu<Packet>(a0It);
a0It += PacketSize;
Packet A1i = ploadu<Packet>(a1It);
a1It += PacketSize;
Packet Bi = ploadu<Packet>(rhsIt);
rhsIt += PacketSize; // FIXME should be aligned in most cases
Packet Xi = pload<Packet>(resIt);
Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
pstore(resIt, Xi);
resIt += PacketSize;
}
for (Index i = alignedEnd; i < endi; i++) {
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
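// Reduce the packet accumulators into the scalar ones and apply alpha to
// the contributions of the transposed part.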
res[j] += alpha * (t2 + predux(ptmp2));
res[j + 1] += alpha * (t3 + predux(ptmp3));
}
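// Process the remaining columns (those whose stored part is shortest) one
// at a time, without vectorization.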
for (Index j = FirstTriangular ? 0 : bound; j < (FirstTriangular ? bound : size); j++) {
const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
Scalar t1 = cjAlpha * rhs[j];
Scalar t2(0);
res[j] += cjd.pmul(numext::real(A0[j]), t1);
for (Index i = FirstTriangular ? 0 : j + 1; i < (FirstTriangular ? j : size); i++) {
res[i] += cj0.pmul(A0[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
}
res[j] += alpha * t2;
}
}
} // end namespace internal
/***************************************************************************
* Wrapper to product_selfadjoint_vector
***************************************************************************/
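// This wrapper extracts the scalar factors and conjugation flags from the
// wrapped expressions, stages the destination and the rhs into unit-stride
// aligned buffers when needed, and then dispatches to the kernel above.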
namespace internal {
template <typename Lhs, int LhsMode, typename Rhs>
struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
typedef typename Product<Lhs, Rhs>::Scalar Scalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef internal::remove_all_t<ActualLhsType> ActualLhsTypeCleaned;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;
enum { LhsUpLo = LhsMode & (Upper | Lower) };
template <typename Dest>
static EIGEN_DEVICE_FUNC void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
typedef typename Dest::Scalar ResScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef Map<Matrix<ResScalar, Dynamic, 1>, plain_enum_min(AlignedMax, internal::packet_traits<ResScalar>::size)>
MappedDest;
eigen_assert(dest.rows() == a_lhs.rows() && dest.cols() == a_rhs.cols());
add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs);
enum {
EvalToDest = (Dest::InnerStrideAtCompileTime == 1),
UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime == 1)
};
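// The kernel requires unit inner stride on both the result and the rhs;
// whenever that does not hold, the data is staged through a temporary
// buffer declared below (a fixed-size one on the stack when the size is
// known at compile time, see gemv_static_vector_if).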
internal::gemv_static_vector_if<ResScalar, Dest::SizeAtCompileTime, Dest::MaxSizeAtCompileTime, !EvalToDest>
static_dest;
internal::gemv_static_vector_if<RhsScalar, ActualRhsTypeCleaned::SizeAtCompileTime,
ActualRhsTypeCleaned::MaxSizeAtCompileTime, !UseRhs>
static_rhs;
ei_declare_aligned_stack_constructed_variable(ResScalar, actualDestPtr, dest.size(),
EvalToDest ? dest.data() : static_dest.data());
ei_declare_aligned_stack_constructed_variable(RhsScalar, actualRhsPtr, rhs.size(),
UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
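// When the pointers above alias dest/rhs directly (third macro argument
// non-null) no allocation takes place; otherwise the storage comes from
// the stack for small sizes and from the heap beyond that limit.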
if (!EvalToDest) {
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
Index size = dest.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
MappedDest(actualDestPtr, dest.size()) = dest;
}
if (!UseRhs) {
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
Index size = rhs.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
}
internal::selfadjoint_matrix_vector_product<
Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags & RowMajorBit) ? RowMajor : ColMajor,
int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate),
bool(RhsBlasTraits::NeedToConjugate)>::run(lhs.rows(), // size
&lhs.coeffRef(0, 0), lhs.outerStride(), // lhs info
actualRhsPtr, // rhs info
actualDestPtr, // result info
actualAlpha // scale factor
);
if (!EvalToDest) dest = MappedDest(actualDestPtr, dest.size());
}
};
template <typename Lhs, typename Rhs, int RhsMode>
struct selfadjoint_product_impl<Lhs, 0, true, Rhs, RhsMode, false> {
typedef typename Product<Lhs, Rhs>::Scalar Scalar;
enum { RhsUpLo = RhsMode & (Upper | Lower) };
template <typename Dest>
static void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
// let's simply transpose the product
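// destT = a_rhs^T * a_lhs^T, and transposing a self-adjoint view swaps
// which triangle is stored, hence the Upper/Lower flip below.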
Transpose<Dest> destT(dest);
selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo) == Upper ? Lower : Upper, false, Transpose<const Lhs>,
0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H