| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| #ifndef EIGEN_PARTIALREDUX_H |
| #define EIGEN_PARTIALREDUX_H |
| |
| namespace Eigen { |
| |
| namespace internal { |
| |
| |
| /*************************************************************************** |
| * |
| * This file provides evaluators for partial reductions. |
| * There are two modes: |
| * |
| * - scalar path: simply calls the respective function on the column or row. |
| * -> nothing special here, all the tricky part is handled by the return |
| * types of VectorwiseOp's members. They embed the functor calling the |
| * respective DenseBase's member function. |
| * |
 *  - vectorized path: implements a packet-wise reduction followed by
| * some (optional) processing of the outcome, e.g., division by n for mean. |
| * |
| * For the vectorized path let's observe that the packet-size and outer-unrolling |
 * are both decided by the assignment logic. So all we have to do is to decide
| * on the inner unrolling. |
| * |
| * For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h, |
 * but we need to be careful to specify the correct increment.
| * |
| ***************************************************************************/ |
| |
| |
| /* logic deciding a strategy for unrolling of vectorized paths */ |
| template<typename Func, typename Evaluator> |
| struct packetwise_redux_traits |
| { |
| enum { |
| OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime, |
| Cost = OuterSize == Dynamic ? HugeCost |
| : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost, |
| Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling |
| }; |
| |
| }; |
| |
| /* Value to be returned when size==0 , by default let's return 0 */ |
| template<typename PacketType,typename Func> |
| EIGEN_DEVICE_FUNC |
| PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); } |
| |
| /* For products the default is 1 */ |
| template<typename PacketType,typename Scalar> |
| EIGEN_DEVICE_FUNC |
| PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); } |
| |
| /* Perform the actual reduction */ |
| template<typename Func, typename Evaluator, |
| int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling |
| > |
| struct packetwise_redux_impl; |
| |
| /* Perform the actual reduction with unrolling */ |
| template<typename Func, typename Evaluator> |
| struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling> |
| { |
| typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base; |
| typedef typename Evaluator::Scalar Scalar; |
| |
| template<typename PacketType> |
| EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE |
| PacketType run(const Evaluator &eval, const Func& func, Index /*size*/) |
| { |
| return redux_vec_unroller<Func, Evaluator, 0, packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,func); |
| } |
| }; |
| |
| /* Add a specialization of redux_vec_unroller for size==0 at compiletime. |
| * This specialization is not required for general reductions, which is |
| * why it is defined here. |
| */ |
| template<typename Func, typename Evaluator, int Start> |
| struct redux_vec_unroller<Func, Evaluator, Start, 0> |
| { |
| template<typename PacketType> |
| EIGEN_DEVICE_FUNC |
| static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f) |
| { |
| return packetwise_redux_empty_value<PacketType>(f); |
| } |
| }; |
| |
| /* Perform the actual reduction for dynamic sizes */ |
| template<typename Func, typename Evaluator> |
| struct packetwise_redux_impl<Func, Evaluator, NoUnrolling> |
| { |
| typedef typename Evaluator::Scalar Scalar; |
| typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar; |
| |
| template<typename PacketType> |
| EIGEN_DEVICE_FUNC |
| static PacketType run(const Evaluator &eval, const Func& func, Index size) |
| { |
| if(size==0) |
| return packetwise_redux_empty_value<PacketType>(func); |
| |
| const Index size4 = (size-1)&(~3); |
| PacketType p = eval.template packetByOuterInner<Unaligned,PacketType>(0,0); |
| Index i = 1; |
| // This loop is optimized for instruction pipelining: |
| // - each iteration generates two independent instructions |
| // - thanks to branch prediction and out-of-order execution we have independent instructions across loops |
| for(; i<size4; i+=4) |
| p = func.packetOp(p, |
| func.packetOp( |
| func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+0,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+1,0)), |
| func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+2,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+3,0)))); |
| for(; i<size; ++i) |
| p = func.packetOp(p, eval.template packetByOuterInner<Unaligned,PacketType>(i,0)); |
| return p; |
| } |
| }; |
| |
| template< typename ArgType, typename MemberOp, int Direction> |
| struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> > |
| : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> > |
| { |
| typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType; |
| typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested; |
| typedef typename internal::add_const_on_value_type<ArgTypeNested>::type ConstArgTypeNested; |
| typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned; |
| typedef typename ArgType::Scalar InputScalar; |
| typedef typename XprType::Scalar Scalar; |
| enum { |
| TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) |
| }; |
| typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType; |
| enum { |
| CoeffReadCost = TraversalSize==Dynamic ? HugeCost |
| : TraversalSize==0 ? 1 |
| : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value), |
| |
| _ArgFlags = evaluator<ArgType>::Flags, |
| |
| _Vectorizable = bool(int(_ArgFlags)&PacketAccessBit) |
| && bool(MemberOp::Vectorizable) |
| && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0) |
| && (TraversalSize!=0), |
| |
| Flags = (traits<XprType>::Flags&RowMajorBit) |
| | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) |
| | (_Vectorizable ? PacketAccessBit : 0) |
| | LinearAccessBit, |
| |
| Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized |
| }; |
| |
| EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) |
| : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) |
| { |
| EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value))); |
| EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); |
| } |
| |
| typedef typename XprType::CoeffReturnType CoeffReturnType; |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE |
| const Scalar coeff(Index i, Index j) const |
| { |
| return coeff(Direction==Vertical ? j : i); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE |
| const Scalar coeff(Index index) const |
| { |
| return m_functor(m_arg.template subVector<DirectionType(Direction)>(index)); |
| } |
| |
| template<int LoadMode,typename PacketType> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE |
| PacketType packet(Index i, Index j) const |
| { |
| return packet<LoadMode,PacketType>(Direction==Vertical ? j : i); |
| } |
| |
| template<int LoadMode,typename PacketType> |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC |
| PacketType packet(Index idx) const |
| { |
| enum { PacketSize = internal::unpacket_traits<PacketType>::size }; |
| typedef Block<const ArgTypeNestedCleaned, |
| Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize), |
| Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime), |
| true /* InnerPanel */> PanelType; |
| |
| PanelType panel(m_arg, |
| Direction==Vertical ? 0 : idx, |
| Direction==Vertical ? idx : 0, |
| Direction==Vertical ? m_arg.rows() : Index(PacketSize), |
| Direction==Vertical ? Index(PacketSize) : m_arg.cols()); |
| |
| // FIXME |
| // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of panel get reversed |
| // and methods like packetByOuterInner do not make sense anymore in this context. |
| // So let's just by pass "vectorization" in this case: |
| if(PacketSize==1) |
| return internal::pset1<PacketType>(coeff(idx)); |
| |
| typedef typename internal::redux_evaluator<PanelType> PanelEvaluator; |
| PanelEvaluator panel_eval(panel); |
| typedef typename MemberOp::BinaryOp BinaryOp; |
| PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize()); |
| return p; |
| } |
| |
| protected: |
| ConstArgTypeNested m_arg; |
| const MemberOp m_functor; |
| }; |
| |
| } // end namespace internal |
| |
| } // end namespace Eigen |
| |
| #endif // EIGEN_PARTIALREDUX_H |