// Eigen/src/Core/arch/AltiVec/MatrixProduct.h - eigen - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 #ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
 #define EIGEN_MATRIX_PRODUCT_ALTIVEC_H

 #ifdef __MMA__

 namespace Eigen {

 namespace internal {

 // Tile geometry shared by the packing routines and the GEBP kernel below.
 //   accRows x accCols : size of the result tile produced by one accumulator.
 //   accCount          : number of accumulators driven together in the widest
 //                       column loop of the kernel.
 //   floatVectorSize   : number of floats consumed per vector load.
 static const int accRows         = 4;
 static const int accCols         = 4;
 static const int accCount        = 4;
 static const int floatVectorSize = 4;

 // Four float vectors stored back to back. PacketQuad overlays this on the
 // value returned by __builtin_mma_disassemble_acc so that the individual
 // vectors of an accumulator can be addressed by name.
 struct Packet4fx4
 {
   __vector float v0;
   __vector float v1;
   __vector float v2;
   __vector float v3;
 };

 // Overlay used by storeAccumulator to read a disassembled accumulator as
 // four float vectors: the builtin's result is written to `sc` and then read
 // back through `sf`.
 // NOTE(review): reading a member other than the one last written relies on
 // the compiler's union type-punning behaviour - confirm this is acceptable
 // for all supported toolchains.
 union PacketQuad
 {
   // Raw value as returned by __builtin_mma_disassemble_acc.
   __struct_quad sc;
   // The same storage viewed as four named float vectors (v0..v3).
   Packet4fx4    sf;
 };

 // Packs a column-major float LHS block into blockA in the layout read by the
 // GEBP kernel of this file.
 //
 // Rows are grouped into panels of floatVectorSize rows. Inside a panel the
 // data is written depth-major: for every depth index the floatVectorSize row
 // values of that column are stored contiguously. Rows left over after the
 // last full panel are written the same way, with (rows % floatVectorSize)
 // scalars per depth index.
 //
 //   blockA  destination buffer; it is written with pstore, so presumably it
 //           has to satisfy that store's alignment requirement (TODO confirm).
 //   lhs     mapper providing loadPacket<Packet4f>(row, col) and operator()(row, col).
 //   depth   number of columns of the block to pack.
 //   rows    number of rows of the block to pack.
 //   stride, offset  accepted for interface compatibility; this implementation
 //           does not read them.
 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
 struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
 {
   void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };

 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
   // Counters use Index rather than int so that large blocks cannot overflow them.
   Index ri = 0; // next write position in blockA
   Index j  = 0; // first row of the current panel

   // Full panels. "<=" lets the last complete panel use the vector path as
   // well; the scalar tail below would produce the same layout, only slower.
   for(; j + floatVectorSize <= rows; j += floatVectorSize)
   {
     Index i = 0;

     // Four columns per iteration, with all loads issued before the stores.
     for(; i + floatVectorSize <= depth; i += floatVectorSize)
     {
       PacketBlock<Packet4f, 4> block;
       block.packet[0] = lhs.template loadPacket<Packet4f>(j, i + 0);
       block.packet[1] = lhs.template loadPacket<Packet4f>(j, i + 1);
       block.packet[2] = lhs.template loadPacket<Packet4f>(j, i + 2);
       block.packet[3] = lhs.template loadPacket<Packet4f>(j, i + 3);

       pstore<float>(blockA + ri     , block.packet[0]);
       pstore<float>(blockA + ri +  4, block.packet[1]);
       pstore<float>(blockA + ri +  8, block.packet[2]);
       pstore<float>(blockA + ri + 12, block.packet[3]);
       ri += 4*floatVectorSize;
     }

     // Remaining columns of this panel, one packet at a time.
     for(; i < depth; i++)
     {
       pstore<float>(blockA + ri, lhs.template loadPacket<Packet4f>(j, i));
       ri += floatVectorSize;
     }
   }

   // Rows that do not fill a panel: (rows - j) scalars per depth index.
   for(Index i = 0; i < depth; i++)
   {
     for(Index k = j; k < rows; k++)
     {
       blockA[ri] = lhs(k, i);
       ri += 1;
     }
   }
 }

 // Packs a column-major float RHS block into blockB in the layout read by the
 // GEBP kernel of this file.
 //
 // Columns are grouped into panels of floatVectorSize columns. Inside a panel
 // the data is written depth-major: for every depth index the values of the
 // floatVectorSize columns of that row are stored contiguously. Columns left
 // over after the last full panel are written the same way, with
 // (cols % floatVectorSize) scalars per depth index.
 //
 //   blockB  destination buffer; it is written with pstore, so presumably it
 //           has to satisfy that store's alignment requirement (TODO confirm).
 //   rhs     mapper providing loadPacket<Packet4f>(row, col) and operator()(row, col).
 //   depth   number of rows of the block to pack.
 //   cols    number of columns of the block to pack.
 //   stride, offset  accepted for interface compatibility; this implementation
 //           does not read them.
 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
 struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
 {
   void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };

 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
 void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
   // Counters use Index rather than int so that large blocks cannot overflow them.
   Index ri = 0; // next write position in blockB
   Index j  = 0; // first column of the current panel

   // Full panels. "<=" lets the last complete panel (and, below, the last
   // complete group of four depth indices) use the vector path; the scalar
   // fallbacks would produce the same layout, only slower.
   for(; j + floatVectorSize <= cols; j += floatVectorSize)
   {
     Index i = 0;

     // Load a 4x4 tile as four column packets and transpose it so that each
     // stored packet holds one depth index across the four columns.
     for(; i + floatVectorSize <= depth; i += floatVectorSize)
     {
       PacketBlock<Packet4f, 4> block;
       block.packet[0] = rhs.template loadPacket<Packet4f>(i, j + 0);
       block.packet[1] = rhs.template loadPacket<Packet4f>(i, j + 1);
       block.packet[2] = rhs.template loadPacket<Packet4f>(i, j + 2);
       block.packet[3] = rhs.template loadPacket<Packet4f>(i, j + 3);

       ptranspose(block);

       pstore<float>(blockB + ri     , block.packet[0]);
       pstore<float>(blockB + ri +  4, block.packet[1]);
       pstore<float>(blockB + ri +  8, block.packet[2]);
       pstore<float>(blockB + ri + 12, block.packet[3]);

       ri += 4*floatVectorSize;
     }

     // Depth indices that do not fill a 4x4 tile: gather the four columns.
     for(; i < depth; i++)
     {
       blockB[ri+0] = rhs(i, j+0);
       blockB[ri+1] = rhs(i, j+1);
       blockB[ri+2] = rhs(i, j+2);
       blockB[ri+3] = rhs(i, j+3);
       ri += floatVectorSize;
     }
   }

   // Columns that do not fill a panel: (cols - j) scalars per depth index.
   for(Index i = 0; i < depth; i++)
   {
     for(Index k = j; k < cols; k++)
     {
       blockB[ri] = rhs(i, k);
       ri += 1;
     }
   }
 }

 // Scales one MMA accumulator by alpha and adds it to the four-packet block of
 // `data` anchored at (i, j).
 //
 // The destination is read first and the scaled accumulator is added to it,
 // so existing contents of `data` are preserved; this matches the scalar
 // remainder paths of the kernel, which update the result with "+=".
 //
 //   i, j   row / column of the first element of the destination block.
 //   data   mapper providing loadPacket<Packet4f>(row, col) and
 //          storePacketBlock<Packet4f, 4>(row, col, block).
 //   alpha  scalar factor applied to the accumulator contents.
 //   acc    accumulator to disassemble; passed by pointer, read through *acc.
 template<typename DataMapper, typename Index, typename Scalar>
 EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, Scalar alpha, __vector_quad *acc)
 {
   // TODO: switch to the out-parameter form of the builtin:
   //   Packet4fx4 r;
   //   __builtin_mma_disassemble_acc((void *)&r, *acc);
   PacketQuad result;
   result.sc = __builtin_mma_disassemble_acc(*acc);

   const Packet4f pAlpha = pset1<Packet4f>(alpha);

   // The accumulator vectors are consumed in reverse order (v3 first).
   // NOTE(review): this assumes storePacketBlock writes packet[c] to column
   // j + c, so that loadPacket(i, j + c) reads the values packet[c] replaces.
   PacketBlock<Packet4f, 4> block;
   block.packet[0] = data.template loadPacket<Packet4f>(i, j + 0) + pAlpha*result.sf.v3;
   block.packet[1] = data.template loadPacket<Packet4f>(i, j + 1) + pAlpha*result.sf.v2;
   block.packet[2] = data.template loadPacket<Packet4f>(i, j + 2) + pAlpha*result.sf.v1;
   block.packet[3] = data.template loadPacket<Packet4f>(i, j + 3) + pAlpha*result.sf.v0;

   data.template storePacketBlock<Packet4f, 4>(i, j, block);
 }

 // Scalar fallback for result tiles that do not fill a whole accumulator.
 // Adds alpha * lhs * rhs to the tileRows x tileCols block of res at (row, col).
 // lhs_ptr / rhs_ptr point at the packed values for depth index 0 and are
 // advanced by lhsStep / rhsStep floats per depth index.
 template<typename Index, typename DataMapper>
 void gebpScalarTail(const DataMapper& res, const float* lhs_ptr, const float* rhs_ptr,
                     Index row, Index col, Index tileRows, Index tileCols,
                     Index lhsStep, Index rhsStep, Index depth, float alpha)
 {
   for(Index k = 0; k < depth; k++)
   {
     for(Index r = 0; r < tileRows; r++)
     {
       // alpha is applied here so that partial tiles are scaled exactly like
       // the tiles that go through storeAccumulator.
       const float scaledLhs = alpha*lhs_ptr[r];
       for(Index c = 0; c < tileCols; c++)
       {
         res(row + r, col + c) += scaledLhs*rhs_ptr[c];
       }
     }
     lhs_ptr += lhsStep;
     rhs_ptr += rhsStep;
   }
 }

 // GEBP kernel for float operands built on the MMA accumulator builtins.
 //
 // blockA and blockB are read in the panel layout produced by the
 // gemm_pack_lhs / gemm_pack_rhs specialisations of this file. Full
 // accRows x accCols tiles are computed with __builtin_mma_xvf32gerpp and
 // handed to storeAccumulator, which applies alpha and commits them to res.
 // Tiles on the bottom / right border that are smaller than that are
 // accumulated into res by a scalar loop, also scaled by alpha.
 //
 //   res            result mapper (operator()(row, col) and the interface
 //                  used by storeAccumulator).
 //   blockA/blockB  packed LHS / RHS panels.
 //   rows, depth, cols  dimensions of the product.
 //   alpha          scalar factor applied to the product.
 //   strideA, strideB, offsetA, offsetB  accepted for interface
 //                  compatibility; this implementation does not read them.
 template<typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel<float, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 {
   void operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB,
                   Index rows, Index depth, Index cols, float alpha,
                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };

 template<typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 void gebp_kernel<float, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
   ::operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB,
                Index rows, Index depth, Index cols, float alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
   const Index remaining_rows = rows % accRows;
   const Index remaining_cols = cols % accCols;
   // Number of floats occupied by one full packed panel.
   const Index panelSize = depth*floatVectorSize;

   const float *rhs_base = blockB;

   Index row = 0;
   for(; row + accRows <= rows; row += accRows)
   {
     const float *lhs_base = blockA + (row/accRows)*panelSize;

     Index col = 0;

     // Widest path: accCount column panels share every LHS vector load.
     for(; col + accCount*accCols <= cols; col += accCount*accCols)
     {
       const float *lhs_ptr  = lhs_base;
       const float *rhs_ptr  = rhs_base + ((col/accCols) + 0)*panelSize;
       const float *rhs_ptr2 = rhs_base + ((col/accCols) + 1)*panelSize;
       const float *rhs_ptr3 = rhs_base + ((col/accCols) + 2)*panelSize;
       const float *rhs_ptr4 = rhs_base + ((col/accCols) + 3)*panelSize;

       __vector_quad acc, acc2, acc3, acc4;
       __builtin_mma_xxsetaccz(&acc);
       __builtin_mma_xxsetaccz(&acc2);
       __builtin_mma_xxsetaccz(&acc3);
       __builtin_mma_xxsetaccz(&acc4);

       for(Index k = 0; k < depth; k++)
       {
         const __vector float lhsV  = *reinterpret_cast<const __vector float *>(lhs_ptr );
         const __vector float rhsV  = *reinterpret_cast<const __vector float *>(rhs_ptr );
         const __vector float rhs2V = *reinterpret_cast<const __vector float *>(rhs_ptr2);
         const __vector float rhs3V = *reinterpret_cast<const __vector float *>(rhs_ptr3);
         const __vector float rhs4V = *reinterpret_cast<const __vector float *>(rhs_ptr4);

         __builtin_mma_xvf32gerpp(&acc , (__vector unsigned char) rhsV , (__vector unsigned char) lhsV);
         __builtin_mma_xvf32gerpp(&acc2, (__vector unsigned char) rhs2V, (__vector unsigned char) lhsV);
         __builtin_mma_xvf32gerpp(&acc3, (__vector unsigned char) rhs3V, (__vector unsigned char) lhsV);
         __builtin_mma_xvf32gerpp(&acc4, (__vector unsigned char) rhs4V, (__vector unsigned char) lhsV);

         lhs_ptr  += floatVectorSize;
         rhs_ptr  += floatVectorSize;
         rhs_ptr2 += floatVectorSize;
         rhs_ptr3 += floatVectorSize;
         rhs_ptr4 += floatVectorSize;
       }

       storeAccumulator<DataMapper, Index, float>(row, col            , res, alpha, &acc );
       storeAccumulator<DataMapper, Index, float>(row, col + 1*accCols, res, alpha, &acc2);
       storeAccumulator<DataMapper, Index, float>(row, col + 2*accCols, res, alpha, &acc3);
       storeAccumulator<DataMapper, Index, float>(row, col + 3*accCols, res, alpha, &acc4);
     }

     // Remaining full column panels, one accumulator at a time.
     for(; col + accCols <= cols; col += accCols)
     {
       const float *lhs_ptr = lhs_base;
       const float *rhs_ptr = rhs_base + (col/accCols)*panelSize;

       __vector_quad acc;
       __builtin_mma_xxsetaccz(&acc);
       for(Index k = 0; k < depth; k++)
       {
         const __vector float lhsV = *reinterpret_cast<const __vector float *>(lhs_ptr);
         const __vector float rhsV = *reinterpret_cast<const __vector float *>(rhs_ptr);

         __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char) rhsV, (__vector unsigned char) lhsV);

         lhs_ptr += floatVectorSize;
         rhs_ptr += floatVectorSize;
       }

       storeAccumulator<DataMapper, Index, float>(row, col, res, alpha, &acc);
     }

     // Right border: full row panel against the partial column panel.
     if(remaining_cols > 0)
     {
       gebpScalarTail<Index, DataMapper>(res, lhs_base, rhs_base + (col/accCols)*panelSize,
                                         row, col, accRows, remaining_cols,
                                         floatVectorSize, remaining_cols, depth, alpha);
     }
   }

   // Bottom border: the partial row panel against every column panel.
   if(remaining_rows > 0)
   {
     const float *lhs_base = blockA + (row/accRows)*panelSize;

     Index col = 0;
     for(; col + accCols <= cols; col += accCols)
     {
       gebpScalarTail<Index, DataMapper>(res, lhs_base, rhs_base + (col/accCols)*panelSize,
                                         row, col, remaining_rows, accCols,
                                         remaining_rows, floatVectorSize, depth, alpha);
     }

     // Bottom-right corner: partial rows against partial columns.
     if(remaining_cols > 0)
     {
       gebpScalarTail<Index, DataMapper>(res, lhs_base, rhs_base + (col/accCols)*panelSize,
                                         row, col, remaining_rows, remaining_cols,
                                         remaining_rows, remaining_cols, depth, alpha);
     }
   }
 }

 } // end namespace internal

 } // end namespace Eigen

 #endif // __MMA__
 #endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

	#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
	#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H

	#ifdef __MMA__

	namespace Eigen {

	namespace internal {

	// Tile geometry shared by the packing routines and the GEBP kernel below.
	//   accRows x accCols : size of the result tile produced by one accumulator.
	//   accCount          : number of accumulators driven together in the widest
	//                       column loop of the kernel.
	//   floatVectorSize   : number of floats consumed per vector load.
	static const int accRows         = 4;
	static const int accCols         = 4;
	static const int accCount        = 4;
	static const int floatVectorSize = 4;

	// Four float vectors stored back to back. PacketQuad overlays this on the
	// value returned by __builtin_mma_disassemble_acc so that the individual
	// vectors of an accumulator can be addressed by name.
	struct Packet4fx4
	{
	  __vector float v0;
	  __vector float v1;
	  __vector float v2;
	  __vector float v3;
	};

	// Overlay used by storeAccumulator to read a disassembled accumulator as
	// four float vectors: the builtin's result is written to `sc` and then read
	// back through `sf`.
	// NOTE(review): reading a member other than the one last written relies on
	// the compiler's union type-punning behaviour - confirm this is acceptable
	// for all supported toolchains.
	union PacketQuad
	{
	// Raw value as returned by __builtin_mma_disassemble_acc.
	__struct_quad sc;
	// The same storage viewed as four named float vectors (v0..v3).
	Packet4fx4 sf;
	};

	// Packs a column-major float LHS block into blockA in the layout read by the
	// GEBP kernel of this file.
	//
	// Rows are grouped into panels of floatVectorSize rows. Inside a panel the
	// data is written depth-major: for every depth index the floatVectorSize row
	// values of that column are stored contiguously. Rows left over after the
	// last full panel are written the same way, with (rows % floatVectorSize)
	// scalars per depth index.
	//
	//   blockA  destination buffer; it is written with pstore, so presumably it
	//           has to satisfy that store's alignment requirement (TODO confirm).
	//   lhs     mapper providing loadPacket<Packet4f>(row, col) and operator()(row, col).
	//   depth   number of columns of the block to pack.
	//   rows    number of rows of the block to pack.
	//   stride, offset  accepted for interface compatibility; this implementation
	//           does not read them.
	template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
	struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
	{
	  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
	};

	template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
	void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
	  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
	{
	  // Counters use Index rather than int so that large blocks cannot overflow them.
	  Index ri = 0; // next write position in blockA
	  Index j  = 0; // first row of the current panel

	  // Full panels. "<=" lets the last complete panel use the vector path as
	  // well; the scalar tail below would produce the same layout, only slower.
	  for(; j + floatVectorSize <= rows; j += floatVectorSize)
	  {
	    Index i = 0;

	    // Four columns per iteration, with all loads issued before the stores.
	    for(; i + floatVectorSize <= depth; i += floatVectorSize)
	    {
	      PacketBlock<Packet4f, 4> block;
	      block.packet[0] = lhs.template loadPacket<Packet4f>(j, i + 0);
	      block.packet[1] = lhs.template loadPacket<Packet4f>(j, i + 1);
	      block.packet[2] = lhs.template loadPacket<Packet4f>(j, i + 2);
	      block.packet[3] = lhs.template loadPacket<Packet4f>(j, i + 3);

	      pstore<float>(blockA + ri     , block.packet[0]);
	      pstore<float>(blockA + ri +  4, block.packet[1]);
	      pstore<float>(blockA + ri +  8, block.packet[2]);
	      pstore<float>(blockA + ri + 12, block.packet[3]);
	      ri += 4*floatVectorSize;
	    }

	    // Remaining columns of this panel, one packet at a time.
	    for(; i < depth; i++)
	    {
	      pstore<float>(blockA + ri, lhs.template loadPacket<Packet4f>(j, i));
	      ri += floatVectorSize;
	    }
	  }

	  // Rows that do not fill a panel: (rows - j) scalars per depth index.
	  for(Index i = 0; i < depth; i++)
	  {
	    for(Index k = j; k < rows; k++)
	    {
	      blockA[ri] = lhs(k, i);
	      ri += 1;
	    }
	  }
	}

	// Packs a column-major float RHS block into blockB in the layout read by the
	// GEBP kernel of this file.
	//
	// Columns are grouped into panels of floatVectorSize columns. Inside a panel
	// the data is written depth-major: for every depth index the values of the
	// floatVectorSize columns of that row are stored contiguously. Columns left
	// over after the last full panel are written the same way, with
	// (cols % floatVectorSize) scalars per depth index.
	//
	//   blockB  destination buffer; it is written with pstore, so presumably it
	//           has to satisfy that store's alignment requirement (TODO confirm).
	//   rhs     mapper providing loadPacket<Packet4f>(row, col) and operator()(row, col).
	//   depth   number of rows of the block to pack.
	//   cols    number of columns of the block to pack.
	//   stride, offset  accepted for interface compatibility; this implementation
	//           does not read them.
	template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
	struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
	{
	  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
	};

	template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
	void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
	  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
	{
	  // Counters use Index rather than int so that large blocks cannot overflow them.
	  Index ri = 0; // next write position in blockB
	  Index j  = 0; // first column of the current panel

	  // Full panels. "<=" lets the last complete panel (and, below, the last
	  // complete group of four depth indices) use the vector path; the scalar
	  // fallbacks would produce the same layout, only slower.
	  for(; j + floatVectorSize <= cols; j += floatVectorSize)
	  {
	    Index i = 0;

	    // Load a 4x4 tile as four column packets and transpose it so that each
	    // stored packet holds one depth index across the four columns.
	    for(; i + floatVectorSize <= depth; i += floatVectorSize)
	    {
	      PacketBlock<Packet4f, 4> block;
	      block.packet[0] = rhs.template loadPacket<Packet4f>(i, j + 0);
	      block.packet[1] = rhs.template loadPacket<Packet4f>(i, j + 1);
	      block.packet[2] = rhs.template loadPacket<Packet4f>(i, j + 2);
	      block.packet[3] = rhs.template loadPacket<Packet4f>(i, j + 3);

	      ptranspose(block);

	      pstore<float>(blockB + ri     , block.packet[0]);
	      pstore<float>(blockB + ri +  4, block.packet[1]);
	      pstore<float>(blockB + ri +  8, block.packet[2]);
	      pstore<float>(blockB + ri + 12, block.packet[3]);

	      ri += 4*floatVectorSize;
	    }

	    // Depth indices that do not fill a 4x4 tile: gather the four columns.
	    for(; i < depth; i++)
	    {
	      blockB[ri+0] = rhs(i, j+0);
	      blockB[ri+1] = rhs(i, j+1);
	      blockB[ri+2] = rhs(i, j+2);
	      blockB[ri+3] = rhs(i, j+3);
	      ri += floatVectorSize;
	    }
	  }

	  // Columns that do not fill a panel: (cols - j) scalars per depth index.
	  for(Index i = 0; i < depth; i++)
	  {
	    for(Index k = j; k < cols; k++)
	    {
	      blockB[ri] = rhs(i, k);
	      ri += 1;
	    }
	  }
	}

	// Scales one MMA accumulator by alpha and adds it to the four-packet block of
	// `data` anchored at (i, j).
	//
	// The destination is read first and the scaled accumulator is added to it,
	// so existing contents of `data` are preserved; this matches the scalar
	// remainder paths of the kernel, which update the result with "+=".
	//
	//   i, j   row / column of the first element of the destination block.
	//   data   mapper providing loadPacket<Packet4f>(row, col) and
	//          storePacketBlock<Packet4f, 4>(row, col, block).
	//   alpha  scalar factor applied to the accumulator contents.
	//   acc    accumulator to disassemble; passed by pointer, read through *acc.
	template<typename DataMapper, typename Index, typename Scalar>
	EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, Scalar alpha, __vector_quad *acc)
	{
	  // TODO: switch to the out-parameter form of the builtin:
	  //   Packet4fx4 r;
	  //   __builtin_mma_disassemble_acc((void *)&r, *acc);
	  PacketQuad result;
	  result.sc = __builtin_mma_disassemble_acc(*acc);

	  const Packet4f pAlpha = pset1<Packet4f>(alpha);

	  // The accumulator vectors are consumed in reverse order (v3 first).
	  // NOTE(review): this assumes storePacketBlock writes packet[c] to column
	  // j + c, so that loadPacket(i, j + c) reads the values packet[c] replaces.
	  PacketBlock<Packet4f, 4> block;
	  block.packet[0] = data.template loadPacket<Packet4f>(i, j + 0) + pAlpha*result.sf.v3;
	  block.packet[1] = data.template loadPacket<Packet4f>(i, j + 1) + pAlpha*result.sf.v2;
	  block.packet[2] = data.template loadPacket<Packet4f>(i, j + 2) + pAlpha*result.sf.v1;
	  block.packet[3] = data.template loadPacket<Packet4f>(i, j + 3) + pAlpha*result.sf.v0;

	  data.template storePacketBlock<Packet4f, 4>(i, j, block);
	}

	// Scalar fallback for result tiles that do not fill a whole accumulator.
	// Adds alpha * lhs * rhs to the tileRows x tileCols block of res at (row, col).
	// lhs_ptr / rhs_ptr point at the packed values for depth index 0 and are
	// advanced by lhsStep / rhsStep floats per depth index.
	template<typename Index, typename DataMapper>
	void gebpScalarTail(const DataMapper& res, const float* lhs_ptr, const float* rhs_ptr,
	                    Index row, Index col, Index tileRows, Index tileCols,
	                    Index lhsStep, Index rhsStep, Index depth, float alpha)
	{
	  for(Index k = 0; k < depth; k++)
	  {
	    for(Index r = 0; r < tileRows; r++)
	    {
	      // alpha is applied here so that partial tiles are scaled exactly like
	      // the tiles that go through storeAccumulator.
	      const float scaledLhs = alpha*lhs_ptr[r];
	      for(Index c = 0; c < tileCols; c++)
	      {
	        res(row + r, col + c) += scaledLhs*rhs_ptr[c];
	      }
	    }
	    lhs_ptr += lhsStep;
	    rhs_ptr += rhsStep;
	  }
	}

	// GEBP kernel for float operands built on the MMA accumulator builtins.
	//
	// blockA and blockB are read in the panel layout produced by the
	// gemm_pack_lhs / gemm_pack_rhs specialisations of this file. Full
	// accRows x accCols tiles are computed with __builtin_mma_xvf32gerpp and
	// handed to storeAccumulator, which applies alpha and commits them to res.
	// Tiles on the bottom / right border that are smaller than that are
	// accumulated into res by a scalar loop, also scaled by alpha.
	//
	//   res            result mapper (operator()(row, col) and the interface
	//                  used by storeAccumulator).
	//   blockA/blockB  packed LHS / RHS panels.
	//   rows, depth, cols  dimensions of the product.
	//   alpha          scalar factor applied to the product.
	//   strideA, strideB, offsetA, offsetB  accepted for interface
	//                  compatibility; this implementation does not read them.
	template<typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
	struct gebp_kernel<float, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
	{
	  void operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB,
	                  Index rows, Index depth, Index cols, float alpha,
	                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
	};

	template<typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
	void gebp_kernel<float, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
	  ::operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB,
	               Index rows, Index depth, Index cols, float alpha,
	               Index strideA, Index strideB, Index offsetA, Index offsetB)
	{
	  const Index remaining_rows = rows % accRows;
	  const Index remaining_cols = cols % accCols;
	  // Number of floats occupied by one full packed panel.
	  const Index panelSize = depth*floatVectorSize;

	  const float *rhs_base = blockB;

	  Index row = 0;
	  for(; row + accRows <= rows; row += accRows)
	  {
	    const float *lhs_base = blockA + (row/accRows)*panelSize;

	    Index col = 0;

	    // Widest path: accCount column panels share every LHS vector load.
	    for(; col + accCount*accCols <= cols; col += accCount*accCols)
	    {
	      const float *lhs_ptr  = lhs_base;
	      const float *rhs_ptr  = rhs_base + ((col/accCols) + 0)*panelSize;
	      const float *rhs_ptr2 = rhs_base + ((col/accCols) + 1)*panelSize;
	      const float *rhs_ptr3 = rhs_base + ((col/accCols) + 2)*panelSize;
	      const float *rhs_ptr4 = rhs_base + ((col/accCols) + 3)*panelSize;

	      __vector_quad acc, acc2, acc3, acc4;
	      __builtin_mma_xxsetaccz(&acc);
	      __builtin_mma_xxsetaccz(&acc2);
	      __builtin_mma_xxsetaccz(&acc3);
	      __builtin_mma_xxsetaccz(&acc4);

	      for(Index k = 0; k < depth; k++)
	      {
	        const __vector float lhsV  = *reinterpret_cast<const __vector float *>(lhs_ptr );
	        const __vector float rhsV  = *reinterpret_cast<const __vector float *>(rhs_ptr );
	        const __vector float rhs2V = *reinterpret_cast<const __vector float *>(rhs_ptr2);
	        const __vector float rhs3V = *reinterpret_cast<const __vector float *>(rhs_ptr3);
	        const __vector float rhs4V = *reinterpret_cast<const __vector float *>(rhs_ptr4);

	        __builtin_mma_xvf32gerpp(&acc , (__vector unsigned char) rhsV , (__vector unsigned char) lhsV);
	        __builtin_mma_xvf32gerpp(&acc2, (__vector unsigned char) rhs2V, (__vector unsigned char) lhsV);
	        __builtin_mma_xvf32gerpp(&acc3, (__vector unsigned char) rhs3V, (__vector unsigned char) lhsV);
	        __builtin_mma_xvf32gerpp(&acc4, (__vector unsigned char) rhs4V, (__vector unsigned char) lhsV);

	        lhs_ptr  += floatVectorSize;
	        rhs_ptr  += floatVectorSize;
	        rhs_ptr2 += floatVectorSize;
	        rhs_ptr3 += floatVectorSize;
	        rhs_ptr4 += floatVectorSize;
	      }

	      storeAccumulator<DataMapper, Index, float>(row, col            , res, alpha, &acc );
	      storeAccumulator<DataMapper, Index, float>(row, col + 1*accCols, res, alpha, &acc2);
	      storeAccumulator<DataMapper, Index, float>(row, col + 2*accCols, res, alpha, &acc3);
	      storeAccumulator<DataMapper, Index, float>(row, col + 3*accCols, res, alpha, &acc4);
	    }

	    // Remaining full column panels, one accumulator at a time.
	    for(; col + accCols <= cols; col += accCols)
	    {
	      const float *lhs_ptr = lhs_base;
	      const float *rhs_ptr = rhs_base + (col/accCols)*panelSize;

	      __vector_quad acc;
	      __builtin_mma_xxsetaccz(&acc);
	      for(Index k = 0; k < depth; k++)
	      {
	        const __vector float lhsV = *reinterpret_cast<const __vector float *>(lhs_ptr);
	        const __vector float rhsV = *reinterpret_cast<const __vector float *>(rhs_ptr);

	        __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char) rhsV, (__vector unsigned char) lhsV);

	        lhs_ptr += floatVectorSize;
	        rhs_ptr += floatVectorSize;
	      }

	      storeAccumulator<DataMapper, Index, float>(row, col, res, alpha, &acc);
	    }

	    // Right border: full row panel against the partial column panel.
	    if(remaining_cols > 0)
	    {
	      gebpScalarTail<Index, DataMapper>(res, lhs_base, rhs_base + (col/accCols)*panelSize,
	                                        row, col, accRows, remaining_cols,
	                                        floatVectorSize, remaining_cols, depth, alpha);
	    }
	  }

	  // Bottom border: the partial row panel against every column panel.
	  if(remaining_rows > 0)
	  {
	    const float *lhs_base = blockA + (row/accRows)*panelSize;

	    Index col = 0;
	    for(; col + accCols <= cols; col += accCols)
	    {
	      gebpScalarTail<Index, DataMapper>(res, lhs_base, rhs_base + (col/accCols)*panelSize,
	                                        row, col, remaining_rows, accCols,
	                                        remaining_rows, floatVectorSize, depth, alpha);
	    }

	    // Bottom-right corner: partial rows against partial columns.
	    if(remaining_cols > 0)
	    {
	      gebpScalarTail<Index, DataMapper>(res, lhs_base, rhs_base + (col/accCols)*panelSize,
	                                        row, col, remaining_rows, remaining_cols,
	                                        remaining_rows, remaining_cols, depth, alpha);
	    }
	  }
	}

	} // end namespace internal

	} // end namespace Eigen

	#endif // __MMA__
	#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H