Clean up some old technical debt: Move FixedPoint from third_party/eigen3 to third_party/tensorflow. The fixed point library is not a part of or used by Eigen, but was developed for TensorFlow and never pushed to Eigen upstream.
This also simplifies the TF open source build: After this change tensorflow/third_party/eigen3/... consists entirely of forwarding headers (and can perhaps be removed?)
PiperOrigin-RevId: 478097075
Change-Id: Ie4785af12387ec430b73406451e01a4afae2cce3
diff --git a/unsupported/Eigen/CXX11/FixedPoint b/unsupported/Eigen/CXX11/FixedPoint
deleted file mode 100644
index 67cb111..0000000
--- a/unsupported/Eigen/CXX11/FixedPoint
+++ /dev/null
@@ -1,58 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_FIXED_POINT_MODULE
-#define EIGEN_CXX11_FIXED_POINT_MODULE
-
-#include <Eigen/Core>
-#include <stdint.h>
-
-/** \defgroup CXX11_FixedPoint_Module Fixed Point Module
- *
- * This module provides common core features for all modules that
- * explicitly depend on C++11. Currently, this is only the Tensor
- * module. Note that at this stage, you should not need to include
- * this module directly.
- *
- * It also provides a limited fallback for compilers that don't support
- * CXX11 yet, such as nvcc.
- *
- * \code
- * #include <Eigen/CXX11/FixedPoint>
- * \endcode
- */
-
-#include "src/FixedPoint/FixedPointTypes.h"
-
-// Use optimized implementations whenever available
-#if defined (EIGEN_VECTORIZE_AVX512DQ) || defined (EIGEN_VECTORIZE_AVX512BW)
-#include "src/FixedPoint/PacketMathAVX512.h"
-#include "src/FixedPoint/TypeCastingAVX512.h"
-
-#elif defined EIGEN_VECTORIZE_AVX2
-#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-#define EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
-#include "src/FixedPoint/PacketMathAVX2.h"
-#include "src/FixedPoint/MatMatProductAVX2.h"
-#include "src/FixedPoint/TypeCastingAVX2.h"
-
-#elif defined EIGEN_VECTORIZE_AVX
-#include "src/FixedPoint/PacketMathAVX.h"
-
-#elif defined EIGEN_VECTORIZE_NEON
-#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-#include "src/FixedPoint/MatMatProductNEON.h"
-#endif
-
-// Use the default implementation when no optimized code is available
-#include "src/FixedPoint/MatMatProduct.h"
-#include "src/FixedPoint/MatVecProduct.h"
-
-
-#endif // EIGEN_CXX11_FIXED_POINT_MODULE
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
deleted file mode 100644
index f37897b..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+++ /dev/null
@@ -1,345 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_
-#define CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_
-
-#include <cmath>
-#include <iostream>
-
-namespace Eigen {
-
-// The mantissa part of the fixed point representation. See
-// go/tensorfixedpoint for details
-struct QInt8;
-struct QUInt8;
-struct QInt16;
-struct QUInt16;
-struct QInt32;
-
-template <>
-struct NumTraits<QInt8> : GenericNumTraits<int8_t> {};
-template <>
-struct NumTraits<QUInt8> : GenericNumTraits<uint8_t> {};
-template <>
-struct NumTraits<QInt16> : GenericNumTraits<int16_t> {};
-template <>
-struct NumTraits<QUInt16> : GenericNumTraits<uint16_t> {};
-template <>
-struct NumTraits<QInt32> : GenericNumTraits<int32_t> {};
-
-namespace internal {
-template <>
-struct scalar_product_traits<QInt32, double> {
- enum {
- // Cost = NumTraits<T>::MulCost,
- Defined = 1
- };
- typedef QInt32 ReturnType;
-};
-}
-
-// Wrap the 8bit int into a QInt8 struct instead of using a typedef to prevent
-// the compiler from silently type cast the mantissa into a bigger or a smaller
-// representation.
-struct QInt8 {
- QInt8() : value(0) {}
- QInt8(const int8_t v) : value(v) {}
- QInt8(const QInt32 v);
-
- operator int() const { return static_cast<int>(value); }
-
- int8_t value;
-};
-
-struct QUInt8 {
- QUInt8() : value(0) {}
- QUInt8(const uint8_t v) : value(v) {}
- QUInt8(const QInt32 v);
-
- operator int() const { return static_cast<int>(value); }
-
- uint8_t value;
-};
-
-struct QInt16 {
- QInt16() : value(0) {}
- QInt16(const int16_t v) : value(v) {}
- QInt16(const QInt32 v);
- operator int() const { return static_cast<int>(value); }
-
- int16_t value;
-};
-
-struct QUInt16 {
- QUInt16() : value(0) {}
- QUInt16(const uint16_t v) : value(v) {}
- QUInt16(const QInt32 v);
- operator int() const { return static_cast<int>(value); }
-
- uint16_t value;
-};
-
-struct QInt32 {
- QInt32() : value(0) {}
- QInt32(const int8_t v) : value(v) {}
- QInt32(const int32_t v) : value(v) {}
- QInt32(const uint32_t v) : value(static_cast<int32_t>(v)) {}
- QInt32(const QInt8 v) : value(v.value) {}
- QInt32(const float v) : value(static_cast<int32_t>(lrint(v))) {}
-#ifdef EIGEN_MAKING_DOCS
- // Workaround to fix build on PPC.
- QInt32(unsigned long v) : value(v) {}
-#endif
-
- operator float() const { return static_cast<float>(value); }
-
- int32_t value;
-};
-
-EIGEN_STRONG_INLINE QInt8::QInt8(const QInt32 v)
- : value(static_cast<int8_t>(
- v.value > 127 ? 127 : (v.value < -128 ? -128 : v.value))) {}
-EIGEN_STRONG_INLINE QUInt8::QUInt8(const QInt32 v)
- : value(static_cast<uint8_t>(v.value > 255 ? 255
- : (v.value < 0 ? 0 : v.value))) {
-}
-EIGEN_STRONG_INLINE QInt16::QInt16(const QInt32 v)
- : value(static_cast<int16_t>(
- v.value > 32767 ? 32767 : (v.value < -32768 ? -32768 : v.value))) {}
-EIGEN_STRONG_INLINE QUInt16::QUInt16(const QInt32 v)
- : value(static_cast<uint16_t>(
- v.value > 65535 ? 65535 : (v.value < 0 ? 0 : v.value))) {}
-
-// Basic widening 8-bit operations: This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt8 b) {
- return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QUInt8 b) {
- return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt8 b) {
- return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt8 b) {
- return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
-}
-
-// Basic widening 16-bit operations: This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt16 b) {
- return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QUInt16 b) {
- return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt16 b) {
- return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt16 b) {
- return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
-}
-
-// Mixed QInt32 op QInt8 operations. This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt8 b) {
- return QInt32(a.value + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) + b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt8 b) {
- return QInt32(a.value - static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) - b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt8 b) {
- return QInt32(a.value * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) * b.value);
-}
-
-// Mixed QInt32 op QInt16 operations. This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt16 b) {
- return QInt32(a.value + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) + b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt16 b) {
- return QInt32(a.value - static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) - b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt16 b) {
- return QInt32(a.value * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) * b.value);
-}
-
-// Mixed QInt32 op QUInt8 operations. This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt8 b) {
- return QInt32(a.value + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QUInt8 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) + b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt8 b) {
- return QInt32(a.value - static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QUInt8 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) - b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt8 b) {
- return QInt32(a.value * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QUInt8 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) * b.value);
-}
-
-// Mixed QInt32 op QUInt16 operations. This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt16 b) {
- return QInt32(a.value + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QUInt16 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) + b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt16 b) {
- return QInt32(a.value - static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QUInt16 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) - b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt16 b) {
- return QInt32(a.value * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QUInt16 a, const QInt32 b) {
- return QInt32(static_cast<int32_t>(a.value) * b.value);
-}
-
-// Basic arithmetic operations on QInt32, which behaves like a int32_t.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt32 b) {
- return a.value + b.value;
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt32 b) {
- return a.value - b.value;
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt32 b) {
- return a.value * b.value;
-}
-EIGEN_STRONG_INLINE QInt32 operator/(const QInt32 a, const QInt32 b) {
- return a.value / b.value;
-}
-EIGEN_STRONG_INLINE QInt32& operator+=(QInt32& a, const QInt32 b) {
- a.value += b.value;
- return a;
-}
-EIGEN_STRONG_INLINE QInt32& operator-=(QInt32& a, const QInt32 b) {
- a.value -= b.value;
- return a;
-}
-EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const QInt32 b) {
- a.value *= b.value;
- return a;
-}
-EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) {
- a.value /= b.value;
- return a;
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { return -a.value; }
-
-// Scaling QInt32 by double. We do the arithmetic in double because
-// float only has 23 bits of mantissa, so casting QInt32 to float might reduce
-// accuracy by discarding up to 7 (least significant) bits.
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const double b) {
- return static_cast<int32_t>(lrint(static_cast<double>(a.value) * b));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const double a, const QInt32 b) {
- return static_cast<int32_t>(lrint(a * static_cast<double>(b.value)));
-}
-EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const double b) {
- a.value = static_cast<int32_t>(lrint(static_cast<double>(a.value) * b));
- return a;
-}
-
-// Comparisons
-EIGEN_STRONG_INLINE bool operator==(const QInt8 a, const QInt8 b) {
- return a.value == b.value;
-}
-EIGEN_STRONG_INLINE bool operator==(const QUInt8 a, const QUInt8 b) {
- return a.value == b.value;
-}
-EIGEN_STRONG_INLINE bool operator==(const QInt16 a, const QInt16 b) {
- return a.value == b.value;
-}
-EIGEN_STRONG_INLINE bool operator==(const QUInt16 a, const QUInt16 b) {
- return a.value == b.value;
-}
-EIGEN_STRONG_INLINE bool operator==(const QInt32 a, const QInt32 b) {
- return a.value == b.value;
-}
-
-EIGEN_STRONG_INLINE bool operator<(const QInt8 a, const QInt8 b) {
- return a.value < b.value;
-}
-EIGEN_STRONG_INLINE bool operator<(const QUInt8 a, const QUInt8 b) {
- return a.value < b.value;
-}
-EIGEN_STRONG_INLINE bool operator<(const QInt16 a, const QInt16 b) {
- return a.value < b.value;
-}
-EIGEN_STRONG_INLINE bool operator<(const QUInt16 a, const QUInt16 b) {
- return a.value < b.value;
-}
-EIGEN_STRONG_INLINE bool operator<(const QInt32 a, const QInt32 b) {
- return a.value < b.value;
-}
-
-EIGEN_STRONG_INLINE bool operator>(const QInt8 a, const QInt8 b) {
- return a.value > b.value;
-}
-EIGEN_STRONG_INLINE bool operator>(const QUInt8 a, const QUInt8 b) {
- return a.value > b.value;
-}
-EIGEN_STRONG_INLINE bool operator>(const QInt16 a, const QInt16 b) {
- return a.value > b.value;
-}
-EIGEN_STRONG_INLINE bool operator>(const QUInt16 a, const QUInt16 b) {
- return a.value > b.value;
-}
-EIGEN_STRONG_INLINE bool operator>(const QInt32 a, const QInt32 b) {
- return a.value > b.value;
-}
-
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt8 a) {
- os << static_cast<int>(a.value);
- return os;
-}
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt8 a) {
- os << static_cast<int>(a.value);
- return os;
-}
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt16 a) {
- os << static_cast<int>(a.value);
- return os;
-}
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt16 a) {
- os << static_cast<int>(a.value);
- return os;
-}
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt32 a) {
- os << a.value;
- return os;
-}
-
-} // namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
deleted file mode 100644
index 3f93f9f..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+++ /dev/null
@@ -1,345 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_
-#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_
-
-namespace Eigen {
-namespace internal {
-
-// Accumulate the product of 2 QInt8 inputs on 32 bits to prevent
-// overflows
-template <>
-struct scalar_product_traits<QInt8, QInt8> {
- enum { Defined = 1 };
- typedef QInt32 ReturnType;
-};
-
-// Accumulate the product of 2 QInt16 inputs on 32 bits to prevent
-// overflows
-template <>
-struct scalar_product_traits<QInt16, QInt16> {
- enum { Defined = 1 };
- typedef QInt32 ReturnType;
-};
-
-// Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits
-// to prevent overflows
-template <>
-struct scalar_product_traits<QInt8, QUInt8> {
- enum { Defined = 1 };
- typedef QInt32 ReturnType;
-};
-
-// Accumulate the product of QUInt8 inputs with Qint8 inputs on 32 bits
-// to prevent overflows
-template <>
-struct scalar_product_traits<QUInt8, QInt8> {
- enum { Defined = 1 };
- typedef QInt32 ReturnType;
-};
-
-// Description of the product implementation. It's pretty simple now since
-// nothing is vectorized yet.
-// This definition tackle the case where both lhs and rhs are encoded using
-// signed 8bit integers
-#ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT
-
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs> {
- public:
- typedef QInt8 LhsScalar;
- typedef QInt8 RhsScalar;
- typedef QInt32 ResScalar;
-
- typedef typename packet_traits<LhsScalar>::type LhsPacket;
- typedef LhsPacket LhsPacket4Packing;
-
- enum {
- // register block size along the M and N directions
- // One for the current implementation
- nr = 1,
- mr = 1,
- // Progress made at each iteration of the product loop
- // also 1 for the current implementation
- LhsProgress = 1,
- RhsProgress = 1
- };
-};
-
-// The signed 8bit Mat-Mat product itself.
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> {
- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const QInt8* blockA,
- const QInt8* blockB, Index rows, Index depth, Index cols,
- QInt32 alpha, Index strideA = -1, Index strideB = -1,
- Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr,
- ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
- Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
- Index strideB, Index offsetA, Index offsetB) {
- EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- eigen_assert(alpha.value == 1);
- eigen_assert(strideA == -1);
- eigen_assert(strideB == -1);
- eigen_assert(offsetA == 0);
- eigen_assert(offsetB == 0);
-
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
- eigen_assert(depth > 0);
- eigen_assert(blockA);
- eigen_assert(blockB);
-
- for (Index j = 0; j < cols; ++j) {
- Index startB = j * depth;
-
- for (Index i = 0; i < rows; ++i) {
- Index startA = i * depth;
-
- for (Index k = 0; k < depth; ++k) {
- res(i, j) += blockA[startA + k] * blockB[startB + k];
- }
- }
- }
-}
-#endif
-
-// This definition tackle the case where the lhs is encoded using signed 8bit
-// integers and the rhs using unsigned 8bit integers.
-#ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
- public:
- typedef QInt8 LhsScalar;
- typedef QUInt8 RhsScalar;
- typedef QInt32 ResScalar;
-
- typedef typename packet_traits<LhsScalar>::type LhsPacket;
- typedef LhsPacket LhsPacket4Packing;
-
- enum {
- // register block size along the M and N directions
- // One for the current implementation
- nr = 1,
- mr = 1,
- // Progress made at each iteration of the product loop
- // also 1 for the current implementation
- LhsProgress = 1,
- RhsProgress = 1
- };
-};
-
-// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> {
- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const QInt8* blockA,
- const QUInt8* blockB, Index rows, Index depth, Index cols,
- QInt32 alpha, Index strideA = -1, Index strideB = -1,
- Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
- ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
- Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
- Index strideB, Index offsetA, Index offsetB) {
- EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- eigen_assert(alpha.value == 1);
- eigen_assert(strideA == -1);
- eigen_assert(strideB == -1);
- eigen_assert(offsetA == 0);
- eigen_assert(offsetB == 0);
-
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
- eigen_assert(depth > 0);
- eigen_assert(blockA);
- eigen_assert(blockB);
-
- for (Index j = 0; j < cols; ++j) {
- Index startB = j * depth;
-
- for (Index i = 0; i < rows; ++i) {
- Index startA = i * depth;
-
- for (Index k = 0; k < depth; ++k) {
- res(i, j) += blockA[startA + k] * blockB[startB + k];
- }
- }
- }
-}
-#endif
-
-// This definition tackle the case where the khs is encoded using unsigned 8bit
-// integers and the rhs using signed 8bit integers.
-#ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs> {
- public:
- typedef QUInt8 LhsScalar;
- typedef QInt8 RhsScalar;
- typedef QInt32 ResScalar;
-
- typedef typename packet_traits<LhsScalar>::type LhsPacket;
- typedef LhsPacket LhsPacket4Packing;
-
- enum {
- // register block size along the M and N directions
- // One for the current implementation
- nr = 1,
- mr = 1,
- // Progress made at each iteration of the product loop
- // also 1 for the current implementation
- LhsProgress = 1,
- RhsProgress = 1
- };
-};
-
-// Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> {
- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const QUInt8* blockA,
- const QInt8* blockB, Index rows, Index depth, Index cols,
- QInt32 alpha, Index strideA = -1, Index strideB = -1,
- Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr,
- ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
- Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
- Index strideB, Index offsetA, Index offsetB) {
- EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- eigen_assert(alpha.value == 1);
- eigen_assert(strideA == -1);
- eigen_assert(strideB == -1);
- eigen_assert(offsetA == 0);
- eigen_assert(offsetB == 0);
-
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
- eigen_assert(depth > 0);
- eigen_assert(blockA);
- eigen_assert(blockB);
-
- for (Index j = 0; j < cols; ++j) {
- Index startB = j * depth;
-
- for (Index i = 0; i < rows; ++i) {
- Index startA = i * depth;
-
- for (Index k = 0; k < depth; ++k) {
- res(i, j) += blockA[startA + k] * blockB[startB + k];
- }
- }
- }
-}
-#endif
-
-#ifndef EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
-
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
- public:
- typedef QInt16 LhsScalar;
- typedef QInt16 RhsScalar;
- typedef QInt32 ResScalar;
-
- typedef typename packet_traits<LhsScalar>::type LhsPacket;
- typedef LhsPacket LhsPacket4Packing;
-
- enum {
- // register block size along the M and N directions
- // One for the current implementation
- nr = 1,
- mr = 1,
- // Progress made at each iteration of the product loop
- // also 1 for the current implementation
- LhsProgress = 1,
- RhsProgress = 1
- };
-};
-
-// The signed 16bit Mat-Mat product itself.
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> {
- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const QInt16* blockA,
- const QInt16* blockB, Index rows, Index depth, Index cols,
- QInt32 alpha, Index strideA = -1, Index strideB = -1,
- Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr,
- ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
- Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
- Index strideB, Index offsetA, Index offsetB) {
- EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- eigen_assert(alpha.value == 1);
- eigen_assert(strideA == -1);
- eigen_assert(strideB == -1);
- eigen_assert(offsetA == 0);
- eigen_assert(offsetB == 0);
-
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
- eigen_assert(depth > 0);
- eigen_assert(blockA);
- eigen_assert(blockB);
-
- for (Index j = 0; j < cols; ++j) {
- Index startB = j * depth;
-
- for (Index i = 0; i < rows; ++i) {
- Index startA = i * depth;
-
- for (Index k = 0; k < depth; ++k) {
- res(i, j) += blockA[startA + k] * blockB[startB + k];
- }
- }
- }
-}
-#endif
-
-} // namespace internal
-} // namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
deleted file mode 100644
index 8547dca..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+++ /dev/null
@@ -1,2289 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Matthew Sarett <msarett@google.com>
-// Copyright (C) 2016 Nishant Patil <nishantpatil@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_
-#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_
-
-namespace Eigen {
-namespace internal {
-
-// AVX2 optimized implementation of Mat-Mat product.
-// LHS is encoded using signed 16-bit integers.
-// RHS is encoded using signed 16-bit integers.
-#ifdef EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
-
-// Define quantized traits
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
- public:
- typedef QInt16 LhsScalar;
- typedef QInt16 RhsScalar;
- typedef QInt32 ResScalar;
-
- typedef typename packet_traits<LhsScalar>::type LhsPacket;
- typedef LhsPacket LhsPacket4Packing;
-
- enum {
- // Define register blocking scheme.
- nr = 16,
- mr = 16,
- kr = 4,
- // Ignore progress tracking per loop iteration.
- LhsProgress = -1,
- RhsProgress = -1
- };
-};
-
-// Specialized blocking for quantized implementations.
-// Used by TensorContractionThreadPool, inputs must have dimensions that are
-// multiples of 32.
-template <typename Index, int ShardingType>
-class TensorContractionBlocking<QInt16, QInt16, QInt16, Index, ShardingType> {
- public:
- TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
- : kc_(((k + 15) / 16) * 16),
- mc_(((m + 15) / 16) * 16),
- nc_(((n + 15) / 16) * 16) {
- eigen_assert(mc_ % 16 == 0);
- eigen_assert(kc_ % 16 == 0);
- if (!k || !m || !n) {
- return;
- }
-
- if (ShardingType == ShardByCol) {
- eigen_assert(nc_ % 16 == 0);
- nc_ = (((nc_ / num_threads) + 15) / 16) * 16;
- } else {
- eigen_assert(nc_ % 16 == 0);
- mc_ = (((mc_ / num_threads) + 15) / 16) * 16;
- }
- }
-
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-
- private:
- Index kc_;
- Index mc_;
- Index nc_;
-};
-
-// Specialized blocking for quantized implementations.
-// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to
-// multiples of 32.
-template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<ColMajor, QInt16, QInt16, MaxRows, MaxCols, MaxDepth,
- KcFactor, false>
- : public level3_blocking<QInt16, QInt16> {
- DenseIndex m_sizeA;
- DenseIndex m_sizeB;
-
- public:
- gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
- DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
- this->m_mc = ((rows + 15) / 16) * 16;
- this->m_nc = ((cols + 15) / 16) * 16;
- this->m_kc = ((depth + 15) / 16) * 16;
- m_sizeA = this->m_mc * this->m_kc;
- m_sizeB = this->m_kc * this->m_nc;
- }
- void allocateA() {
- if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt16>(m_sizeA);
- }
- void allocateB() {
- if (this->m_blockB == 0) this->m_blockB = aligned_new<QInt16>(m_sizeB);
- }
- void allocateAll() {
- allocateA();
- allocateB();
- }
- ~gemm_blocking_space() {
- aligned_delete(this->m_blockA, m_sizeA);
- aligned_delete(this->m_blockB, m_sizeB);
- }
-};
-
-// Below are the fully optimized versions that are correct only for sizes that
-// are multiple of 16. It is about a 10% performance benefit to keep these
-// implementations separate.
-
-// Arrange a block of the left input matrix in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0 E0 F0 G0 H0 ...
-// A1 B1 C1 D1 E1 F1 G1 H1 ...
-// A2 B2 C2 D2 E2 F2 G2 H2 ...
-// A3 B3 C3 D3 E3 F3 G3 H3 ...
-// A4 B4 C4 D4 E4 F4 G4 H4 ...
-// A5 B5 C5 D5 E5 F5 G5 H5 ...
-// A6 B6 C6 D6 E6 F6 G6 H6 ...
-// A7 B7 C7 D7 E7 F7 G7 H7 ...
-// A8 ...
-// ...
-//
-// Packing with m = 8 yields row major output (A0 beside B0 in memory):
-// A0 B0
-// A1 B1
-// A2 B2
-// A3 B3
-// A4 B4
-// A5 B5
-// A6 B6
-// A7 B7
-// ...
-//
-// The purpose is to collect m rows of size k. Two elements of the same
-// row are arranged contiguously because madd performs an adjacent addition
-// in the kernel.
-
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
- bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, QInt16, ColMajor,
- Conjugate, PanelMode> {
- EIGEN_DONT_INLINE void operator()(QInt16* blockA, const DataMapper& lhs,
- Index depth, Index rows, Index stride = 0,
- Index offset = 0);
-};
-
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
- bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2,
- QInt16, ColMajor, Conjugate, PanelMode>::
-operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows,
- Index stride, Index offset) {
- eigen_assert(stride == 0);
- eigen_assert(offset == 0);
-
- typedef typename packet_traits<QInt16>::type Packet;
-
- // Use alternate function for weird sizes
- if (rows % 16 != 0 || depth % 16 != 0) {
- assert(false &&
- "only depths and rows that are a multiple of 16 are currently "
- "supported");
- // gemm_pack_lhs_any<QInt16, Index, DataMapper, Pack1, Pack2, ColMajor,
- // Conjugate, PanelMode> lhs_pack;
- // return lhs_pack(blockA, lhs, depth, rows, stride, offset);
- }
-
- // Get vector pointer
- __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
-
- // Pack rows in sets of 16
- for (Index m = 0; m < rows; m += 16) {
- // Pack depth in sets of 4
- for (Index k = 0; k < depth; k += 4) {
- // Load vectors
- __m256i L_A = lhs.template loadPacket<Packet>(m, k);
- __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
- __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
- __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
-
- // Rearrange the inputs as required by the kernel
- __m256i L_AB0_AB7 = _mm256_unpacklo_epi16(L_A, L_B);
- __m256i L_AB8_AB15 = _mm256_unpackhi_epi16(L_A, L_B);
- __m256i L_CD0_CD7 = _mm256_unpacklo_epi16(L_C, L_D);
- __m256i L_CD8_CD15 = _mm256_unpackhi_epi16(L_C, L_D);
-
- __m256i L_AD0 = _mm256_permute2x128_si256(L_AB0_AB7, L_AB8_AB15, 0x20);
- _mm256_store_si256(blockA_256++, L_AD0);
- __m256i L_AD8 = _mm256_permute2x128_si256(L_CD0_CD7, L_CD8_CD15, 0x20);
- _mm256_store_si256(blockA_256++, L_AD8);
- __m256i L_AD16 = _mm256_permute2x128_si256(L_AB0_AB7, L_AB8_AB15, 0x31);
- _mm256_store_si256(blockA_256++, L_AD16);
- __m256i L_AD24 = _mm256_permute2x128_si256(L_CD0_CD7, L_CD8_CD15, 0x31);
- _mm256_store_si256(blockA_256++, L_AD24);
- }
- }
-}
-
-// Arrange a block of the right input matrix in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0 E0 F0 G0 H0 ...
-// A1 B1 C1 D1 E1 F1 G1 H1 ...
-// A2 B2 C2 D2 E2 F2 G2 H2 ...
-// A3 B3 C3 D3 E3 F3 G3 H3 ...
-// A4 B4 C4 D4 E4 F4 G4 H4 ...
-// A5 B5 C5 D5 E5 F5 G5 H5 ...
-// A6 B6 C6 D6 E6 F6 G6 H6 ...
-// A7 B7 C7 D7 E7 F7 G7 H7 ...
-// A8 ...
-// ...
-// Packing yields row major output (A0 beside A1 in memory):
-// A0 A1 A2 A3 A4 A5 A6 A7
-// B0 B1 B2 B3 B4 B5 B6 B7
-// ...
-//
-// At least two elements of the same col are arranged contiguously because
-// maddubs and madd both perform an adjacent addition in the kernel. We can
-// save work by leaving 4 adjacent elements because kr = 4.
-// The purpose is to collect n cols of size k. Two elements of the same
-// col are arranged contiguously because madd performs an adjacent addition
-// in the kernel.
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
- bool PanelMode>
-struct gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor, Conjugate,
- PanelMode> {
- EIGEN_DONT_INLINE void operator()(QInt16* blockB, const DataMapper& rhs,
- Index depth, Index cols, Index stride = 0,
- Index offset = 0);
-};
-
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
- bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor,
- Conjugate, PanelMode>::
-operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols,
- Index stride, Index offset) {
- eigen_assert(stride == 0);
- eigen_assert(offset == 0);
-
- typedef typename packet_traits<QInt16>::type Packet;
-
- // Use alternate function for weird sizes
- if (cols % 16 != 0 || depth % 16 != 0) {
- assert(false &&
- "only depths and cols that are a multiple of 16 are currently "
- "supported");
- // gemm_pack_rhs_any<QInt16, Index, DataMapper, nr, ColMajor, Conjugate,
- // PanelMode> rhs_pack;
- // return rhs_pack(blockB, rhs, depth, cols, stride, offset);
- }
-
- // Get vector pointer
- __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
-
- // Perform a step of the packing for 4 columns
- __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_4, R_AD_8, R_AD_12;
-#define PACK_STEP \
- R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \
- R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \
- R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \
- R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \
- R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \
- R_AD_8 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \
- R_AD_4 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \
- R_AD_12 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
- _mm256_store_si256(blockB_256, R_AD_0); \
- _mm256_store_si256(blockB_256 + 4, R_AD_4); \
- _mm256_store_si256(blockB_256 + 8, R_AD_8); \
- _mm256_store_si256(blockB_256 + 12, R_AD_12); \
- blockB_256++;
-
- // Pack cols in sets of 16
- for (Index n = 0; n < cols; n += 16) {
- // Pack depth in sets of 16
- for (Index k = 0; k < depth; k += 16) {
- __m256i R_A = rhs.template loadPacket<Packet>(k, n);
- __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
- __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
- __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 4);
- R_B = rhs.template loadPacket<Packet>(k, n + 5);
- R_C = rhs.template loadPacket<Packet>(k, n + 6);
- R_D = rhs.template loadPacket<Packet>(k, n + 7);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 8);
- R_B = rhs.template loadPacket<Packet>(k, n + 9);
- R_C = rhs.template loadPacket<Packet>(k, n + 10);
- R_D = rhs.template loadPacket<Packet>(k, n + 11);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 12);
- R_B = rhs.template loadPacket<Packet>(k, n + 13);
- R_C = rhs.template loadPacket<Packet>(k, n + 14);
- R_D = rhs.template loadPacket<Packet>(k, n + 15);
- PACK_STEP;
-
- blockB_256 += 12;
- }
- }
-#undef PACK_STEP
-}
-
-// Perform the actual multiplication on packed inputs
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> {
- typedef typename DataMapper::LinearMapper LinearMapper;
-
- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const QInt16* blockA,
- const QInt16* blockB, Index rows, Index depth, Index cols,
- QInt32 alpha, Index strideA = -1, Index strideB = -1,
- Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr,
- ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
- Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
- Index strideB, Index offsetA, Index offsetB) {
- EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(alpha.value == 1);
- eigen_assert(strideA == -1);
- eigen_assert(strideB == -1);
- eigen_assert(offsetA == 0);
- eigen_assert(offsetB == 0);
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
- eigen_assert(depth > 0);
- eigen_assert(blockA);
- eigen_assert(blockB);
-
- // Use alternate function for weird sizes
- if (rows % 16 != 0 || cols % 16 != 0 || depth % 16 != 0) {
- assert(false &&
- "only depths, cols and rows that are a multiple of 16 are currently "
- "supported");
- // gebp_kernel_any<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
- // ConjugateRhs> gebp;
- // return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA,
- // strideB, offsetA, offsetB);
- }
-
- // Create result block
- QInt32* blockO = aligned_new<QInt32>(16 * 16);
- memset(blockO, 0, 16 * 16 * sizeof(QInt32));
-
- // Get vectorized pointers
- __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
- const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
- const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
-
- // Loop over blocks of 16 columns
- for (Index n = 0; n < cols; n += 16) {
- // Reset index into blockA
- Index indexL = 0;
- // Loop over blocks of 16 rows
- for (Index m = 0; m < rows; m += 16) {
- // Reset index into blockB
- Index indexR = n / 16 * depth;
- // Loop over blocks of 4 on depth
- for (Index k = 0; k < depth; k += 4) {
- // Load inputs
- __m256i L_AD0 = blockA_256[indexL++];
- __m256i L_AD8 = blockA_256[indexL++];
- __m256i L_EH0 = blockA_256[indexL++];
- __m256i L_EH8 = blockA_256[indexL++];
-
- __m256i R_AH0 = blockB_256[indexR++];
- __m256i R_AH4 = blockB_256[indexR++];
- __m256i R_AH8 = blockB_256[indexR++];
- __m256i R_AH12 = blockB_256[indexR++];
-
- // Declare variables used in COMPUTE_STEP
- __m256i P_32_A, P_32_B, P_32;
-
-#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \
- P_32_A = _mm256_madd_epi16(R_INPUT_A, L_AD0); \
- P_32_B = _mm256_madd_epi16(R_INPUT_B, L_AD8); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 2 * OFFSET, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 2 * OFFSET), P_32)); \
- \
- P_32_A = _mm256_madd_epi16(R_INPUT_A, L_EH0); \
- P_32_B = _mm256_madd_epi16(R_INPUT_B, L_EH8); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 2 * OFFSET + 1, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 2 * OFFSET + 1), P_32));
-
- // Permute and shuffle to copy a single value across the entire vector
- // Then compute the multiplication
- // Replicate lower 128-bits of R_AH0 across both lanes
- __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
- // Copy first two elements of R_AH0 across entire vector
- __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- // Copy second two elements of R_AH0 across entire vector
- __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-
- COMPUTE_STEP(R_AD0, R_EH0, 0);
- __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 1);
-
- // Replicate upper 128-bits of R_AH0 across both lanes
- R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
- __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 2);
- __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 3);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 4);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 5);
- R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 6);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 7);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 8);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 9);
- R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 10);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 11);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 12);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 13);
- R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 14);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 15);
-
-#undef COMPUTE_STEP
- }
-
- // Transfer the results to the result matrix
- Index i = 0;
- for (Index j = n; j < n + 16; j++) {
- LinearMapper r0 = res.getLinearMapper(m, j);
- LinearMapper r1 = res.getLinearMapper(m + 8, j);
- typedef typename packet_traits<QInt32>::type Packet;
- r0.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r0.template loadPacket<Packet>(0)));
- r1.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r1.template loadPacket<Packet>(0)));
- }
-
- // Zero the result block so it can be reused
- memset(blockO, 0, 16 * 16 * sizeof(QInt32));
- }
- }
- aligned_delete(blockO, 16 * 16);
-}
-
-#endif
-
-// AVX2 optimized implementation of Mat-Mat product.
-// LHS is encoded using signed 8-bit integers.
-// RHS is encoded using unsigned 8-bit integers.
-#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-
-// Define quantized traits
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
- public:
- typedef QInt8 LhsScalar;
- typedef QUInt8 RhsScalar;
- typedef QInt32 ResScalar;
-
- typedef typename packet_traits<LhsScalar>::type LhsPacket;
- typedef LhsPacket LhsPacket4Packing;
-
- enum {
- // Define register blocking scheme.
- nr = 32,
- mr = 32,
- kr = 8,
- // Ignore progress tracking per loop iteration.
- LhsProgress = -1,
- RhsProgress = -1
- };
-};
-
-// Specialized blocking for quantized implementations.
-// Used by TensorContractionThreadPool, inputs must have dimensions that are
-// multiples of 32.
-template <typename ResScalar, typename Index, typename LeftTensor,
- typename left_nocontract_t, typename left_contract_t,
- bool left_inner_dim_contiguous, bool left_inner_dim_reordered,
- int LeftAlignment, typename RightTensor, typename right_nocontract_t,
- typename right_contract_t, bool right_inner_dim_contiguous,
- bool right_inner_dim_reordered, int RightAlignment, int ShardingType>
-class TensorContractionBlocking<
- ResScalar,
- TensorContractionInputMapper<
- QInt8, Index, Lhs, LeftTensor, left_nocontract_t, left_contract_t, 32,
- left_inner_dim_contiguous, left_inner_dim_reordered, LeftAlignment>,
- TensorContractionInputMapper<QUInt8, Index, Rhs, RightTensor,
- right_nocontract_t, right_contract_t, 32,
- right_inner_dim_contiguous,
- right_inner_dim_reordered, RightAlignment>,
- Index, ShardingType> {
- public:
- typedef QInt8 LhsScalar;
- typedef QUInt8 RhsScalar;
-
- TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
- : kc_(k), mc_(m), nc_(n) {
- eigen_assert(m % 32 == 0);
- eigen_assert(k % 32 == 0);
- if (!k || !m || !n) {
- return;
- }
-
- if (ShardingType == ShardByCol) {
- eigen_assert(n % 32 == 0);
- nc_ = (((n / num_threads) + 31) / 32) * 32;
- } else {
- eigen_assert(n % 32 == 0 || n == 1);
- // Special case to avoid breaking the unimplemented matrix-vector case
- if (n == 1) {
- nc_ = 32;
- }
- mc_ = (((m / num_threads) + 31) / 32) * 32;
- }
- }
-
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-
- private:
- Index kc_;
- Index mc_;
- Index nc_;
-};
-
-// Specialized blocking for quantized implementations.
-// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to
-// multiples of 32.
-template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<ColMajor, QInt8, QInt8, MaxRows, MaxCols, MaxDepth,
- KcFactor, false>
- : public level3_blocking<QInt8, QInt8> {
- DenseIndex m_sizeA;
- DenseIndex m_sizeB;
-
- public:
- gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
- DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
- this->m_mc = ((rows + 31) / 32) * 32;
- this->m_nc = ((cols + 31) / 32) * 32;
- this->m_kc = ((depth + 31) / 32) * 32;
- m_sizeA = this->m_mc * this->m_kc;
- m_sizeB = this->m_kc * this->m_nc;
- }
- void allocateA() {
- if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA);
- }
- void allocateB() {
- if (this->m_blockB == 0) this->m_blockB = aligned_new<QInt8>(m_sizeB);
- }
- void allocateAll() {
- allocateA();
- allocateB();
- }
- ~gemm_blocking_space() {
- aligned_delete(this->m_blockA, m_sizeA);
- aligned_delete(this->m_blockB, m_sizeB);
- }
-};
-
-template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth,
- KcFactor, false>
- : public level3_blocking<QInt8, QUInt8> {
- DenseIndex m_sizeA;
- DenseIndex m_sizeB;
-
- public:
- gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
- DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
- this->m_mc = ((rows + 31) / 32) * 32;
- this->m_nc = ((cols + 31) / 32) * 32;
- this->m_kc = ((depth + 31) / 32) * 32;
- m_sizeA = this->m_mc * this->m_kc;
- m_sizeB = this->m_kc * this->m_nc;
- }
- void allocateA() {
- if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA);
- }
- void allocateB() {
- if (this->m_blockB == 0) this->m_blockB = aligned_new<QUInt8>(m_sizeB);
- }
- void allocateAll() {
- allocateA();
- allocateB();
- }
- ~gemm_blocking_space() {
- aligned_delete(this->m_blockA, m_sizeA);
- aligned_delete(this->m_blockB, m_sizeB);
- }
-};
-
-// Alternate templates for any input sizes
-template <typename Scalar, typename Index, typename DataMapper, int Pack1,
- int Pack2, int StorageOrder, bool Conjugate = false,
- bool PanelMode = false>
-struct gemm_pack_lhs_any;
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
- bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
- Conjugate, PanelMode> {
- EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
- Index depth, Index rows, Index stride = 0,
- Index offset = 0);
-};
-
-template <typename Scalar, typename Index, typename DataMapper, int nr,
- int StorageOrder, bool Conjugate = false, bool PanelMode = false>
-struct gemm_pack_rhs_any;
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
- bool PanelMode>
-struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
- PanelMode> {
- EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs,
- Index depth, Index cols, Index stride = 0,
- Index offset = 0);
-};
-
-template <typename LhsScalar, typename RhsScalar, typename Index,
- typename DataMapper, int mr, int nr, bool ConjugateLhs = false,
- bool ConjugateRhs = false>
-struct gebp_kernel_any;
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> {
- typedef typename DataMapper::LinearMapper LinearMapper;
-
- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const QInt8* blockA,
- const QUInt8* blockB, Index rows, Index depth, Index cols,
- QInt32 alpha, Index strideA = -1, Index strideB = -1,
- Index offsetA = 0, Index offsetB = 0);
-};
-
-// Alternate implementations for any input sizes
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
- bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2,
- ColMajor, Conjugate, PanelMode>::
-operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
- Index stride, Index offset) {
- eigen_assert(stride == 0);
- eigen_assert(offset == 0);
-
- typedef typename packet_traits<QInt8>::type Packet;
-
- // Get vector pointer
- __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
-
- // Get even multiples of the dimensions
- Index rows_32 = (rows / 32) * 32;
- Index depth_8 = (depth / 8) * 8;
-
- // Get padding for when depth is not a multiple of 32
- int padding = 0;
- if (depth % 32 != 0) {
- int depth_32 = (depth / 32) * 32;
- int extra_depth = depth - depth_32;
- int extra_depth_8 = ((extra_depth + 7) / 8) * 8;
- padding = 32 - extra_depth_8;
- }
-
- // Pack rows in sets of 32
- for (Index m = 0; m < rows_32; m += 32) {
- // Pack depth in sets of 8
- for (Index k = 0; k < depth_8; k += 8) {
- // Load vectors
- __m256i L_A = lhs.template loadPacket<Packet>(m, k);
- __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
-
- // Interleave 8-bit elements
- __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
- __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
-
- __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
- __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
- __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
- __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
- // Interleave 16-bit elements
- __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
- __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
- // Use permute before we store to cross 128-bit lanes
- __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
- _mm256_store_si256(blockA_256++, L_AD0);
-
- // Complete packing for 32 x 8 block
- __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
- __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
- _mm256_store_si256(blockA_256++, L_AD8);
- _mm256_store_si256(blockA_256++, L_AD16);
- __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
- _mm256_store_si256(blockA_256++, L_AD24);
- __m256i L_E = lhs.template loadPacket<Packet>(m, k + 4);
- __m256i L_F = lhs.template loadPacket<Packet>(m, k + 5);
- __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
- __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
- __m256i L_G = lhs.template loadPacket<Packet>(m, k + 6);
- __m256i L_H = lhs.template loadPacket<Packet>(m, k + 7);
- __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
- __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
- __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
- _mm256_store_si256(blockA_256++, L_EH0);
- __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
- __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
- _mm256_store_si256(blockA_256++, L_EH8);
- _mm256_store_si256(blockA_256++, L_EH16);
- __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
- _mm256_store_si256(blockA_256++, L_EH24);
- }
-
- // Finish the k dimension, padding with zeros
- if (depth_8 < depth) {
- __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
- switch (depth - depth_8) {
- case 1:
- L_A = lhs.template loadPacket<Packet>(m, depth_8);
- L_B = _mm256_setzero_si256();
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- break;
- case 2:
- L_A = lhs.template loadPacket<Packet>(m, depth_8);
- L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- break;
- case 3:
- L_A = lhs.template loadPacket<Packet>(m, depth_8);
- L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
- L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- break;
- case 4:
- L_A = lhs.template loadPacket<Packet>(m, depth_8);
- L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
- L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
- L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- break;
- case 5:
- L_A = lhs.template loadPacket<Packet>(m, depth_8);
- L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
- L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
- L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
- L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- break;
- case 6:
- L_A = lhs.template loadPacket<Packet>(m, depth_8);
- L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
- L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
- L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
- L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
- L_F = lhs.template loadPacket<Packet>(m, depth_8 + 5);
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- break;
- case 7:
- L_A = lhs.template loadPacket<Packet>(m, depth_8);
- L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
- L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
- L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
- L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
- L_F = lhs.template loadPacket<Packet>(m, depth_8 + 5);
- L_G = lhs.template loadPacket<Packet>(m, depth_8 + 6);
- L_H = _mm256_setzero_si256();
- break;
- }
-
- // Interleave 8-bit elements
- __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
- __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
-
- __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
- __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
- // Interleave 16-bit elements
- __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
- __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
- // Use permute before we store to cross 128-bit lanes
- __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
- _mm256_store_si256(blockA_256++, L_AD0);
-
- // Complete packing
- __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
- __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
- _mm256_store_si256(blockA_256++, L_AD8);
- _mm256_store_si256(blockA_256++, L_AD16);
- __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
- _mm256_store_si256(blockA_256++, L_AD24);
- __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
- __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
- __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
- __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
- __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
- _mm256_store_si256(blockA_256++, L_EH0);
- __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
- __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
- _mm256_store_si256(blockA_256++, L_EH8);
- _mm256_store_si256(blockA_256++, L_EH16);
- __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
- _mm256_store_si256(blockA_256++, L_EH24);
- }
- blockA_256 += padding;
- }
-
- // Finish the m dimension, padding with zeros
- if (rows_32 < rows) {
- // Pack depth in sets of 8
- for (Index k = 0; k < depth_8; k += 8) {
- // Load vectors
- __m256i L_A = _mm256_setzero_si256();
- __m256i L_B = _mm256_setzero_si256();
- __m256i L_C = _mm256_setzero_si256();
- __m256i L_D = _mm256_setzero_si256();
- __m256i L_E = _mm256_setzero_si256();
- __m256i L_F = _mm256_setzero_si256();
- __m256i L_G = _mm256_setzero_si256();
- __m256i L_H = _mm256_setzero_si256();
- for (Index m = 0; m < rows - rows_32; m++) {
- QInt8* ptr = (QInt8*)&L_A;
- ptr[m] = lhs(rows_32 + m, k);
- ptr = (QInt8*)&L_B;
- ptr[m] = lhs(rows_32 + m, k + 1);
- ptr = (QInt8*)&L_C;
- ptr[m] = lhs(rows_32 + m, k + 2);
- ptr = (QInt8*)&L_D;
- ptr[m] = lhs(rows_32 + m, k + 3);
- ptr = (QInt8*)&L_E;
- ptr[m] = lhs(rows_32 + m, k + 4);
- ptr = (QInt8*)&L_F;
- ptr[m] = lhs(rows_32 + m, k + 5);
- ptr = (QInt8*)&L_G;
- ptr[m] = lhs(rows_32 + m, k + 6);
- ptr = (QInt8*)&L_H;
- ptr[m] = lhs(rows_32 + m, k + 7);
- }
-
- // Interleave 8-bit elements
- __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
- __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
- __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
- __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
- // Interleave 16-bit elements
- __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
- __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
- // Use permute before we store to cross 128-bit lanes
- __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
- _mm256_store_si256(blockA_256++, L_AD0);
-
- // Complete packing for 32 x 8 block
- __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
- __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
- _mm256_store_si256(blockA_256++, L_AD8);
- _mm256_store_si256(blockA_256++, L_AD16);
- __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
- _mm256_store_si256(blockA_256++, L_AD24);
- __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
- __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
- __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
- __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
- __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
- _mm256_store_si256(blockA_256++, L_EH0);
- __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
- __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
- _mm256_store_si256(blockA_256++, L_EH8);
- _mm256_store_si256(blockA_256++, L_EH16);
- __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
- _mm256_store_si256(blockA_256++, L_EH24);
- }
-
- // Finish the k dimension, padding with zeros
- if (depth_8 < depth) {
- __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
- QInt8* ptr;
- switch (depth - depth_8) {
- case 1:
- L_A = _mm256_setzero_si256();
- L_B = _mm256_setzero_si256();
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- for (Index m = 0; m < rows - rows_32; m++) {
- QInt8* ptr = (QInt8*)&L_A;
- ptr[m] = lhs(rows_32 + m, depth_8);
- }
- break;
- case 2:
- L_A = _mm256_setzero_si256();
- L_B = _mm256_setzero_si256();
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- for (Index m = 0; m < rows - rows_32; m++) {
- ptr = (QInt8*)&L_A;
- ptr[m] = lhs(rows_32 + m, depth_8);
- ptr = (QInt8*)&L_B;
- ptr[m] = lhs(rows_32 + m, depth_8 + 1);
- }
- break;
- case 3:
- L_A = _mm256_setzero_si256();
- L_B = _mm256_setzero_si256();
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- for (Index m = 0; m < rows - rows_32; m++) {
- ptr = (QInt8*)&L_A;
- ptr[m] = lhs(rows_32 + m, depth_8);
- ptr = (QInt8*)&L_B;
- ptr[m] = lhs(rows_32 + m, depth_8 + 1);
- ptr = (QInt8*)&L_C;
- ptr[m] = lhs(rows_32 + m, depth_8 + 2);
- }
- break;
- case 4:
- L_A = _mm256_setzero_si256();
- L_B = _mm256_setzero_si256();
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- for (Index m = 0; m < rows - rows_32; m++) {
- ptr = (QInt8*)&L_A;
- ptr[m] = lhs(rows_32 + m, depth_8);
- ptr = (QInt8*)&L_B;
- ptr[m] = lhs(rows_32 + m, depth_8 + 1);
- ptr = (QInt8*)&L_C;
- ptr[m] = lhs(rows_32 + m, depth_8 + 2);
- ptr = (QInt8*)&L_D;
- ptr[m] = lhs(rows_32 + m, depth_8 + 3);
- }
- break;
- case 5:
- L_A = _mm256_setzero_si256();
- L_B = _mm256_setzero_si256();
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- for (Index m = 0; m < rows - rows_32; m++) {
- ptr = (QInt8*)&L_A;
- ptr[m] = lhs(rows_32 + m, depth_8);
- ptr = (QInt8*)&L_B;
- ptr[m] = lhs(rows_32 + m, depth_8 + 1);
- ptr = (QInt8*)&L_C;
- ptr[m] = lhs(rows_32 + m, depth_8 + 2);
- ptr = (QInt8*)&L_D;
- ptr[m] = lhs(rows_32 + m, depth_8 + 3);
- ptr = (QInt8*)&L_E;
- ptr[m] = lhs(rows_32 + m, depth_8 + 4);
- }
- break;
- case 6:
- L_A = _mm256_setzero_si256();
- L_B = _mm256_setzero_si256();
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- for (Index m = 0; m < rows - rows_32; m++) {
- ptr = (QInt8*)&L_A;
- ptr[m] = lhs(rows_32 + m, depth_8);
- ptr = (QInt8*)&L_B;
- ptr[m] = lhs(rows_32 + m, depth_8 + 1);
- ptr = (QInt8*)&L_C;
- ptr[m] = lhs(rows_32 + m, depth_8 + 2);
- ptr = (QInt8*)&L_D;
- ptr[m] = lhs(rows_32 + m, depth_8 + 3);
- ptr = (QInt8*)&L_E;
- ptr[m] = lhs(rows_32 + m, depth_8 + 4);
- ptr = (QInt8*)&L_F;
- ptr[m] = lhs(rows_32 + m, depth_8 + 5);
- }
- break;
- case 7:
- L_A = _mm256_setzero_si256();
- L_B = _mm256_setzero_si256();
- L_C = _mm256_setzero_si256();
- L_D = _mm256_setzero_si256();
- L_E = _mm256_setzero_si256();
- L_F = _mm256_setzero_si256();
- L_G = _mm256_setzero_si256();
- L_H = _mm256_setzero_si256();
- for (Index m = 0; m < rows - rows_32; m++) {
- ptr = (QInt8*)&L_A;
- ptr[m] = lhs(rows_32 + m, depth_8);
- ptr = (QInt8*)&L_B;
- ptr[m] = lhs(rows_32 + m, depth_8 + 1);
- ptr = (QInt8*)&L_C;
- ptr[m] = lhs(rows_32 + m, depth_8 + 2);
- ptr = (QInt8*)&L_D;
- ptr[m] = lhs(rows_32 + m, depth_8 + 3);
- ptr = (QInt8*)&L_E;
- ptr[m] = lhs(rows_32 + m, depth_8 + 4);
- ptr = (QInt8*)&L_F;
- ptr[m] = lhs(rows_32 + m, depth_8 + 5);
- ptr = (QInt8*)&L_G;
- ptr[m] = lhs(rows_32 + m, depth_8 + 6);
- }
- break;
- }
-
- // Interleave 8-bit elements
- __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
- __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
- __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
- __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
- // Interleave 16-bit elements
- __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
- __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
- // Use permute before we store to cross 128-bit lanes
- __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
- _mm256_store_si256(blockA_256++, L_AD0);
-
- // Complete packing
- __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
- __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
- _mm256_store_si256(blockA_256++, L_AD8);
- _mm256_store_si256(blockA_256++, L_AD16);
- __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
- _mm256_store_si256(blockA_256++, L_AD24);
- __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
- __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
- __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
- __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
- __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
- _mm256_store_si256(blockA_256++, L_EH0);
- __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
- __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
- _mm256_store_si256(blockA_256++, L_EH8);
- _mm256_store_si256(blockA_256++, L_EH16);
- __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
- _mm256_store_si256(blockA_256++, L_EH24);
- }
- }
-}
-
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
- bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr,
- ColMajor, Conjugate, PanelMode>::
-operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
- Index stride, Index offset) {
- eigen_assert(stride == 0);
- eigen_assert(offset == 0);
-
- typedef typename packet_traits<QUInt8>::type Packet;
-
- // Get vector pointer
- __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
-
- // Get even multiples of the dimensions
- Index cols_32 = (cols / 32) * 32;
- Index depth_32 = (depth / 32) * 32;
-
- // Perform a step of the packing for 4 columns
- __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24;
-#define PACK_STEP \
- R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \
- R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \
- R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \
- R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \
- R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \
- R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \
- R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \
- R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
- _mm256_store_si256(blockB_256, R_AD_0); \
- _mm256_store_si256(blockB_256 + 8, R_AD_8); \
- _mm256_store_si256(blockB_256 + 16, R_AD_16); \
- _mm256_store_si256(blockB_256 + 24, R_AD_24); \
- blockB_256++;
-
- // Pack cols in sets of 32
- for (Index n = 0; n < cols_32; n += 32) {
- // Pack depth in sets of 32
- for (Index k = 0; k < depth_32; k += 32) {
- __m256i R_A = rhs.template loadPacket<Packet>(k, n);
- __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
- __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
- __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 4);
- R_B = rhs.template loadPacket<Packet>(k, n + 5);
- R_C = rhs.template loadPacket<Packet>(k, n + 6);
- R_D = rhs.template loadPacket<Packet>(k, n + 7);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 8);
- R_B = rhs.template loadPacket<Packet>(k, n + 9);
- R_C = rhs.template loadPacket<Packet>(k, n + 10);
- R_D = rhs.template loadPacket<Packet>(k, n + 11);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 12);
- R_B = rhs.template loadPacket<Packet>(k, n + 13);
- R_C = rhs.template loadPacket<Packet>(k, n + 14);
- R_D = rhs.template loadPacket<Packet>(k, n + 15);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 16);
- R_B = rhs.template loadPacket<Packet>(k, n + 17);
- R_C = rhs.template loadPacket<Packet>(k, n + 18);
- R_D = rhs.template loadPacket<Packet>(k, n + 19);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 20);
- R_B = rhs.template loadPacket<Packet>(k, n + 21);
- R_C = rhs.template loadPacket<Packet>(k, n + 22);
- R_D = rhs.template loadPacket<Packet>(k, n + 23);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 24);
- R_B = rhs.template loadPacket<Packet>(k, n + 25);
- R_C = rhs.template loadPacket<Packet>(k, n + 26);
- R_D = rhs.template loadPacket<Packet>(k, n + 27);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 28);
- R_B = rhs.template loadPacket<Packet>(k, n + 29);
- R_C = rhs.template loadPacket<Packet>(k, n + 30);
- R_D = rhs.template loadPacket<Packet>(k, n + 31);
- PACK_STEP;
-
- blockB_256 += 24;
- }
-
- if (depth_32 < depth) {
- QUInt8* ptr;
- __m256i R_A = _mm256_setzero_si256();
- __m256i R_B = _mm256_setzero_si256();
- __m256i R_C = _mm256_setzero_si256();
- __m256i R_D = _mm256_setzero_si256();
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 1);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 2);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 3);
- }
- PACK_STEP;
-
- R_A = _mm256_setzero_si256();
- R_B = _mm256_setzero_si256();
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n + 4);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 5);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 6);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 7);
- }
- PACK_STEP;
-
- R_A = _mm256_setzero_si256();
- R_B = _mm256_setzero_si256();
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n + 8);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 9);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 10);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 11);
- }
- PACK_STEP;
-
- R_A = _mm256_setzero_si256();
- R_B = _mm256_setzero_si256();
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n + 12);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 13);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 14);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 15);
- }
- PACK_STEP;
-
- R_A = _mm256_setzero_si256();
- R_B = _mm256_setzero_si256();
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n + 16);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 17);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 18);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 19);
- }
- PACK_STEP;
-
- R_A = _mm256_setzero_si256();
- R_B = _mm256_setzero_si256();
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n + 20);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 21);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 22);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 23);
- }
- PACK_STEP;
-
- R_A = _mm256_setzero_si256();
- R_B = _mm256_setzero_si256();
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n + 24);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 25);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 26);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 27);
- }
- PACK_STEP;
-
- R_A = _mm256_setzero_si256();
- R_B = _mm256_setzero_si256();
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n + 28);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 29);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 30);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 31);
- }
- PACK_STEP;
- blockB_256 += 24;
- }
- }
-
- // Finish packing cols
- if (cols_32 < cols) {
- // Pack depth in sets of 32
- for (Index k = 0; k < depth_32; k += 32) {
- __m256i R_A, R_B, R_C, R_D;
- Index n;
- for (n = cols_32; n < cols; n += 4) {
- switch (cols - n) {
- case 1:
- R_A = rhs.template loadPacket<Packet>(k, n);
- R_B = _mm256_setzero_si256();
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- PACK_STEP;
- break;
- case 2:
- R_A = rhs.template loadPacket<Packet>(k, n);
- R_B = rhs.template loadPacket<Packet>(k, n + 1);
- R_C = _mm256_setzero_si256();
- R_D = _mm256_setzero_si256();
- PACK_STEP;
- break;
- case 3:
- R_A = rhs.template loadPacket<Packet>(k, n);
- R_B = rhs.template loadPacket<Packet>(k, n + 1);
- R_C = rhs.template loadPacket<Packet>(k, n + 2);
- R_D = _mm256_setzero_si256();
- PACK_STEP;
- break;
- default:
- R_A = rhs.template loadPacket<Packet>(k, n);
- R_B = rhs.template loadPacket<Packet>(k, n + 1);
- R_C = rhs.template loadPacket<Packet>(k, n + 2);
- R_D = rhs.template loadPacket<Packet>(k, n + 3);
- PACK_STEP;
- break;
- }
- }
-
- // Increment the block pointer.
- // We must pad if cols is not a multiple of 32.
- blockB_256 += 32 - (n - cols_32) / 4;
- }
-
- if (depth_32 < depth) {
- for (Index n = cols_32; n < cols; n += 4) {
- QUInt8* ptr;
- __m256i R_A = _mm256_setzero_si256();
- __m256i R_B = _mm256_setzero_si256();
- __m256i R_C = _mm256_setzero_si256();
- __m256i R_D = _mm256_setzero_si256();
- switch (cols - n) {
- case 1:
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n);
- }
- PACK_STEP;
- break;
- case 2:
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 1);
- }
- PACK_STEP;
- break;
- case 3:
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 1);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 2);
- }
- PACK_STEP;
- break;
- default:
- for (Index k = depth_32; k < depth; k++) {
- ptr = (QUInt8*)&R_A;
- ptr[k - depth_32] = rhs(k, n);
- ptr = (QUInt8*)&R_B;
- ptr[k - depth_32] = rhs(k, n + 1);
- ptr = (QUInt8*)&R_C;
- ptr[k - depth_32] = rhs(k, n + 2);
- ptr = (QUInt8*)&R_D;
- ptr[k - depth_32] = rhs(k, n + 3);
- }
- PACK_STEP;
- break;
- }
- }
- }
- }
-#undef PACK_STEP
-}
-
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr,
- ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
- Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
- Index strideB, Index offsetA, Index offsetB) {
- EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(alpha.value == 1);
- eigen_assert(strideA == -1);
- eigen_assert(strideB == -1);
- eigen_assert(offsetA == 0);
- eigen_assert(offsetB == 0);
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
- eigen_assert(depth > 0);
- eigen_assert(blockA);
- eigen_assert(blockB);
-
- Index rows_32 = ((rows + 31) / 32) * 32;
- Index cols_32 = ((cols + 31) / 32) * 32;
- Index depth_32 = ((depth + 31) / 32) * 32;
-
- // Create result block
- ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0);
- memset(blockO, 0, 32 * 32 * sizeof(QInt32));
-
- // Get vectorized pointers
- __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
- const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
- const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
-
- // Loop over blocks of 32 columns
- for (Index n = 0; n < cols_32; n += 32) {
- // Reset index into blockA
- Index indexL = 0;
- // Loop over blocks of 32 rows
- for (Index m = 0; m < rows_32; m += 32) {
- // Reset index into blockB
- Index indexR = n / 32 * depth_32;
- // Loop over blocks of 8 on depth
- for (Index k = 0; k < depth_32; k += 8) {
- // Load inputs
- __m256i L_AD0 = blockA_256[indexL++];
- __m256i L_AD8 = blockA_256[indexL++];
- __m256i L_AD16 = blockA_256[indexL++];
- __m256i L_AD24 = blockA_256[indexL++];
- __m256i L_EH0 = blockA_256[indexL++];
- __m256i L_EH8 = blockA_256[indexL++];
- __m256i L_EH16 = blockA_256[indexL++];
- __m256i L_EH24 = blockA_256[indexL++];
- __m256i R_AH0 = blockB_256[indexR++];
- __m256i R_AH4 = blockB_256[indexR++];
- __m256i R_AH8 = blockB_256[indexR++];
- __m256i R_AH12 = blockB_256[indexR++];
- __m256i R_AH16 = blockB_256[indexR++];
- __m256i R_AH20 = blockB_256[indexR++];
- __m256i R_AH24 = blockB_256[indexR++];
- __m256i R_AH28 = blockB_256[indexR++];
-
- // This constant is used with madd to convert 16 bit to 32 bit
- const __m256i ONE = _mm256_set1_epi32(0x00010001);
-
- // Declare variables used in COMPUTE_STEP
- __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32;
-
-#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \
- P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \
- P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
- P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \
- P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 4 * OFFSET, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \
- \
- P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \
- P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
- P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \
- P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 4 * OFFSET + 1, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \
- \
- P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \
- P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
- P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \
- P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 4 * OFFSET + 2, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \
- \
- P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \
- P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
- P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \
- P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 4 * OFFSET + 3, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32));
-
- // Permute and shuffle to copy a single value across the entire vector
- // Then compute the multiplication
- __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
- __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 0);
- __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 1);
- R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
- __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 2);
- __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 3);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 4);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 5);
- R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 6);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 7);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 8);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 9);
- R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 10);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 11);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 12);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 13);
- R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 14);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 15);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 16);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 17);
- R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 18);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 19);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 20);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 21);
- R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 22);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 23);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 24);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 25);
- R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 26);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 27);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 28);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 29);
- R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 30);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 31);
-
-#undef COMPUTE_STEP
- }
-
- // Transfer the results to the result matrix.
- if (m + 32 <= rows && n + 32 <= cols) {
- Index i = 0;
- for (Index j = n; j < n + 32; j++) {
- LinearMapper r0 = res.getLinearMapper(m, j);
- LinearMapper r1 = res.getLinearMapper(m + 8, j);
- LinearMapper r2 = res.getLinearMapper(m + 16, j);
- LinearMapper r3 = res.getLinearMapper(m + 24, j);
- typedef typename packet_traits<QInt32>::type Packet;
- r0.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r0.template loadPacket<Packet>(0)));
- r1.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r1.template loadPacket<Packet>(0)));
- r2.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r2.template loadPacket<Packet>(0)));
- r3.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r3.template loadPacket<Packet>(0)));
- }
- } else {
- for (Index j = n; j < cols; j++) {
- for (Index i = m; i < rows; i++) {
- res(i, j) = blockO[(j - n) * 32 + (i - m)];
- }
- }
- }
-
- // Zero the result block so it can be reused
- memset(blockO, 0, 32 * 32 * sizeof(QInt32));
- }
- }
-}
-
-// Below are the fully optimized versions that are correct only for sizes that
-// are multiple of 32. It is about a 10% performance benefit to keep these
-// implementations separate.
-
-// Arrange a block of the left input matrix in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0 E0 F0 G0 H0 ...
-// A1 B1 C1 D1 E1 F1 G1 H1 ...
-// A2 B2 C2 D2 E2 F2 G2 H2 ...
-// A3 B3 C3 D3 E3 F3 G3 H3 ...
-// A4 B4 C4 D4 E4 F4 G4 H4 ...
-// A5 B5 C5 D5 E5 F5 G5 H5 ...
-// A6 B6 C6 D6 E6 F6 G6 H6 ...
-// A7 B7 C7 D7 E7 F7 G7 H7 ...
-// A8 ...
-// ...
-//
-// Packing yields output (A0 beside B0 in memory):
-// A0 B0 C0 D0
-// A1 B1 C1 D1
-// A2 B2 C2 D2
-// A3 B3 C3 D3
-// A4 B4 C4 D4
-// A5 B5 C5 D5
-// A6 B6 C6 D6
-// A7 B7 C7 D7
-// ...
-// A31 B31 C31 D31
-// E0 F0 G0 H0
-// E1 F1 G1 H1
-// E2 F2 G2 H2
-// E3 F3 G3 H3
-// E4 F4 G4 H4
-// E5 F5 G5 H5
-// E6 F6 G6 H6
-// E7 F7 G7 H7
-// ...
-//
-// Four elements of the same row are arranged contiguously because maddubs and
-// madd both perform an adjacent addition in the kernel.
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
- bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, QInt8, ColMajor,
- Conjugate, PanelMode> {
- EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
- Index depth, Index rows, Index stride = 0,
- Index offset = 0);
-};
-
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
- bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2,
- QInt8, ColMajor, Conjugate, PanelMode>::
-operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
- Index stride, Index offset) {
- eigen_assert(stride == 0);
- eigen_assert(offset == 0);
-
- typedef typename packet_traits<QInt8>::type Packet;
-
- // Use alternate function for weird sizes
- if (rows % 32 != 0 || depth % 32 != 0) {
- gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
- Conjugate, PanelMode> lhs_pack;
- return lhs_pack(blockA, lhs, depth, rows, stride, offset);
- }
-
- // Get vector pointer
- __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
-
- // Pack rows in sets of 32
- for (Index m = 0; m < rows; m += 32) {
- // Pack depth in sets of 8
- for (Index k = 0; k < depth; k += 8) {
- // Load vectors
- __m256i L_A = lhs.template loadPacket<Packet>(m, k);
- __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
-
- // Interleave 8-bit elements
- __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
- __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
-
- __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
- __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
- __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
- __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
- // Interleave 16-bit elements
- __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
- __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
- // Use permute before we store to cross 128-bit lanes
- __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
- _mm256_store_si256(blockA_256++, L_AD0);
-
- // Complete packing for 32 x 8 block
- __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
- __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
- __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
- _mm256_store_si256(blockA_256++, L_AD8);
- _mm256_store_si256(blockA_256++, L_AD16);
- __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
- _mm256_store_si256(blockA_256++, L_AD24);
- __m256i L_E = lhs.template loadPacket<Packet>(m, k + 4);
- __m256i L_F = lhs.template loadPacket<Packet>(m, k + 5);
- __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
- __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
- __m256i L_G = lhs.template loadPacket<Packet>(m, k + 6);
- __m256i L_H = lhs.template loadPacket<Packet>(m, k + 7);
- __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
- __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
- __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
- __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
- _mm256_store_si256(blockA_256++, L_EH0);
- __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
- __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
- __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
- _mm256_store_si256(blockA_256++, L_EH8);
- _mm256_store_si256(blockA_256++, L_EH16);
- __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
- _mm256_store_si256(blockA_256++, L_EH24);
- }
- }
-}
-
-// Arrange a block of the right input matrix in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0 E0 F0 G0 H0 ...
-// A1 B1 C1 D1 E1 F1 G1 H1 ...
-// A2 B2 C2 D2 E2 F2 G2 H2 ...
-// A3 B3 C3 D3 E3 F3 G3 H3 ...
-// A4 B4 C4 D4 E4 F4 G4 H4 ...
-// A5 B5 C5 D5 E5 F5 G5 H5 ...
-// A6 B6 C6 D6 E6 F6 G6 H6 ...
-// A7 B7 C7 D7 E7 F7 G7 H7 ...
-// A8 ...
-// ...
-//
-// Packing yields row major output (A0 beside A1 in memory):
-// A0 A1 A2 A3 A4 A5 A6 A7
-// B0 B1 B2 B3 B4 B5 B6 B7
-// ...
-//
-// At least four elements of the same col are arranged contiguously because
-// maddubs and madd both perform an adjacent addition in the kernel. We can
-// save work by leaving 8 adjacent elements because kr = 8.
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
- bool PanelMode>
-struct gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
- PanelMode> {
- EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs,
- Index depth, Index cols, Index stride = 0,
- Index offset = 0);
-};
-
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
- bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor,
- Conjugate, PanelMode>::
-operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
- Index stride, Index offset) {
- eigen_assert(stride == 0);
- eigen_assert(offset == 0);
-
- typedef typename packet_traits<QUInt8>::type Packet;
-
- // Use alternate function for weird sizes
- if (cols % 32 != 0 || depth % 32 != 0) {
- gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
- PanelMode> rhs_pack;
- return rhs_pack(blockB, rhs, depth, cols, stride, offset);
- }
-
- // Get vector pointer
- __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
-
- // Perform a step of the packing for 4 columns
- __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24;
-#define PACK_STEP \
- R_AB_L = _mm256_unpacklo_epi64(R_A, R_B); \
- R_CD_L = _mm256_unpacklo_epi64(R_C, R_D); \
- R_AB_H = _mm256_unpackhi_epi64(R_A, R_B); \
- R_CD_H = _mm256_unpackhi_epi64(R_C, R_D); \
- R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20); \
- R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \
- R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20); \
- R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
- _mm256_store_si256(blockB_256, R_AD_0); \
- _mm256_store_si256(blockB_256 + 8, R_AD_8); \
- _mm256_store_si256(blockB_256 + 16, R_AD_16); \
- _mm256_store_si256(blockB_256 + 24, R_AD_24); \
- blockB_256++;
-
- // Pack cols in sets of 32
- for (Index n = 0; n < cols; n += 32) {
- // Pack depth in sets of 32
- for (Index k = 0; k < depth; k += 32) {
- __m256i R_A = rhs.template loadPacket<Packet>(k, n);
- __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
- __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
- __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 4);
- R_B = rhs.template loadPacket<Packet>(k, n + 5);
- R_C = rhs.template loadPacket<Packet>(k, n + 6);
- R_D = rhs.template loadPacket<Packet>(k, n + 7);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 8);
- R_B = rhs.template loadPacket<Packet>(k, n + 9);
- R_C = rhs.template loadPacket<Packet>(k, n + 10);
- R_D = rhs.template loadPacket<Packet>(k, n + 11);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 12);
- R_B = rhs.template loadPacket<Packet>(k, n + 13);
- R_C = rhs.template loadPacket<Packet>(k, n + 14);
- R_D = rhs.template loadPacket<Packet>(k, n + 15);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 16);
- R_B = rhs.template loadPacket<Packet>(k, n + 17);
- R_C = rhs.template loadPacket<Packet>(k, n + 18);
- R_D = rhs.template loadPacket<Packet>(k, n + 19);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 20);
- R_B = rhs.template loadPacket<Packet>(k, n + 21);
- R_C = rhs.template loadPacket<Packet>(k, n + 22);
- R_D = rhs.template loadPacket<Packet>(k, n + 23);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 24);
- R_B = rhs.template loadPacket<Packet>(k, n + 25);
- R_C = rhs.template loadPacket<Packet>(k, n + 26);
- R_D = rhs.template loadPacket<Packet>(k, n + 27);
- PACK_STEP;
-
- R_A = rhs.template loadPacket<Packet>(k, n + 28);
- R_B = rhs.template loadPacket<Packet>(k, n + 29);
- R_C = rhs.template loadPacket<Packet>(k, n + 30);
- R_D = rhs.template loadPacket<Packet>(k, n + 31);
- PACK_STEP;
-
- blockB_256 += 24;
- }
- }
-#undef PACK_STEP
-}
-
-// Perform the actual multiplication on packed inputs
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> {
- typedef typename DataMapper::LinearMapper LinearMapper;
-
- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const QInt8* blockA,
- const QUInt8* blockB, Index rows, Index depth, Index cols,
- QInt32 alpha, Index strideA = -1, Index strideB = -1,
- Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
- ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
- Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
- Index strideB, Index offsetA, Index offsetB) {
- EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(alpha.value == 1);
- eigen_assert(strideA == -1);
- eigen_assert(strideB == -1);
- eigen_assert(offsetA == 0);
- eigen_assert(offsetB == 0);
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
- eigen_assert(depth > 0);
- eigen_assert(blockA);
- eigen_assert(blockB);
-
- // Use alternate function for weird sizes
- if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) {
- gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> gebp;
- return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB,
- offsetA, offsetB);
- }
-
- // Create result block
- QInt32* blockO = aligned_new<QInt32>(32 * 32);
- // Allocating the result block is about 5-10% faster than declaring stack
- // space. It is unclear why this is the case.
- // ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0);
- memset(blockO, 0, 32 * 32 * sizeof(QInt32));
-
- // Get vectorized pointers
- __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
- const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
- const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
-
- // Loop over blocks of 32 columns
- for (Index n = 0; n < cols; n += 32) {
- // Reset index into blockA
- Index indexL = 0;
- // Loop over blocks of 32 rows
- for (Index m = 0; m < rows; m += 32) {
- // Reset index into blockB
- Index indexR = n / 32 * depth;
- // Loop over blocks of 8 on depth
- for (Index k = 0; k < depth; k += 8) {
- // Load inputs
- __m256i L_AD0 = blockA_256[indexL++];
- __m256i L_AD8 = blockA_256[indexL++];
- __m256i L_AD16 = blockA_256[indexL++];
- __m256i L_AD24 = blockA_256[indexL++];
- __m256i L_EH0 = blockA_256[indexL++];
- __m256i L_EH8 = blockA_256[indexL++];
- __m256i L_EH16 = blockA_256[indexL++];
- __m256i L_EH24 = blockA_256[indexL++];
- __m256i R_AH0 = blockB_256[indexR++];
- __m256i R_AH4 = blockB_256[indexR++];
- __m256i R_AH8 = blockB_256[indexR++];
- __m256i R_AH12 = blockB_256[indexR++];
- __m256i R_AH16 = blockB_256[indexR++];
- __m256i R_AH20 = blockB_256[indexR++];
- __m256i R_AH24 = blockB_256[indexR++];
- __m256i R_AH28 = blockB_256[indexR++];
-
- // This constant is used with madd to convert 16 bit to 32 bit
- const __m256i ONE = _mm256_set1_epi32(0x00010001);
-
- // Declare variables used in COMPUTE_STEP
- __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32;
-
-#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET) \
- P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0); \
- P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
- P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0); \
- P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 4 * OFFSET, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32)); \
- \
- P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8); \
- P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
- P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8); \
- P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 4 * OFFSET + 1, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \
- \
- P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16); \
- P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
- P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16); \
- P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 4 * OFFSET + 2, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \
- \
- P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24); \
- P_32_A = _mm256_madd_epi16(P_16_A, ONE); \
- P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24); \
- P_32_B = _mm256_madd_epi16(P_16_B, ONE); \
- P_32 = _mm256_add_epi32(P_32_A, P_32_B); \
- _mm256_store_si256( \
- blockO_256 + 4 * OFFSET + 3, \
- _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32));
-
- // Permute and shuffle to copy a single value across the entire vector
- // Then compute the multiplication
- __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
- __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 0);
- __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 1);
- R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
- __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 2);
- __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 3);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 4);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 5);
- R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 6);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 7);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 8);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 9);
- R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 10);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 11);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 12);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 13);
- R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 14);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 15);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 16);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 17);
- R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 18);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 19);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 20);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 21);
- R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 22);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 23);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 24);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 25);
- R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 26);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 27);
-
- R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00);
- R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD0, R_EH0, 28);
- R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD1, R_EH1, 29);
- R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11);
- R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
- R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
- COMPUTE_STEP(R_AD2, R_EH2, 30);
- R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
- R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
- COMPUTE_STEP(R_AD3, R_EH3, 31);
-
-#undef COMPUTE_STEP
- }
-
- // Transfer the results to the result matrix
- Index i = 0;
- for (Index j = n; j < n + 32; j++) {
- LinearMapper r0 = res.getLinearMapper(m, j);
- LinearMapper r1 = res.getLinearMapper(m + 8, j);
- LinearMapper r2 = res.getLinearMapper(m + 16, j);
- LinearMapper r3 = res.getLinearMapper(m + 24, j);
- typedef typename packet_traits<QInt32>::type Packet;
- r0.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r0.template loadPacket<Packet>(0)));
- r1.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r1.template loadPacket<Packet>(0)));
- r2.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r2.template loadPacket<Packet>(0)));
- r3.template storePacket<Packet>(
- 0, _mm256_add_epi32(blockO_256[i++],
- r3.template loadPacket<Packet>(0)));
- }
-
- // Zero the result block so it can be reused
- memset(blockO, 0, 32 * 32 * sizeof(QInt32));
- }
- }
- aligned_delete(blockO, 32 * 32);
-}
-
-#endif // EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-
-} // namespace internal
-} // namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h b/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
deleted file mode 100644
index 9e0efae..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+++ /dev/null
@@ -1,92 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_
-#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_
-
-namespace Eigen {
-namespace internal {
-
-// AVX2 optimized implementation of the case where the lhs is encoded using
-// signed 8bit
-// integers and the rhs using unsigned 8bit integers.
-#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
- public:
- typedef QInt8 LhsScalar;
- typedef QUInt8 RhsScalar;
- typedef QInt32 ResScalar;
-
- enum {
- // register block size along the M and N directions
- // One for the current implementation
- nr = 1,
- mr = 1,
- // Progress made at each iteration of the product loop
- // also 1 for the current implementation
- LhsProgress = 1,
- RhsProgress = 1
- };
-};
-
-// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
- ConjugateRhs> {
- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const QInt8* blockA,
- const QUInt8* blockB, Index rows, Index depth, Index cols,
- QInt32 alpha, Index strideA = -1, Index strideB = -1,
- Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
- bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
- ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
- Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
- Index strideB, Index offsetA, Index offsetB) {
- EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- eigen_assert(alpha.value == 1);
- eigen_assert(strideA == -1);
- eigen_assert(strideB == -1);
- eigen_assert(offsetA == 0);
- eigen_assert(offsetB == 0);
-
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
- eigen_assert(depth > 0);
- eigen_assert(blockA);
- eigen_assert(blockB);
-
- for (Index j = 0; j < cols; ++j) {
- Index startB = j * depth;
-
- for (Index i = 0; i < rows; ++i) {
- Index startA = i * depth;
-
- for (Index k = 0; k < depth; ++k) {
- res(i, j) += blockA[startA + k] * blockB[startB + k];
- }
- }
- }
-}
-#endif
-
-} // namespace internal
-} // namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
deleted file mode 100644
index f15200c..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+++ /dev/null
@@ -1,145 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_
-#define CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_
-
-namespace Eigen {
-namespace internal {
-
-// Mat-Vec product
-// Both lhs and rhs are encoded as 8bit signed integers
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
- typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index, QInt8, LhsMapper, ColMajor,
- ConjugateLhs, QInt8, RhsMapper,
- ConjugateRhs, Version> {
- EIGEN_DONT_INLINE static void run(Index rows, Index cols,
- const LhsMapper& lhs, const RhsMapper& rhs,
- QInt32* res, Index resIncr, QInt8 alpha);
-};
-
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
- typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<
- Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper,
- ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
- const RhsMapper& rhs, QInt32* res,
- Index resIncr, QInt8 alpha) {
- eigen_assert(alpha.value == 1);
- eigen_assert(resIncr == 1);
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
-
- for (Index i = 0; i < rows; ++i) {
- for (Index j = 0; j < cols; ++j) {
- res[i] += lhs(i, j) * rhs(j, 0);
- }
- }
-}
-
-// Mat-Vec product
-// Both lhs and rhs are encoded as 16bit signed integers
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
- typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index, QInt16, LhsMapper, ColMajor,
- ConjugateLhs, QInt16, RhsMapper,
- ConjugateRhs, Version> {
- EIGEN_DONT_INLINE static void run(Index rows, Index cols,
- const LhsMapper& lhs, const RhsMapper& rhs,
- QInt32* res, Index resIncr, QInt16 alpha);
-};
-
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
- typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<
- Index, QInt16, LhsMapper, ColMajor, ConjugateLhs, QInt16, RhsMapper,
- ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
- const RhsMapper& rhs, QInt32* res,
- Index resIncr, QInt16 alpha) {
- eigen_assert(alpha.value == 1);
- eigen_assert(resIncr == 1);
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
-
- for (Index i = 0; i < rows; ++i) {
- for (Index j = 0; j < cols; ++j) {
- res[i] += lhs(i, j) * rhs(j, 0);
- }
- }
-}
-
-// Mat-Vec product
-// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned
-// integers
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
- typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index, QInt8, LhsMapper, ColMajor,
- ConjugateLhs, QUInt8, RhsMapper,
- ConjugateRhs, Version> {
- EIGEN_DONT_INLINE static void run(Index rows, Index cols,
- const LhsMapper& lhs, const RhsMapper& rhs,
- QInt32* res, Index resIncr, QUInt8 alpha);
-};
-
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
- typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<
- Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QUInt8, RhsMapper,
- ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
- const RhsMapper& rhs, QInt32* res,
- Index resIncr, QUInt8 alpha) {
- eigen_assert(alpha.value == 1);
- eigen_assert(resIncr == 1);
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
-
- for (Index i = 0; i < rows; ++i) {
- for (Index j = 0; j < cols; ++j) {
- res[i] += lhs(i, j) * rhs(j, 0);
- }
- }
-}
-
-// Mat-Vec product
-// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed
-// integers
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
- typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index, QUInt8, LhsMapper, ColMajor,
- ConjugateLhs, QInt8, RhsMapper,
- ConjugateRhs, Version> {
- EIGEN_DONT_INLINE static void run(Index rows, Index cols,
- const LhsMapper& lhs, const RhsMapper& rhs,
- QInt32* res, Index resIncr, QInt8 alpha);
-};
-
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
- typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<
- Index, QUInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper,
- ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
- const RhsMapper& rhs, QInt32* res,
- Index resIncr, QInt8 alpha) {
- eigen_assert(alpha.value == 1);
- eigen_assert(resIncr == 1);
- eigen_assert(rows > 0);
- eigen_assert(cols > 0);
-
- for (Index i = 0; i < rows; ++i) {
- for (Index j = 0; j < cols; ++j) {
- res[i] += lhs(i, j) * rhs(j, 0);
- }
- }
-}
-
-} // namespace internal
-} // namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h b/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h
deleted file mode 100644
index 1a7cd03..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h
+++ /dev/null
@@ -1,149 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
-#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
-#ifdef _MSC_VER
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <smmintrin.h>
-
-#endif
-
-namespace Eigen {
-namespace internal {
-
-typedef eigen_packet_wrapper<__m256i, 10> Packet32q8i;
-typedef eigen_packet_wrapper<__m128i, 11> Packet16q8i;
-
-template <>
-struct packet_traits<QInt8> : default_packet_traits {
- typedef Packet32q8i type;
- typedef Packet16q8i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 32,
- };
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-
-template <>
-struct unpacket_traits<Packet32q8i> {
- typedef QInt8 type;
- typedef Packet16q8i half;
- enum {
- size = 32,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-template <>
-struct unpacket_traits<Packet16q8i> {
- typedef QInt8 type;
- typedef Packet16q8i half;
- enum {
- size = 16,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
- return _mm256_set1_epi8(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
- reinterpret_cast<const __m128i*>(from));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
- reinterpret_cast<const __m128i*>(from));
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
- reinterpret_cast<__m256i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
- from.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
- from.m_val);
-}
-
-typedef __m256 Packet8f;
-
-template <>
-struct type_casting_traits<float, QInt8> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i
-pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b,
- const Packet8f& c, const Packet8f& d) {
- const __m256i a_conv = _mm256_cvtps_epi32(a);
- const __m256i b_conv = _mm256_cvtps_epi32(b);
- const __m256i c_conv = _mm256_cvtps_epi32(c);
- const __m256i d_conv = _mm256_cvtps_epi32(d);
- __m128i low = _mm256_castsi256_si128(a_conv);
- __m128i high = _mm256_extractf128_si256(a_conv, 1);
- __m128i tmp = _mm_packs_epi32(low, high);
- __m128i low2 = _mm256_castsi256_si128(b_conv);
- __m128i high2 = _mm256_extractf128_si256(b_conv, 1);
- __m128i tmp2 = _mm_packs_epi32(low2, high2);
- __m128i converted_low = _mm_packs_epi16(tmp, tmp2);
- low = _mm256_castsi256_si128(c_conv);
- high = _mm256_extractf128_si256(c_conv, 1);
- tmp = _mm_packs_epi32(low, high);
- low2 = _mm256_castsi256_si128(d_conv);
- high2 = _mm256_extractf128_si256(d_conv, 1);
- tmp2 = _mm_packs_epi32(low2, high2);
- __m128i converted_high = _mm_packs_epi16(tmp, tmp2);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(converted_low),
- converted_high, 1);
-}
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
deleted file mode 100644
index 385aaf8..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ /dev/null
@@ -1,545 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
-#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
-#ifdef _MSC_VER
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <smmintrin.h>
-
-#endif
-
-inline int _mm256_extract_epi16_N0(const __m256i X) {
- return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8);
-}
-
-inline int _mm256_extract_epi16_N1(const __m256i X) {
- return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8);
-}
-
-inline int _mm256_extract_epi8_N0(const __m256i X) {
- return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16);
-}
-
-inline int _mm256_extract_epi8_N1(const __m256i X) {
- return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
-}
-
-namespace Eigen {
-namespace internal {
-
-typedef eigen_packet_wrapper<__m256i, 20> Packet32q8i;
-typedef eigen_packet_wrapper<__m256i, 21> Packet16q16i;
-typedef eigen_packet_wrapper<__m256i, 22> Packet32q8u;
-typedef eigen_packet_wrapper<__m128i, 23> Packet16q8i;
-typedef eigen_packet_wrapper<__m128i, 25> Packet16q8u;
-typedef eigen_packet_wrapper<__m128i, 26> Packet8q16i;
-typedef eigen_packet_wrapper<__m256i, 27> Packet8q32i;
-typedef eigen_packet_wrapper<__m128i, 28> Packet4q32i;
-
-#ifndef EIGEN_VECTORIZE_AVX512
-template <>
-struct packet_traits<QInt8> : default_packet_traits {
- typedef Packet32q8i type;
- typedef Packet16q8i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 32,
- };
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-template <>
-struct packet_traits<QUInt8> : default_packet_traits {
- typedef Packet32q8u type;
- typedef Packet16q8u half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 32,
- };
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-template <>
-struct packet_traits<QInt16> : default_packet_traits {
- typedef Packet16q16i type;
- typedef Packet8q16i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 16,
- };
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-template <>
-struct packet_traits<QInt32> : default_packet_traits {
- typedef Packet8q32i type;
- typedef Packet4q32i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 8,
- };
- enum {
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-#endif
-
-template <>
-struct unpacket_traits<Packet32q8i> {
- typedef QInt8 type;
- typedef Packet16q8i half;
- enum {
- size = 32,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-struct unpacket_traits<Packet16q8i> {
- typedef QInt8 type;
- typedef Packet16q8i half;
- enum {
- size = 16,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-struct unpacket_traits<Packet16q16i> {
- typedef QInt16 type;
- typedef Packet8q16i half;
- enum {
- size = 16,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-struct unpacket_traits<Packet8q16i> {
- typedef QInt16 type;
- typedef Packet8q16i half;
- enum {
- size = 8,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-struct unpacket_traits<Packet32q8u> {
- typedef QUInt8 type;
- typedef Packet16q8u half;
- enum {
- size = 32,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-struct unpacket_traits<Packet8q32i> {
- typedef QInt32 type;
- typedef Packet4q32i half;
- enum {
- size = 8,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-// Unaligned load
-template <>
-EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
- reinterpret_cast<const __m128i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
- reinterpret_cast<const __m128i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-
-// Aligned load
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
- reinterpret_cast<const __m128i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q16i pload<Packet8q16i>(const QInt16* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
- reinterpret_cast<const __m128i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
- reinterpret_cast<const __m256i*>(from));
-}
-
-// Unaligned store
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
- reinterpret_cast<__m256i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
- reinterpret_cast<__m256i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
- reinterpret_cast<__m256i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
- reinterpret_cast<__m256i*>(to), from.m_val);
-}
-
-// Aligned store
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
- from.m_val);
-}
-
-// Extract first element.
-template <>
-EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
- return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
- return _mm256_extract_epi16_N0(a.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
- return static_cast<uint8_t>(_mm256_extract_epi8_N0(a.m_val));
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) {
- return _mm256_extract_epi8_N0(a.m_val);
-}
-
-// Initialize to constant value.
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
- return _mm256_set1_epi8(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) {
- return _mm256_set1_epi8(static_cast<uint8_t>(from.value));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) {
- return _mm256_set1_epi32(from.value);
-}
-
-// Basic arithmetic packet ops for QInt32.
-template <>
-EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
- const Packet8q32i& b) {
- return _mm256_add_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
- return _mm256_set1_epi16(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
- const Packet8q32i& b) {
- return _mm256_sub_epi32(a.m_val, b.m_val);
-}
-// Note: mullo truncates the result to 32 bits.
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a,
- const Packet8q32i& b) {
- return _mm256_mullo_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) {
- return _mm256_sub_epi32(_mm256_setzero_si256(), a.m_val);
-}
-
-// Min and max.
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a,
- const Packet8q32i& b) {
- return _mm256_min_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
- const Packet8q32i& b) {
- return _mm256_max_epi32(a.m_val, b.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
- const Packet16q16i& b) {
- return _mm256_min_epi16(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
- const Packet16q16i& b) {
- return _mm256_max_epi16(a.m_val, b.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
- const Packet32q8u& b) {
- return _mm256_min_epu8(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a,
- const Packet32q8u& b) {
- return _mm256_max_epu8(a.m_val, b.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a,
- const Packet32q8i& b) {
- return _mm256_min_epi8(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a,
- const Packet32q8i& b) {
- return _mm256_max_epi8(a.m_val, b.m_val);
-}
-
-// Reductions.
-template <>
-EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) {
- __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1));
- tmp =
- _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- return pfirst<Packet8q32i>(
- _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
-}
-template <>
-EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
- __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1));
- tmp =
- _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- return pfirst<Packet8q32i>(
- _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
- __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
- tmp =
- _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
- return std::min(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
- __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
- tmp =
- _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
- return std::max(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
-}
-
-template <>
-EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
- __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
- tmp =
- _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
- tmp = _mm256_min_epu8(tmp,
- _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- return std::min(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
- static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
- __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1));
- tmp =
- _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
- tmp = _mm256_max_epu8(tmp,
- _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- return std::max(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
- static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
-}
-
-template <>
-EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
- __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
- tmp =
- _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
- tmp = _mm256_min_epi8(tmp,
- _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
- __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
- tmp =
- _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
- tmp = _mm256_max_epi8(tmp,
- _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
- return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
-}
-
-// Vectorized scaling of Packet32q8i by float.
-template <>
-struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> {
- typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type;
-#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
- scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN }
-#endif
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
- operator()(const QInt32& a, const double& b) const {
- return a * b;
- }
-
- EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a,
- const double& b) const {
- __m256d scale = _mm256_set1_pd(b);
- __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
- __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
- __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
- __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
- return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi,
- 1);
- }
-};
-
-template <>
-struct functor_traits<scalar_product_op<QInt32, double>> {
- enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true };
-};
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
deleted file mode 100644
index 5a0ae2e..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+++ /dev/null
@@ -1,516 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
-#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
-
-#include "PacketMathAVX2.h"
-
-namespace Eigen {
-namespace internal {
-
-typedef eigen_packet_wrapper<__m512i, 30> Packet64q8i;
-typedef eigen_packet_wrapper<__m512i, 31> Packet32q16i;
-typedef eigen_packet_wrapper<__m512i, 32> Packet64q8u;
-typedef eigen_packet_wrapper<__m512i, 33> Packet16q32i;
-
-template <>
-struct packet_traits<QInt8> : default_packet_traits {
- typedef Packet64q8i type;
- typedef Packet32q8i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 64,
- };
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-template <>
-struct packet_traits<QUInt8> : default_packet_traits {
- typedef Packet64q8u type;
- typedef Packet32q8u half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 64,
- };
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-template <>
-struct packet_traits<QInt16> : default_packet_traits {
- typedef Packet32q16i type;
- typedef Packet16q16i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 32,
- };
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-template <>
-struct packet_traits<QInt32> : default_packet_traits {
- typedef Packet16q32i type;
- typedef Packet8q32i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 16,
- };
- enum {
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0
- };
-};
-
-template <>
-struct unpacket_traits<Packet64q8i> {
- typedef QInt8 type;
- typedef Packet32q8i half;
- enum {
- size = 64,
- alignment = Aligned64,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-struct unpacket_traits<Packet32q16i> {
- typedef QInt16 type;
- typedef Packet16q16i half;
- enum {
- size = 32,
- alignment = Aligned64,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-struct unpacket_traits<Packet64q8u> {
- typedef QUInt8 type;
- typedef Packet32q8u half;
- enum {
- size = 64,
- alignment = Aligned64,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-template <>
-struct unpacket_traits<Packet16q32i> {
- typedef QInt32 type;
- typedef Packet8q32i half;
- enum {
- size = 16,
- alignment = Aligned64,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-// Unaligned load
-template <>
-EIGEN_STRONG_INLINE Packet64q8i ploadu<Packet64q8i>(const QInt8* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
- reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i ploadu<Packet32q16i>(const QInt16* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
- reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8u ploadu<Packet64q8u>(const QUInt8* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
- reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i ploadu<Packet16q32i>(const QInt32* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
- reinterpret_cast<const __m512i*>(from));
-}
-
-// Aligned load
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pload<Packet64q8i>(const QInt8* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
- reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pload<Packet32q16i>(const QInt16* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
- reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8u pload<Packet64q8u>(const QUInt8* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
- reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
- reinterpret_cast<const __m512i*>(from));
-}
-
-// Unaligned store
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
- reinterpret_cast<__m512i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
- reinterpret_cast<__m512i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
- reinterpret_cast<__m512i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
- reinterpret_cast<__m512i*>(to), from.m_val);
-}
-
-// Aligned store
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
- from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
- from.m_val);
-}
-
-// Extract first element.
-template <>
-EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
- return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0));
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
- return static_cast<uint8_t>(
- _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0));
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
- return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) {
- return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
-}
-
-// Initialize to constant value.
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pset1<Packet64q8i>(const QInt8& from) {
- return _mm512_set1_epi8(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pset1<Packet32q16i>(const QInt16& from) {
- return _mm512_set1_epi16(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8u pset1<Packet64q8u>(const QUInt8& from) {
- return _mm512_set1_epi8(static_cast<uint8_t>(from.value));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) {
- return _mm512_set1_epi32(from.value);
-}
-
-// Basic arithmetic packet ops for QInt32.
-template <>
-EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a,
- const Packet16q32i& b) {
- return _mm512_add_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a,
- const Packet16q32i& b) {
- return _mm512_sub_epi32(a.m_val, b.m_val);
-}
-// Note: mullo truncates the result to 32 bits.
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a,
- const Packet16q32i& b) {
- return _mm512_mullo_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) {
- return _mm512_sub_epi32(_mm512_setzero_si512(), a.m_val);
-}
-
-// Min and max.
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a,
- const Packet16q32i& b) {
- return _mm512_min_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a,
- const Packet16q32i& b) {
- return _mm512_max_epi32(a.m_val, b.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a,
- const Packet64q8u& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
- return _mm512_min_epu8(a.m_val, b.m_val);
-#else
- __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
- __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
- __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
- __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
- __m256i r0 = _mm256_min_epu8(ap0, bp0);
- __m256i r1 = _mm256_min_epu8(ap1, bp1);
- return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a,
- const Packet64q8u& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
- return _mm512_max_epu8(a.m_val, b.m_val);
-#else
- __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
- __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
- __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
- __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
- __m256i r0 = _mm256_max_epu8(ap0, bp0);
- __m256i r1 = _mm256_max_epu8(ap1, bp1);
- return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a,
- const Packet64q8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
- return _mm512_min_epi8(a.m_val, b.m_val);
-#else
- __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
- __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
- __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
- __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
- __m256i r0 = _mm256_min_epi8(ap0, bp0);
- __m256i r1 = _mm256_min_epi8(ap1, bp1);
- return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a,
- const Packet32q16i& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
- return _mm512_min_epi16(a.m_val, b.m_val);
-#else
- __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
- __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
- __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
- __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
- __m256i r0 = _mm256_min_epi16(ap0, bp0);
- __m256i r1 = _mm256_min_epi16(ap1, bp1);
- return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a,
- const Packet64q8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
- return _mm512_max_epi8(a.m_val, b.m_val);
-#else
- __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
- __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
- __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
- __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
- __m256i r0 = _mm256_max_epi8(ap0, bp0);
- __m256i r1 = _mm256_max_epi8(ap1, bp1);
- return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
- const Packet32q16i& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
- return _mm512_max_epi16(a.m_val, b.m_val);
-#else
- __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
- __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
- __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
- __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
- __m256i r0 = _mm256_max_epi16(ap0, bp0);
- __m256i r1 = _mm256_max_epi16(ap1, bp1);
- return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-
-// Reductions.
-template <>
-EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
- Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
- Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
- Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
- Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
- Packet4i res =
- _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
- res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
- res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
- return pfirst(res);
-}
-template <>
-EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
- Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
- Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
- Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
- Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
- Packet4i res =
- _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
- res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
- res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
- return pfirst(res);
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
- Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
- Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
- Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
- Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
- Packet4i res =
- _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
- res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
- res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
- std::uint32_t w = pfirst(res);
- return std::min(
- {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
- Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
- Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
- Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
- Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
- Packet4i res =
- _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
- res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
- res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
- std::uint32_t w = pfirst(res);
- return std::max(
- {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
- Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
- Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
- Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
- Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
- Packet4i res =
- _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
- res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
- res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
- std::uint32_t w = pfirst(res);
- return std::min(
- {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
- static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
- Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
- Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
- Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
- Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
- Packet4i res =
- _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
- res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
- res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
- std::uint32_t w = pfirst(res);
- return std::max(
- {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
- static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
- Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
- Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
- Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
- Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
- Packet4i res =
- _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
- res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
- res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
- std::uint32_t w = pfirst(res);
- return std::min(
- {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
- static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
- Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
- Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
- Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
- Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
- Packet4i res =
- _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
- res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
- res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
- std::uint32_t w = pfirst(res);
- return std::min(
- {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
- static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
-}
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
deleted file mode 100644
index 5dd2cd3..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
-#define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
-
-namespace Eigen {
-namespace internal {
-
-typedef __m256 Packet8f;
-
-template <>
-struct type_casting_traits<QInt32, float> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet8f pcast<Packet8q32i>(const Packet8q32i& a) {
- return _mm256_cvtepi32_ps(a.m_val);
-}
-
-template <>
-struct type_casting_traits<float, QInt32> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pcast<Packet8f>(const Packet8f& a) {
- return _mm256_cvtps_epi32(a);
-}
-
-template <>
-struct type_casting_traits<QInt32, QInt8> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i
-pcast<Packet8q32i, Packet32q8i>(const Packet8q32i& a, const Packet8q32i& b,
- const Packet8q32i& c, const Packet8q32i& d) {
- __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.m_val, b.m_val),
- _mm256_packs_epi32(c.m_val, d.m_val));
- // Since packs does not cross 128 bit lane boundaries,
- // we have to permute to properly order the final result.
- const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- return _mm256_permutevar8x32_epi32(converted, permute_mask);
-}
-
-template <>
-struct type_casting_traits<float, QInt8> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i
-pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b,
- const Packet8f& c, const Packet8f& d) {
- const __m256i a_conv = _mm256_cvtps_epi32(a);
- const __m256i b_conv = _mm256_cvtps_epi32(b);
- const __m256i c_conv = _mm256_cvtps_epi32(c);
- const __m256i d_conv = _mm256_cvtps_epi32(d);
- __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a_conv, b_conv),
- _mm256_packs_epi32(c_conv, d_conv));
- const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- return _mm256_permutevar8x32_epi32(converted, permute_mask);
-}
-
-template <>
-struct type_casting_traits<QInt32, QUInt8> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8u
-pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b,
- const Packet8q32i& c, const Packet8q32i& d) {
- // _mm256_packus_epi32 trims negative numbers to 0 but we can't allow numbers
- // that are too large because _mm256_packus_epi16 expects signed input
- // (example of problem input: 0x11111111, which saturates to 0xffff = -1,
- // which saturates to 0).
- const __m256i a_clip = _mm256_min_epi32(a, _mm256_set1_epi32(255));
- const __m256i b_clip = _mm256_min_epi32(b, _mm256_set1_epi32(255));
- const __m256i c_clip = _mm256_min_epi32(c, _mm256_set1_epi32(255));
- const __m256i d_clip = _mm256_min_epi32(d, _mm256_set1_epi32(255));
- const __m256i converted = _mm256_packus_epi16(
- _mm256_packus_epi32(a_clip, b_clip), _mm256_packus_epi32(c_clip, d_clip));
- // Since packus does not cross 128 bit lane boundaries,
- // we have to permute to properly order the final result.
- const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- return _mm256_permutevar8x32_epi32(converted, permute_mask);
-}
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
deleted file mode 100644
index 17408d1..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
-#define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
-
-namespace Eigen {
-namespace internal {
-
-typedef __m512 Packet16f;
-typedef __m512i Packet16i;
-
-template <>
-struct type_casting_traits<QInt32, float> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet16f pcast<Packet16q32i>(const Packet16q32i& a) {
- return _mm512_cvtepi32_ps(a.m_val);
-}
-
-template <>
-struct type_casting_traits<float, QInt32> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pcast<Packet16f>(const Packet16f& a) {
- return _mm512_cvtps_epi32(a);
-}
-
-template <>
-struct type_casting_traits<float, QInt16> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
- Packet16i a_int = _mm512_cvtps_epi32(a);
- Packet16i b_int = _mm512_cvtps_epi32(b);
-#ifdef EIGEN_VECTORIZE_AVX512BW
- return _mm512_packs_epi32(a_int, b_int);
-#else
- Packet8i ab_int16_low = _mm256_permute4x64_epi64(
- _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
- _mm512_castsi512_si256(b_int)),
- _MM_SHUFFLE(0, 2, 1, 3));
- Packet8i ab_int16_high = _mm256_permute4x64_epi64(
- _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
- _mm512_extracti32x8_epi32(b_int, 1)),
- _MM_SHUFFLE(0, 2, 1, 3));
- return _mm512_inserti32x8(_mm512_castsi256_si512(ab_int16_low), ab_int16_high,
- 1);
-#endif
-}
-
-template <>
-struct type_casting_traits<float, QInt8> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pcast<Packet16f>(const Packet16f& a,
- const Packet16f& b,
- const Packet16f& c,
- const Packet16f& d) {
- Packet16i a_int = _mm512_cvtps_epi32(a);
- Packet16i b_int = _mm512_cvtps_epi32(b);
- Packet16i c_int = _mm512_cvtps_epi32(c);
- Packet16i d_int = _mm512_cvtps_epi32(d);
-#ifdef EIGEN_VECTORIZE_AVX512BW
- return _mm512_packs_epi16(_mm512_packs_epi32(a_int, b_int),
- _mm512_packs_epi32(c_int, d_int));
-#else
- Packet8i ab_int16_low = _mm256_permute4x64_epi64(
- _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
- _mm512_castsi512_si256(b_int)),
- _MM_SHUFFLE(0, 2, 1, 3));
- Packet8i cd_int16_low = _mm256_permute4x64_epi64(
- _mm256_packs_epi32(_mm512_castsi512_si256(c_int),
- _mm512_castsi512_si256(d_int)),
- _MM_SHUFFLE(0, 2, 1, 3));
- Packet8i ab_int16_high = _mm256_permute4x64_epi64(
- _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
- _mm512_extracti32x8_epi32(b_int, 1)),
- _MM_SHUFFLE(0, 2, 1, 3));
- Packet8i cd_int16_high = _mm256_permute4x64_epi64(
- _mm256_packs_epi32(_mm512_extracti32x8_epi32(c_int, 1),
- _mm512_extracti32x8_epi32(d_int, 1)),
- _MM_SHUFFLE(0, 2, 1, 3));
- Packet8i abcd_int8_low = _mm256_permute4x64_epi64(
- _mm256_packs_epi16(ab_int16_low, cd_int16_low), _MM_SHUFFLE(0, 2, 1, 3));
- Packet8i abcd_int8_high =
- _mm256_permute4x64_epi64(_mm256_packs_epi16(ab_int16_high, cd_int16_high),
- _MM_SHUFFLE(0, 2, 1, 3));
- return _mm512_inserti32x8(_mm512_castsi256_si512(abcd_int8_low),
- abcd_int8_high, 1);
-#endif
-}
-
-template <>
-struct type_casting_traits<QInt32, QInt8> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-struct type_casting_traits<QInt32, QInt16> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8i
-pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a, const Packet16q32i& b,
- const Packet16q32i& c, const Packet16q32i& d) {
- __m128i a_part = _mm512_cvtsepi32_epi8(a);
- __m128i b_part = _mm512_cvtsepi32_epi8(b);
- __m128i c_part = _mm512_cvtsepi32_epi8(c);
- __m128i d_part = _mm512_cvtsepi32_epi8(d);
- __m256i ab =
- _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1);
- __m256i cd =
- _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1);
- __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1);
- return converted;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16q32i, Packet32q16i>(
- const Packet16q32i& a, const Packet16q32i& b) {
- __m256i a_part = _mm512_cvtsepi32_epi16(a);
- __m256i b_part = _mm512_cvtsepi32_epi16(b);
- __m512i converted =
- _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1);
- return converted;
-}
-
-template <>
-struct type_casting_traits<QInt32, QUInt8> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8u
-pcast<Packet16q32i, Packet64q8u>(const Packet16q32i& a, const Packet16q32i& b,
- const Packet16q32i& c, const Packet16q32i& d) {
- // Brute-force saturation since there isn't a pack operation for unsigned
- // numbers that keeps the elements in order.
- __m128i a_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
- _mm512_min_epi32(a, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
- __m128i b_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
- _mm512_min_epi32(b, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
- __m128i c_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
- _mm512_min_epi32(c, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
- __m128i d_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
- _mm512_min_epi32(d, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
- __m256i ab =
- _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1);
- __m256i cd =
- _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1);
- __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1);
- return converted;
-}
-
-#if 0
-// The type Packet32q16u does not exist for AVX-512 yet
-template <>
-struct type_casting_traits<QInt32, QUInt16> {
- enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q16u
-pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a,
- const Packet16q32i& b) {
- // Brute-force saturation since there isn't a pack operation for unsigned
- // numbers that keeps the elements in order.
- __m256i a_part =
- _mm512_cvtepi32_epi16(_mm512_max_epi32(
- _mm512_min_epi32(a, _mm512_set1_epi32(65535)), _mm512_setzero_si512()));
- __m256i b_part = _mm512_cvtepi32_epi16(
- _mm512_max_epi32(_mm512_min_epi32(b, _mm512_set1_epi32(65535)),
- _mm512_setzero_si512()));
- __m512i converted =
- _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1);
- return converted;
-}
-#endif
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_