Clean up some old technical debt: Move FixedPoint from third_party/eigen3 to third_party/tensorflow. The fixed point library is not a part of or used by Eigen, but was developed for TensorFlow and never pushed to Eigen upstream. This also simplifies the TF open source build: After this change tensorflow/third_party/eigen3/... consists entirely of forwarding headers (and can perhaps be removed?) PiperOrigin-RevId: 478097075 Change-Id: Ie4785af12387ec430b73406451e01a4afae2cce3

commit: 864374c4052d9d98e851a6515740163c6e86b18a [log] [tgz]
author: Rasmus Munk Larsen <rmlarsen@google.com> Fri Sep 30 15:45:19 2022 -0700
committer: Antonio Sanchez <cantonios@google.com> Tue Oct 18 23:26:09 2022 -0700
tree: 0cb27b14d71dba34c6a706993f5816232d94b1d6
parent: 36eb2497550e758330a0c836c12cd6932a03c6c8 [diff]
diff --git a/unsupported/Eigen/CXX11/FixedPoint b/unsupported/Eigen/CXX11/FixedPoint
deleted file mode 100644
index 67cb111..0000000
--- a/unsupported/Eigen/CXX11/FixedPoint
+++ /dev/null

@@ -1,58 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_FIXED_POINT_MODULE
-#define EIGEN_CXX11_FIXED_POINT_MODULE
-
-#include <Eigen/Core>
-#include <stdint.h>
-
-/** \defgroup CXX11_FixedPoint_Module Fixed Point Module
-  *
-  * This module provides common core features for all modules that
-  * explicitly depend on C++11. Currently, this is only the Tensor
-  * module. Note that at this stage, you should not need to include
-  * this module directly.
-  *
-  * It also provides a limited fallback for compilers that don't support
-  * CXX11 yet, such as nvcc.
-  *
-  * \code
-  * #include <Eigen/CXX11/FixedPoint>
-  * \endcode
-  */
-
-#include "src/FixedPoint/FixedPointTypes.h"
-
-// Use optimized implementations whenever available
-#if defined (EIGEN_VECTORIZE_AVX512DQ) || defined (EIGEN_VECTORIZE_AVX512BW)
-#include "src/FixedPoint/PacketMathAVX512.h"
-#include "src/FixedPoint/TypeCastingAVX512.h"
-
-#elif defined EIGEN_VECTORIZE_AVX2
-#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-#define EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
-#include "src/FixedPoint/PacketMathAVX2.h"
-#include "src/FixedPoint/MatMatProductAVX2.h"
-#include "src/FixedPoint/TypeCastingAVX2.h"
-
-#elif defined EIGEN_VECTORIZE_AVX
-#include "src/FixedPoint/PacketMathAVX.h"
-
-#elif defined EIGEN_VECTORIZE_NEON
-#define EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-#include "src/FixedPoint/MatMatProductNEON.h"
-#endif
-
-// Use the default implementation when no optimized code is available
-#include "src/FixedPoint/MatMatProduct.h"
-#include "src/FixedPoint/MatVecProduct.h"
-
-
-#endif // EIGEN_CXX11_FIXED_POINT_MODULE

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
deleted file mode 100644
index f37897b..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+++ /dev/null

@@ -1,345 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_
-#define CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_
-
-#include <cmath>
-#include <iostream>
-
-namespace Eigen {
-
-// The mantissa part of the fixed point representation. See
-// go/tensorfixedpoint for details
-struct QInt8;
-struct QUInt8;
-struct QInt16;
-struct QUInt16;
-struct QInt32;
-
-template <>
-struct NumTraits<QInt8> : GenericNumTraits<int8_t> {};
-template <>
-struct NumTraits<QUInt8> : GenericNumTraits<uint8_t> {};
-template <>
-struct NumTraits<QInt16> : GenericNumTraits<int16_t> {};
-template <>
-struct NumTraits<QUInt16> : GenericNumTraits<uint16_t> {};
-template <>
-struct NumTraits<QInt32> : GenericNumTraits<int32_t> {};
-
-namespace internal {
-template <>
-struct scalar_product_traits<QInt32, double> {
-  enum {
-    // Cost = NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef QInt32 ReturnType;
-};
-}
-
-// Wrap the 8bit int into a QInt8 struct instead of using a typedef to prevent
-// the compiler from silently type cast the mantissa into a bigger or a smaller
-// representation.
-struct QInt8 {
-  QInt8() : value(0) {}
-  QInt8(const int8_t v) : value(v) {}
-  QInt8(const QInt32 v);
-
-  operator int() const { return static_cast<int>(value); }
-
-  int8_t value;
-};
-
-struct QUInt8 {
-  QUInt8() : value(0) {}
-  QUInt8(const uint8_t v) : value(v) {}
-  QUInt8(const QInt32 v);
-
-  operator int() const { return static_cast<int>(value); }
-
-  uint8_t value;
-};
-
-struct QInt16 {
-  QInt16() : value(0) {}
-  QInt16(const int16_t v) : value(v) {}
-  QInt16(const QInt32 v);
-  operator int() const { return static_cast<int>(value); }
-
-  int16_t value;
-};
-
-struct QUInt16 {
-  QUInt16() : value(0) {}
-  QUInt16(const uint16_t v) : value(v) {}
-  QUInt16(const QInt32 v);
-  operator int() const { return static_cast<int>(value); }
-
-  uint16_t value;
-};
-
-struct QInt32 {
-  QInt32() : value(0) {}
-  QInt32(const int8_t v) : value(v) {}
-  QInt32(const int32_t v) : value(v) {}
-  QInt32(const uint32_t v) : value(static_cast<int32_t>(v)) {}
-  QInt32(const QInt8 v) : value(v.value) {}
-  QInt32(const float v) : value(static_cast<int32_t>(lrint(v))) {}
-#ifdef EIGEN_MAKING_DOCS
-  // Workaround to fix build on PPC.
-  QInt32(unsigned long v) : value(v) {}
-#endif
-
-  operator float() const { return static_cast<float>(value); }
-
-  int32_t value;
-};
-
-EIGEN_STRONG_INLINE QInt8::QInt8(const QInt32 v)
-    : value(static_cast<int8_t>(
-          v.value > 127 ? 127 : (v.value < -128 ? -128 : v.value))) {}
-EIGEN_STRONG_INLINE QUInt8::QUInt8(const QInt32 v)
-    : value(static_cast<uint8_t>(v.value > 255 ? 255
-                                               : (v.value < 0 ? 0 : v.value))) {
-}
-EIGEN_STRONG_INLINE QInt16::QInt16(const QInt32 v)
-    : value(static_cast<int16_t>(
-          v.value > 32767 ? 32767 : (v.value < -32768 ? -32768 : v.value))) {}
-EIGEN_STRONG_INLINE QUInt16::QUInt16(const QInt32 v)
-    : value(static_cast<uint16_t>(
-          v.value > 65535 ? 65535 : (v.value < 0 ? 0 : v.value))) {}
-
-// Basic widening 8-bit operations: This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt8 b) {
-  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QUInt8 b) {
-  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt8 b) {
-  return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt8 b) {
-  return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
-}
-
-// Basic widening 16-bit operations: This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt16 b) {
-  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QUInt16 b) {
-  return QInt32(static_cast<int32_t>(a.value) * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt16 b) {
-  return QInt32(static_cast<int32_t>(a.value) + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt16 b) {
-  return QInt32(static_cast<int32_t>(a.value) - static_cast<int32_t>(b.value));
-}
-
-// Mixed QInt32 op QInt8 operations. This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt8 b) {
-  return QInt32(a.value + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt8 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) + b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt8 b) {
-  return QInt32(a.value - static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt8 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) - b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt8 b) {
-  return QInt32(a.value * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt8 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) * b.value);
-}
-
-// Mixed QInt32 op QInt16 operations. This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt16 b) {
-  return QInt32(a.value + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt16 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) + b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt16 b) {
-  return QInt32(a.value - static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt16 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) - b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt16 b) {
-  return QInt32(a.value * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt16 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) * b.value);
-}
-
-// Mixed QInt32 op QUInt8 operations. This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt8 b) {
-  return QInt32(a.value + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QUInt8 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) + b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt8 b) {
-  return QInt32(a.value - static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QUInt8 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) - b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt8 b) {
-  return QInt32(a.value * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QUInt8 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) * b.value);
-}
-
-// Mixed QInt32 op QUInt16 operations. This will be vectorized in future CLs.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QUInt16 b) {
-  return QInt32(a.value + static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator+(const QUInt16 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) + b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QUInt16 b) {
-  return QInt32(a.value - static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QUInt16 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) - b.value);
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QUInt16 b) {
-  return QInt32(a.value * static_cast<int32_t>(b.value));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QUInt16 a, const QInt32 b) {
-  return QInt32(static_cast<int32_t>(a.value) * b.value);
-}
-
-// Basic arithmetic operations on QInt32, which behaves like a int32_t.
-EIGEN_STRONG_INLINE QInt32 operator+(const QInt32 a, const QInt32 b) {
-  return a.value + b.value;
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a, const QInt32 b) {
-  return a.value - b.value;
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const QInt32 b) {
-  return a.value * b.value;
-}
-EIGEN_STRONG_INLINE QInt32 operator/(const QInt32 a, const QInt32 b) {
-  return a.value / b.value;
-}
-EIGEN_STRONG_INLINE QInt32& operator+=(QInt32& a, const QInt32 b) {
-  a.value += b.value;
-  return a;
-}
-EIGEN_STRONG_INLINE QInt32& operator-=(QInt32& a, const QInt32 b) {
-  a.value -= b.value;
-  return a;
-}
-EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const QInt32 b) {
-  a.value *= b.value;
-  return a;
-}
-EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) {
-  a.value /= b.value;
-  return a;
-}
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { return -a.value; }
-
-// Scaling QInt32 by double. We do the arithmetic in double because
-// float only has 23 bits of mantissa, so casting QInt32 to float might reduce
-// accuracy by discarding up to 7 (least significant) bits.
-EIGEN_STRONG_INLINE QInt32 operator*(const QInt32 a, const double b) {
-  return static_cast<int32_t>(lrint(static_cast<double>(a.value) * b));
-}
-EIGEN_STRONG_INLINE QInt32 operator*(const double a, const QInt32 b) {
-  return static_cast<int32_t>(lrint(a * static_cast<double>(b.value)));
-}
-EIGEN_STRONG_INLINE QInt32& operator*=(QInt32& a, const double b) {
-  a.value = static_cast<int32_t>(lrint(static_cast<double>(a.value) * b));
-  return a;
-}
-
-// Comparisons
-EIGEN_STRONG_INLINE bool operator==(const QInt8 a, const QInt8 b) {
-  return a.value == b.value;
-}
-EIGEN_STRONG_INLINE bool operator==(const QUInt8 a, const QUInt8 b) {
-  return a.value == b.value;
-}
-EIGEN_STRONG_INLINE bool operator==(const QInt16 a, const QInt16 b) {
-  return a.value == b.value;
-}
-EIGEN_STRONG_INLINE bool operator==(const QUInt16 a, const QUInt16 b) {
-  return a.value == b.value;
-}
-EIGEN_STRONG_INLINE bool operator==(const QInt32 a, const QInt32 b) {
-  return a.value == b.value;
-}
-
-EIGEN_STRONG_INLINE bool operator<(const QInt8 a, const QInt8 b) {
-  return a.value < b.value;
-}
-EIGEN_STRONG_INLINE bool operator<(const QUInt8 a, const QUInt8 b) {
-  return a.value < b.value;
-}
-EIGEN_STRONG_INLINE bool operator<(const QInt16 a, const QInt16 b) {
-  return a.value < b.value;
-}
-EIGEN_STRONG_INLINE bool operator<(const QUInt16 a, const QUInt16 b) {
-  return a.value < b.value;
-}
-EIGEN_STRONG_INLINE bool operator<(const QInt32 a, const QInt32 b) {
-  return a.value < b.value;
-}
-
-EIGEN_STRONG_INLINE bool operator>(const QInt8 a, const QInt8 b) {
-  return a.value > b.value;
-}
-EIGEN_STRONG_INLINE bool operator>(const QUInt8 a, const QUInt8 b) {
-  return a.value > b.value;
-}
-EIGEN_STRONG_INLINE bool operator>(const QInt16 a, const QInt16 b) {
-  return a.value > b.value;
-}
-EIGEN_STRONG_INLINE bool operator>(const QUInt16 a, const QUInt16 b) {
-  return a.value > b.value;
-}
-EIGEN_STRONG_INLINE bool operator>(const QInt32 a, const QInt32 b) {
-  return a.value > b.value;
-}
-
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt8 a) {
-  os << static_cast<int>(a.value);
-  return os;
-}
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt8 a) {
-  os << static_cast<int>(a.value);
-  return os;
-}
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt16 a) {
-  os << static_cast<int>(a.value);
-  return os;
-}
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QUInt16 a) {
-  os << static_cast<int>(a.value);
-  return os;
-}
-EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt32 a) {
-  os << a.value;
-  return os;
-}
-
-}  // namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
deleted file mode 100644
index 3f93f9f..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+++ /dev/null

@@ -1,345 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_
-#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_
-
-namespace Eigen {
-namespace internal {
-
-// Accumulate the product of 2 QInt8 inputs on 32 bits to prevent
-// overflows
-template <>
-struct scalar_product_traits<QInt8, QInt8> {
-  enum { Defined = 1 };
-  typedef QInt32 ReturnType;
-};
-
-// Accumulate the product of 2 QInt16 inputs on 32 bits to prevent
-// overflows
-template <>
-struct scalar_product_traits<QInt16, QInt16> {
-  enum { Defined = 1 };
-  typedef QInt32 ReturnType;
-};
-
-// Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits
-// to prevent overflows
-template <>
-struct scalar_product_traits<QInt8, QUInt8> {
-  enum { Defined = 1 };
-  typedef QInt32 ReturnType;
-};
-
-// Accumulate the product of QUInt8 inputs with Qint8 inputs on 32 bits
-// to prevent overflows
-template <>
-struct scalar_product_traits<QUInt8, QInt8> {
-  enum { Defined = 1 };
-  typedef QInt32 ReturnType;
-};
-
-// Description of the product implementation. It's pretty simple now since
-// nothing is vectorized yet.
-// This definition tackle the case where both lhs and rhs are encoded using
-// signed 8bit integers
-#ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT
-
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs> {
- public:
-  typedef QInt8 LhsScalar;
-  typedef QInt8 RhsScalar;
-  typedef QInt32 ResScalar;
-
-  typedef typename packet_traits<LhsScalar>::type LhsPacket;
-  typedef LhsPacket LhsPacket4Packing;
-
-  enum {
-    // register block size along the M and N directions
-    // One for the current implementation
-    nr = 1,
-    mr = 1,
-    // Progress made at each iteration of the product loop
-    // also 1 for the current implementation
-    LhsProgress = 1,
-    RhsProgress = 1
-  };
-};
-
-// The signed 8bit Mat-Mat product itself.
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs,
-                   ConjugateRhs> {
-  EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA,
-                  const QInt8* blockB, Index rows, Index depth, Index cols,
-                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
-                  Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr,
-                                   ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
-           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
-           Index strideB, Index offsetA, Index offsetB) {
-  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  eigen_assert(alpha.value == 1);
-  eigen_assert(strideA == -1);
-  eigen_assert(strideB == -1);
-  eigen_assert(offsetA == 0);
-  eigen_assert(offsetB == 0);
-
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-  eigen_assert(depth > 0);
-  eigen_assert(blockA);
-  eigen_assert(blockB);
-
-  for (Index j = 0; j < cols; ++j) {
-    Index startB = j * depth;
-
-    for (Index i = 0; i < rows; ++i) {
-      Index startA = i * depth;
-
-      for (Index k = 0; k < depth; ++k) {
-        res(i, j) += blockA[startA + k] * blockB[startB + k];
-      }
-    }
-  }
-}
-#endif
-
-// This definition tackle the case where the lhs is encoded using signed 8bit
-// integers and the rhs using unsigned 8bit integers.
-#ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
- public:
-  typedef QInt8 LhsScalar;
-  typedef QUInt8 RhsScalar;
-  typedef QInt32 ResScalar;
-
-  typedef typename packet_traits<LhsScalar>::type LhsPacket;
-  typedef LhsPacket LhsPacket4Packing;
-
-  enum {
-    // register block size along the M and N directions
-    // One for the current implementation
-    nr = 1,
-    mr = 1,
-    // Progress made at each iteration of the product loop
-    // also 1 for the current implementation
-    LhsProgress = 1,
-    RhsProgress = 1
-  };
-};
-
-// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
-                   ConjugateRhs> {
-  EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA,
-                  const QUInt8* blockB, Index rows, Index depth, Index cols,
-                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
-                  Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
-                                   ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
-           Index strideB, Index offsetA, Index offsetB) {
-  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  eigen_assert(alpha.value == 1);
-  eigen_assert(strideA == -1);
-  eigen_assert(strideB == -1);
-  eigen_assert(offsetA == 0);
-  eigen_assert(offsetB == 0);
-
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-  eigen_assert(depth > 0);
-  eigen_assert(blockA);
-  eigen_assert(blockB);
-
-  for (Index j = 0; j < cols; ++j) {
-    Index startB = j * depth;
-
-    for (Index i = 0; i < rows; ++i) {
-      Index startA = i * depth;
-
-      for (Index k = 0; k < depth; ++k) {
-        res(i, j) += blockA[startA + k] * blockB[startB + k];
-      }
-    }
-  }
-}
-#endif
-
-// This definition tackle the case where the khs is encoded using unsigned 8bit
-// integers and the rhs using signed 8bit integers.
-#ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs> {
- public:
-  typedef QUInt8 LhsScalar;
-  typedef QInt8 RhsScalar;
-  typedef QInt32 ResScalar;
-
-  typedef typename packet_traits<LhsScalar>::type LhsPacket;
-  typedef LhsPacket LhsPacket4Packing;
-
-  enum {
-    // register block size along the M and N directions
-    // One for the current implementation
-    nr = 1,
-    mr = 1,
-    // Progress made at each iteration of the product loop
-    // also 1 for the current implementation
-    LhsProgress = 1,
-    RhsProgress = 1
-  };
-};
-
-// Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs,
-                   ConjugateRhs> {
-  EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QUInt8* blockA,
-                  const QInt8* blockB, Index rows, Index depth, Index cols,
-                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
-                  Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr,
-                                   ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
-           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
-           Index strideB, Index offsetA, Index offsetB) {
-  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  eigen_assert(alpha.value == 1);
-  eigen_assert(strideA == -1);
-  eigen_assert(strideB == -1);
-  eigen_assert(offsetA == 0);
-  eigen_assert(offsetB == 0);
-
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-  eigen_assert(depth > 0);
-  eigen_assert(blockA);
-  eigen_assert(blockB);
-
-  for (Index j = 0; j < cols; ++j) {
-    Index startB = j * depth;
-
-    for (Index i = 0; i < rows; ++i) {
-      Index startA = i * depth;
-
-      for (Index k = 0; k < depth; ++k) {
-        res(i, j) += blockA[startA + k] * blockB[startB + k];
-      }
-    }
-  }
-}
-#endif
-
-#ifndef EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
-
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
- public:
-  typedef QInt16 LhsScalar;
-  typedef QInt16 RhsScalar;
-  typedef QInt32 ResScalar;
-
-  typedef typename packet_traits<LhsScalar>::type LhsPacket;
-  typedef LhsPacket LhsPacket4Packing;
-
-  enum {
-    // register block size along the M and N directions
-    // One for the current implementation
-    nr = 1,
-    mr = 1,
-    // Progress made at each iteration of the product loop
-    // also 1 for the current implementation
-    LhsProgress = 1,
-    RhsProgress = 1
-  };
-};
-
-// The signed 16bit Mat-Mat product itself.
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
-                   ConjugateRhs> {
-  EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt16* blockA,
-                  const QInt16* blockB, Index rows, Index depth, Index cols,
-                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
-                  Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr,
-                                   ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
-           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
-           Index strideB, Index offsetA, Index offsetB) {
-  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  eigen_assert(alpha.value == 1);
-  eigen_assert(strideA == -1);
-  eigen_assert(strideB == -1);
-  eigen_assert(offsetA == 0);
-  eigen_assert(offsetB == 0);
-
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-  eigen_assert(depth > 0);
-  eigen_assert(blockA);
-  eigen_assert(blockB);
-
-  for (Index j = 0; j < cols; ++j) {
-    Index startB = j * depth;
-
-    for (Index i = 0; i < rows; ++i) {
-      Index startA = i * depth;
-
-      for (Index k = 0; k < depth; ++k) {
-        res(i, j) += blockA[startA + k] * blockB[startB + k];
-      }
-    }
-  }
-}
-#endif
-
-}  // namespace internal
-}  // namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
deleted file mode 100644
index 8547dca..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+++ /dev/null

@@ -1,2289 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Matthew Sarett <msarett@google.com>
-// Copyright (C) 2016 Nishant Patil <nishantpatil@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_
-#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_
-
-namespace Eigen {
-namespace internal {
-
-// AVX2 optimized implementation of Mat-Mat product.
-// LHS is encoded using signed 16-bit integers.
-// RHS is encoded using signed 16-bit integers.
-#ifdef EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
-
-// Define quantized traits
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
- public:
-  typedef QInt16 LhsScalar;
-  typedef QInt16 RhsScalar;
-  typedef QInt32 ResScalar;
-
-  typedef typename packet_traits<LhsScalar>::type LhsPacket;
-  typedef LhsPacket LhsPacket4Packing;
-
-  enum {
-    // Define register blocking scheme.
-    nr = 16,
-    mr = 16,
-    kr = 4,
-    // Ignore progress tracking per loop iteration.
-    LhsProgress = -1,
-    RhsProgress = -1
-  };
-};
-
-// Specialized blocking for quantized implementations.
-// Used by TensorContractionThreadPool, inputs must have dimensions that are
-// multiples of 32.
-template <typename Index, int ShardingType>
-class TensorContractionBlocking<QInt16, QInt16, QInt16, Index, ShardingType> {
- public:
-  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
-      : kc_(((k + 15) / 16) * 16),
-        mc_(((m + 15) / 16) * 16),
-        nc_(((n + 15) / 16) * 16) {
-    eigen_assert(mc_ % 16 == 0);
-    eigen_assert(kc_ % 16 == 0);
-    if (!k || !m || !n) {
-      return;
-    }
-
-    if (ShardingType == ShardByCol) {
-      eigen_assert(nc_ % 16 == 0);
-      nc_ = (((nc_ / num_threads) + 15) / 16) * 16;
-    } else {
-      eigen_assert(nc_ % 16 == 0);
-      mc_ = (((mc_ / num_threads) + 15) / 16) * 16;
-    }
-  }
-
-  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-
- private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
-};
-
-// Specialized blocking for quantized implementations.
-// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to
-// multiples of 32.
-template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<ColMajor, QInt16, QInt16, MaxRows, MaxCols, MaxDepth,
-                          KcFactor, false>
-    : public level3_blocking<QInt16, QInt16> {
-  DenseIndex m_sizeA;
-  DenseIndex m_sizeB;
-
- public:
-  gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
-                      DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
-    this->m_mc = ((rows + 15) / 16) * 16;
-    this->m_nc = ((cols + 15) / 16) * 16;
-    this->m_kc = ((depth + 15) / 16) * 16;
-    m_sizeA = this->m_mc * this->m_kc;
-    m_sizeB = this->m_kc * this->m_nc;
-  }
-  void allocateA() {
-    if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt16>(m_sizeA);
-  }
-  void allocateB() {
-    if (this->m_blockB == 0) this->m_blockB = aligned_new<QInt16>(m_sizeB);
-  }
-  void allocateAll() {
-    allocateA();
-    allocateB();
-  }
-  ~gemm_blocking_space() {
-    aligned_delete(this->m_blockA, m_sizeA);
-    aligned_delete(this->m_blockB, m_sizeB);
-  }
-};
-
-// Below are the fully optimized versions that are correct only for sizes that
-// are multiple of 16.  It is about a 10% performance benefit to keep these
-// implementations separate.
-
-// Arrange a block of the left input matrix in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0 E0 F0 G0 H0 ...
-// A1 B1 C1 D1 E1 F1 G1 H1 ...
-// A2 B2 C2 D2 E2 F2 G2 H2 ...
-// A3 B3 C3 D3 E3 F3 G3 H3 ...
-// A4 B4 C4 D4 E4 F4 G4 H4 ...
-// A5 B5 C5 D5 E5 F5 G5 H5 ...
-// A6 B6 C6 D6 E6 F6 G6 H6 ...
-// A7 B7 C7 D7 E7 F7 G7 H7 ...
-// A8 ...
-// ...
-//
-// Packing with m = 8 yields row major output (A0 beside B0 in memory):
-// A0 B0
-// A1 B1
-// A2 B2
-// A3 B3
-// A4 B4
-// A5 B5
-// A6 B6
-// A7 B7
-// ...
-//
-// The purpose is to collect m rows of size k.  Two elements of the same
-// row are arranged contiguously because madd performs an adjacent addition
-// in the kernel.
-
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
-          bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, QInt16, ColMajor,
-                     Conjugate, PanelMode> {
-  EIGEN_DONT_INLINE void operator()(QInt16* blockA, const DataMapper& lhs,
-                                    Index depth, Index rows, Index stride = 0,
-                                    Index offset = 0);
-};
-
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
-          bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2,
-                                     QInt16, ColMajor, Conjugate, PanelMode>::
-operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows,
-           Index stride, Index offset) {
-  eigen_assert(stride == 0);
-  eigen_assert(offset == 0);
-
-  typedef typename packet_traits<QInt16>::type Packet;
-
-  // Use alternate function for weird sizes
-  if (rows % 16 != 0 || depth % 16 != 0) {
-    assert(false &&
-           "only depths and rows that are a multiple of 16 are currently "
-           "supported");
-    // gemm_pack_lhs_any<QInt16, Index, DataMapper, Pack1, Pack2, ColMajor,
-    // Conjugate, PanelMode> lhs_pack;
-    // return lhs_pack(blockA, lhs, depth, rows, stride, offset);
-  }
-
-  // Get vector pointer
-  __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
-
-  // Pack rows in sets of 16
-  for (Index m = 0; m < rows; m += 16) {
-    // Pack depth in sets of 4
-    for (Index k = 0; k < depth; k += 4) {
-      // Load vectors
-      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
-      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
-      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
-      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
-
-      // Rearrange the inputs as required by the kernel
-      __m256i L_AB0_AB7 = _mm256_unpacklo_epi16(L_A, L_B);
-      __m256i L_AB8_AB15 = _mm256_unpackhi_epi16(L_A, L_B);
-      __m256i L_CD0_CD7 = _mm256_unpacklo_epi16(L_C, L_D);
-      __m256i L_CD8_CD15 = _mm256_unpackhi_epi16(L_C, L_D);
-
-      __m256i L_AD0 = _mm256_permute2x128_si256(L_AB0_AB7, L_AB8_AB15, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD0);
-      __m256i L_AD8 = _mm256_permute2x128_si256(L_CD0_CD7, L_CD8_CD15, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD8);
-      __m256i L_AD16 = _mm256_permute2x128_si256(L_AB0_AB7, L_AB8_AB15, 0x31);
-      _mm256_store_si256(blockA_256++, L_AD16);
-      __m256i L_AD24 = _mm256_permute2x128_si256(L_CD0_CD7, L_CD8_CD15, 0x31);
-      _mm256_store_si256(blockA_256++, L_AD24);
-    }
-  }
-}
-
-// Arrange a block of the right input matrix in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0 E0 F0 G0 H0 ...
-// A1 B1 C1 D1 E1 F1 G1 H1 ...
-// A2 B2 C2 D2 E2 F2 G2 H2 ...
-// A3 B3 C3 D3 E3 F3 G3 H3 ...
-// A4 B4 C4 D4 E4 F4 G4 H4 ...
-// A5 B5 C5 D5 E5 F5 G5 H5 ...
-// A6 B6 C6 D6 E6 F6 G6 H6 ...
-// A7 B7 C7 D7 E7 F7 G7 H7 ...
-// A8 ...
-// ...
-// Packing yields row major output (A0 beside A1 in memory):
-// A0 A1 A2 A3 A4 A5 A6 A7
-// B0 B1 B2 B3 B4 B5 B6 B7
-// ...
-//
-// At least two elements of the same col are arranged contiguously because
-// maddubs and madd both perform an adjacent addition in the kernel.  We can
-// save work by leaving 4 adjacent elements because kr = 4.
-// The purpose is to collect n cols of size k.  Two elements of the same
-// col are arranged contiguously because madd performs an adjacent addition
-// in the kernel.
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
-          bool PanelMode>
-struct gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor, Conjugate,
-                     PanelMode> {
-  EIGEN_DONT_INLINE void operator()(QInt16* blockB, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0);
-};
-
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
-          bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor,
-                                     Conjugate, PanelMode>::
-operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols,
-           Index stride, Index offset) {
-  eigen_assert(stride == 0);
-  eigen_assert(offset == 0);
-
-  typedef typename packet_traits<QInt16>::type Packet;
-
-  // Use alternate function for weird sizes
-  if (cols % 16 != 0 || depth % 16 != 0) {
-    assert(false &&
-           "only depths and cols that are a multiple of 16 are currently "
-           "supported");
-    // gemm_pack_rhs_any<QInt16, Index, DataMapper, nr, ColMajor, Conjugate,
-    // PanelMode> rhs_pack;
-    // return rhs_pack(blockB, rhs, depth, cols, stride, offset);
-  }
-
-  // Get vector pointer
-  __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
-
-  // Perform a step of the packing for 4 columns
-  __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_4, R_AD_8, R_AD_12;
-#define PACK_STEP                                            \
-  R_AB_L = _mm256_unpacklo_epi64(R_A, R_B);                  \
-  R_CD_L = _mm256_unpacklo_epi64(R_C, R_D);                  \
-  R_AB_H = _mm256_unpackhi_epi64(R_A, R_B);                  \
-  R_CD_H = _mm256_unpackhi_epi64(R_C, R_D);                  \
-  R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20);  \
-  R_AD_8 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31);  \
-  R_AD_4 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20);  \
-  R_AD_12 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
-  _mm256_store_si256(blockB_256, R_AD_0);                    \
-  _mm256_store_si256(blockB_256 + 4, R_AD_4);                \
-  _mm256_store_si256(blockB_256 + 8, R_AD_8);                \
-  _mm256_store_si256(blockB_256 + 12, R_AD_12);              \
-  blockB_256++;
-
-  // Pack cols in sets of 16
-  for (Index n = 0; n < cols; n += 16) {
-    // Pack depth in sets of 16
-    for (Index k = 0; k < depth; k += 16) {
-      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
-      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
-      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
-      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 4);
-      R_B = rhs.template loadPacket<Packet>(k, n + 5);
-      R_C = rhs.template loadPacket<Packet>(k, n + 6);
-      R_D = rhs.template loadPacket<Packet>(k, n + 7);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 8);
-      R_B = rhs.template loadPacket<Packet>(k, n + 9);
-      R_C = rhs.template loadPacket<Packet>(k, n + 10);
-      R_D = rhs.template loadPacket<Packet>(k, n + 11);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 12);
-      R_B = rhs.template loadPacket<Packet>(k, n + 13);
-      R_C = rhs.template loadPacket<Packet>(k, n + 14);
-      R_D = rhs.template loadPacket<Packet>(k, n + 15);
-      PACK_STEP;
-
-      blockB_256 += 12;
-    }
-  }
-#undef PACK_STEP
-}
-
-// Perform the actual multiplication on packed inputs
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
-                   ConjugateRhs> {
-  typedef typename DataMapper::LinearMapper LinearMapper;
-
-  EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt16* blockA,
-                  const QInt16* blockB, Index rows, Index depth, Index cols,
-                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
-                  Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr,
-                                   ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
-           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
-           Index strideB, Index offsetA, Index offsetB) {
-  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  eigen_assert(alpha.value == 1);
-  eigen_assert(strideA == -1);
-  eigen_assert(strideB == -1);
-  eigen_assert(offsetA == 0);
-  eigen_assert(offsetB == 0);
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-  eigen_assert(depth > 0);
-  eigen_assert(blockA);
-  eigen_assert(blockB);
-
-  // Use alternate function for weird sizes
-  if (rows % 16 != 0 || cols % 16 != 0 || depth % 16 != 0) {
-    assert(false &&
-           "only depths, cols and rows that are a multiple of 16 are currently "
-           "supported");
-    // gebp_kernel_any<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
-    // ConjugateRhs> gebp;
-    // return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA,
-    // strideB, offsetA, offsetB);
-  }
-
-  // Create result block
-  QInt32* blockO = aligned_new<QInt32>(16 * 16);
-  memset(blockO, 0, 16 * 16 * sizeof(QInt32));
-
-  // Get vectorized pointers
-  __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
-  const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
-  const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
-
-  // Loop over blocks of 16 columns
-  for (Index n = 0; n < cols; n += 16) {
-    // Reset index into blockA
-    Index indexL = 0;
-    // Loop over blocks of 16 rows
-    for (Index m = 0; m < rows; m += 16) {
-      // Reset index into blockB
-      Index indexR = n / 16 * depth;
-      // Loop over blocks of 4 on depth
-      for (Index k = 0; k < depth; k += 4) {
-        // Load inputs
-        __m256i L_AD0 = blockA_256[indexL++];
-        __m256i L_AD8 = blockA_256[indexL++];
-        __m256i L_EH0 = blockA_256[indexL++];
-        __m256i L_EH8 = blockA_256[indexL++];
-
-        __m256i R_AH0 = blockB_256[indexR++];
-        __m256i R_AH4 = blockB_256[indexR++];
-        __m256i R_AH8 = blockB_256[indexR++];
-        __m256i R_AH12 = blockB_256[indexR++];
-
-        // Declare variables used in COMPUTE_STEP
-        __m256i P_32_A, P_32_B, P_32;
-
-#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET)                         \
-  P_32_A = _mm256_madd_epi16(R_INPUT_A, L_AD0);                            \
-  P_32_B = _mm256_madd_epi16(R_INPUT_B, L_AD8);                            \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                 \
-  _mm256_store_si256(                                                      \
-      blockO_256 + 2 * OFFSET,                                             \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 2 * OFFSET), P_32)); \
-                                                                           \
-  P_32_A = _mm256_madd_epi16(R_INPUT_A, L_EH0);                            \
-  P_32_B = _mm256_madd_epi16(R_INPUT_B, L_EH8);                            \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                 \
-  _mm256_store_si256(                                                      \
-      blockO_256 + 2 * OFFSET + 1,                                         \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 2 * OFFSET + 1), P_32));
-
-        // Permute and shuffle to copy a single value across the entire vector
-        // Then compute the multiplication
-        // Replicate lower 128-bits of R_AH0 across both lanes
-        __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
-        // Copy first two elements of R_AH0 across entire vector
-        __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        // Copy second two elements of R_AH0 across entire vector
-        __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-
-        COMPUTE_STEP(R_AD0, R_EH0, 0);
-        __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 1);
-
-        // Replicate upper 128-bits of R_AH0 across both lanes
-        R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
-        __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 2);
-        __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 3);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 4);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 5);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 6);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 7);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 8);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 9);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 10);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 11);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 12);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 13);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 14);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 15);
-
-#undef COMPUTE_STEP
-      }
-
-      // Transfer the results to the result matrix
-      Index i = 0;
-      for (Index j = n; j < n + 16; j++) {
-        LinearMapper r0 = res.getLinearMapper(m, j);
-        LinearMapper r1 = res.getLinearMapper(m + 8, j);
-        typedef typename packet_traits<QInt32>::type Packet;
-        r0.template storePacket<Packet>(
-            0, _mm256_add_epi32(blockO_256[i++],
-                                r0.template loadPacket<Packet>(0)));
-        r1.template storePacket<Packet>(
-            0, _mm256_add_epi32(blockO_256[i++],
-                                r1.template loadPacket<Packet>(0)));
-      }
-
-      // Zero the result block so it can be reused
-      memset(blockO, 0, 16 * 16 * sizeof(QInt32));
-    }
-  }
-  aligned_delete(blockO, 16 * 16);
-}
-
-#endif
-
-// AVX2 optimized implementation of Mat-Mat product.
-// LHS is encoded using signed 8-bit integers.
-// RHS is encoded using unsigned 8-bit integers.
-#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-
-// Define quantized traits
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
- public:
-  typedef QInt8 LhsScalar;
-  typedef QUInt8 RhsScalar;
-  typedef QInt32 ResScalar;
-
-  typedef typename packet_traits<LhsScalar>::type LhsPacket;
-  typedef LhsPacket LhsPacket4Packing;
-
-  enum {
-    // Define register blocking scheme.
-    nr = 32,
-    mr = 32,
-    kr = 8,
-    // Ignore progress tracking per loop iteration.
-    LhsProgress = -1,
-    RhsProgress = -1
-  };
-};
-
-// Specialized blocking for quantized implementations.
-// Used by TensorContractionThreadPool, inputs must have dimensions that are
-// multiples of 32.
-template <typename ResScalar, typename Index, typename LeftTensor,
-          typename left_nocontract_t, typename left_contract_t,
-          bool left_inner_dim_contiguous, bool left_inner_dim_reordered,
-          int LeftAlignment, typename RightTensor, typename right_nocontract_t,
-          typename right_contract_t, bool right_inner_dim_contiguous,
-          bool right_inner_dim_reordered, int RightAlignment, int ShardingType>
-class TensorContractionBlocking<
-    ResScalar,
-    TensorContractionInputMapper<
-        QInt8, Index, Lhs, LeftTensor, left_nocontract_t, left_contract_t, 32,
-        left_inner_dim_contiguous, left_inner_dim_reordered, LeftAlignment>,
-    TensorContractionInputMapper<QUInt8, Index, Rhs, RightTensor,
-                                 right_nocontract_t, right_contract_t, 32,
-                                 right_inner_dim_contiguous,
-                                 right_inner_dim_reordered, RightAlignment>,
-    Index, ShardingType> {
- public:
-  typedef QInt8 LhsScalar;
-  typedef QUInt8 RhsScalar;
-
-  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
-      : kc_(k), mc_(m), nc_(n) {
-    eigen_assert(m % 32 == 0);
-    eigen_assert(k % 32 == 0);
-    if (!k || !m || !n) {
-      return;
-    }
-
-    if (ShardingType == ShardByCol) {
-      eigen_assert(n % 32 == 0);
-      nc_ = (((n / num_threads) + 31) / 32) * 32;
-    } else {
-      eigen_assert(n % 32 == 0 || n == 1);
-      // Special case to avoid breaking the unimplemented matrix-vector case
-      if (n == 1) {
-        nc_ = 32;
-      }
-      mc_ = (((m / num_threads) + 31) / 32) * 32;
-    }
-  }
-
-  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-
- private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
-};
-
-// Specialized blocking for quantized implementations.
-// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to
-// multiples of 32.
-template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<ColMajor, QInt8, QInt8, MaxRows, MaxCols, MaxDepth,
-                          KcFactor, false>
-    : public level3_blocking<QInt8, QInt8> {
-  DenseIndex m_sizeA;
-  DenseIndex m_sizeB;
-
- public:
-  gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
-                      DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
-    this->m_mc = ((rows + 31) / 32) * 32;
-    this->m_nc = ((cols + 31) / 32) * 32;
-    this->m_kc = ((depth + 31) / 32) * 32;
-    m_sizeA = this->m_mc * this->m_kc;
-    m_sizeB = this->m_kc * this->m_nc;
-  }
-  void allocateA() {
-    if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA);
-  }
-  void allocateB() {
-    if (this->m_blockB == 0) this->m_blockB = aligned_new<QInt8>(m_sizeB);
-  }
-  void allocateAll() {
-    allocateA();
-    allocateB();
-  }
-  ~gemm_blocking_space() {
-    aligned_delete(this->m_blockA, m_sizeA);
-    aligned_delete(this->m_blockB, m_sizeB);
-  }
-};
-
-template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth,
-                          KcFactor, false>
-    : public level3_blocking<QInt8, QUInt8> {
-  DenseIndex m_sizeA;
-  DenseIndex m_sizeB;
-
- public:
-  gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
-                      DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
-    this->m_mc = ((rows + 31) / 32) * 32;
-    this->m_nc = ((cols + 31) / 32) * 32;
-    this->m_kc = ((depth + 31) / 32) * 32;
-    m_sizeA = this->m_mc * this->m_kc;
-    m_sizeB = this->m_kc * this->m_nc;
-  }
-  void allocateA() {
-    if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt8>(m_sizeA);
-  }
-  void allocateB() {
-    if (this->m_blockB == 0) this->m_blockB = aligned_new<QUInt8>(m_sizeB);
-  }
-  void allocateAll() {
-    allocateA();
-    allocateB();
-  }
-  ~gemm_blocking_space() {
-    aligned_delete(this->m_blockA, m_sizeA);
-    aligned_delete(this->m_blockB, m_sizeB);
-  }
-};
-
-// Alternate templates for any input sizes
-template <typename Scalar, typename Index, typename DataMapper, int Pack1,
-          int Pack2, int StorageOrder, bool Conjugate = false,
-          bool PanelMode = false>
-struct gemm_pack_lhs_any;
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
-          bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
-                         Conjugate, PanelMode> {
-  EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
-                                    Index depth, Index rows, Index stride = 0,
-                                    Index offset = 0);
-};
-
-template <typename Scalar, typename Index, typename DataMapper, int nr,
-          int StorageOrder, bool Conjugate = false, bool PanelMode = false>
-struct gemm_pack_rhs_any;
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
-          bool PanelMode>
-struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
-                         PanelMode> {
-  EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0);
-};
-
-template <typename LhsScalar, typename RhsScalar, typename Index,
-          typename DataMapper, int mr, int nr, bool ConjugateLhs = false,
-          bool ConjugateRhs = false>
-struct gebp_kernel_any;
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
-                       ConjugateRhs> {
-  typedef typename DataMapper::LinearMapper LinearMapper;
-
-  EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA,
-                  const QUInt8* blockB, Index rows, Index depth, Index cols,
-                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
-                  Index offsetA = 0, Index offsetB = 0);
-};
-
-// Alternate implementations for any input sizes
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
-          bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2,
-                                         ColMajor, Conjugate, PanelMode>::
-operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
-           Index stride, Index offset) {
-  eigen_assert(stride == 0);
-  eigen_assert(offset == 0);
-
-  typedef typename packet_traits<QInt8>::type Packet;
-
-  // Get vector pointer
-  __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
-
-  // Get even multiples of the dimensions
-  Index rows_32 = (rows / 32) * 32;
-  Index depth_8 = (depth / 8) * 8;
-
-  // Get padding for when depth is not a multiple of 32
-  int padding = 0;
-  if (depth % 32 != 0) {
-    int depth_32 = (depth / 32) * 32;
-    int extra_depth = depth - depth_32;
-    int extra_depth_8 = ((extra_depth + 7) / 8) * 8;
-    padding = 32 - extra_depth_8;
-  }
-
-  // Pack rows in sets of 32
-  for (Index m = 0; m < rows_32; m += 32) {
-    // Pack depth in sets of 8
-    for (Index k = 0; k < depth_8; k += 8) {
-      // Load vectors
-      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
-      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
-
-      // Interleave 8-bit elements
-      __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
-      __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
-
-      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
-      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
-      __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
-      __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
-      // Interleave 16-bit elements
-      __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
-      __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
-      // Use permute before we store to cross 128-bit lanes
-      __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD0);
-
-      // Complete packing for 32 x 8 block
-      __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
-      __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD8);
-      _mm256_store_si256(blockA_256++, L_AD16);
-      __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
-      _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_E = lhs.template loadPacket<Packet>(m, k + 4);
-      __m256i L_F = lhs.template loadPacket<Packet>(m, k + 5);
-      __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
-      __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_G = lhs.template loadPacket<Packet>(m, k + 6);
-      __m256i L_H = lhs.template loadPacket<Packet>(m, k + 7);
-      __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
-      __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
-      __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH0);
-      __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
-      __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH8);
-      _mm256_store_si256(blockA_256++, L_EH16);
-      __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
-      _mm256_store_si256(blockA_256++, L_EH24);
-    }
-
-    // Finish the k dimension, padding with zeros
-    if (depth_8 < depth) {
-      __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
-      switch (depth - depth_8) {
-        case 1:
-          L_A = lhs.template loadPacket<Packet>(m, depth_8);
-          L_B = _mm256_setzero_si256();
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          break;
-        case 2:
-          L_A = lhs.template loadPacket<Packet>(m, depth_8);
-          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          break;
-        case 3:
-          L_A = lhs.template loadPacket<Packet>(m, depth_8);
-          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
-          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          break;
-        case 4:
-          L_A = lhs.template loadPacket<Packet>(m, depth_8);
-          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
-          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
-          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          break;
-        case 5:
-          L_A = lhs.template loadPacket<Packet>(m, depth_8);
-          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
-          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
-          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
-          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          break;
-        case 6:
-          L_A = lhs.template loadPacket<Packet>(m, depth_8);
-          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
-          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
-          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
-          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
-          L_F = lhs.template loadPacket<Packet>(m, depth_8 + 5);
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          break;
-        case 7:
-          L_A = lhs.template loadPacket<Packet>(m, depth_8);
-          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
-          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
-          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
-          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
-          L_F = lhs.template loadPacket<Packet>(m, depth_8 + 5);
-          L_G = lhs.template loadPacket<Packet>(m, depth_8 + 6);
-          L_H = _mm256_setzero_si256();
-          break;
-      }
-
-      // Interleave 8-bit elements
-      __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
-      __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
-
-      __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
-      __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
-      // Interleave 16-bit elements
-      __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
-      __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
-      // Use permute before we store to cross 128-bit lanes
-      __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD0);
-
-      // Complete packing
-      __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
-      __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD8);
-      _mm256_store_si256(blockA_256++, L_AD16);
-      __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
-      _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
-      __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
-      __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
-      __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH0);
-      __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
-      __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH8);
-      _mm256_store_si256(blockA_256++, L_EH16);
-      __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
-      _mm256_store_si256(blockA_256++, L_EH24);
-    }
-    blockA_256 += padding;
-  }
-
-  // Finish the m dimension, padding with zeros
-  if (rows_32 < rows) {
-    // Pack depth in sets of 8
-    for (Index k = 0; k < depth_8; k += 8) {
-      // Load vectors
-      __m256i L_A = _mm256_setzero_si256();
-      __m256i L_B = _mm256_setzero_si256();
-      __m256i L_C = _mm256_setzero_si256();
-      __m256i L_D = _mm256_setzero_si256();
-      __m256i L_E = _mm256_setzero_si256();
-      __m256i L_F = _mm256_setzero_si256();
-      __m256i L_G = _mm256_setzero_si256();
-      __m256i L_H = _mm256_setzero_si256();
-      for (Index m = 0; m < rows - rows_32; m++) {
-        QInt8* ptr = (QInt8*)&L_A;
-        ptr[m] = lhs(rows_32 + m, k);
-        ptr = (QInt8*)&L_B;
-        ptr[m] = lhs(rows_32 + m, k + 1);
-        ptr = (QInt8*)&L_C;
-        ptr[m] = lhs(rows_32 + m, k + 2);
-        ptr = (QInt8*)&L_D;
-        ptr[m] = lhs(rows_32 + m, k + 3);
-        ptr = (QInt8*)&L_E;
-        ptr[m] = lhs(rows_32 + m, k + 4);
-        ptr = (QInt8*)&L_F;
-        ptr[m] = lhs(rows_32 + m, k + 5);
-        ptr = (QInt8*)&L_G;
-        ptr[m] = lhs(rows_32 + m, k + 6);
-        ptr = (QInt8*)&L_H;
-        ptr[m] = lhs(rows_32 + m, k + 7);
-      }
-
-      // Interleave 8-bit elements
-      __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
-      __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
-      __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
-      __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
-      // Interleave 16-bit elements
-      __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
-      __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
-      // Use permute before we store to cross 128-bit lanes
-      __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD0);
-
-      // Complete packing for 32 x 8 block
-      __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
-      __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD8);
-      _mm256_store_si256(blockA_256++, L_AD16);
-      __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
-      _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
-      __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
-      __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
-      __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH0);
-      __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
-      __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH8);
-      _mm256_store_si256(blockA_256++, L_EH16);
-      __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
-      _mm256_store_si256(blockA_256++, L_EH24);
-    }
-
-    // Finish the k dimension, padding with zeros
-    if (depth_8 < depth) {
-      __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
-      QInt8* ptr;
-      switch (depth - depth_8) {
-        case 1:
-          L_A = _mm256_setzero_si256();
-          L_B = _mm256_setzero_si256();
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          for (Index m = 0; m < rows - rows_32; m++) {
-            QInt8* ptr = (QInt8*)&L_A;
-            ptr[m] = lhs(rows_32 + m, depth_8);
-          }
-          break;
-        case 2:
-          L_A = _mm256_setzero_si256();
-          L_B = _mm256_setzero_si256();
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          for (Index m = 0; m < rows - rows_32; m++) {
-            ptr = (QInt8*)&L_A;
-            ptr[m] = lhs(rows_32 + m, depth_8);
-            ptr = (QInt8*)&L_B;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          }
-          break;
-        case 3:
-          L_A = _mm256_setzero_si256();
-          L_B = _mm256_setzero_si256();
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          for (Index m = 0; m < rows - rows_32; m++) {
-            ptr = (QInt8*)&L_A;
-            ptr[m] = lhs(rows_32 + m, depth_8);
-            ptr = (QInt8*)&L_B;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-            ptr = (QInt8*)&L_C;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          }
-          break;
-        case 4:
-          L_A = _mm256_setzero_si256();
-          L_B = _mm256_setzero_si256();
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          for (Index m = 0; m < rows - rows_32; m++) {
-            ptr = (QInt8*)&L_A;
-            ptr[m] = lhs(rows_32 + m, depth_8);
-            ptr = (QInt8*)&L_B;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-            ptr = (QInt8*)&L_C;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-            ptr = (QInt8*)&L_D;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-          }
-          break;
-        case 5:
-          L_A = _mm256_setzero_si256();
-          L_B = _mm256_setzero_si256();
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          for (Index m = 0; m < rows - rows_32; m++) {
-            ptr = (QInt8*)&L_A;
-            ptr[m] = lhs(rows_32 + m, depth_8);
-            ptr = (QInt8*)&L_B;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-            ptr = (QInt8*)&L_C;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-            ptr = (QInt8*)&L_D;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-            ptr = (QInt8*)&L_E;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-          }
-          break;
-        case 6:
-          L_A = _mm256_setzero_si256();
-          L_B = _mm256_setzero_si256();
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          for (Index m = 0; m < rows - rows_32; m++) {
-            ptr = (QInt8*)&L_A;
-            ptr[m] = lhs(rows_32 + m, depth_8);
-            ptr = (QInt8*)&L_B;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-            ptr = (QInt8*)&L_C;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-            ptr = (QInt8*)&L_D;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-            ptr = (QInt8*)&L_E;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-            ptr = (QInt8*)&L_F;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 5);
-          }
-          break;
-        case 7:
-          L_A = _mm256_setzero_si256();
-          L_B = _mm256_setzero_si256();
-          L_C = _mm256_setzero_si256();
-          L_D = _mm256_setzero_si256();
-          L_E = _mm256_setzero_si256();
-          L_F = _mm256_setzero_si256();
-          L_G = _mm256_setzero_si256();
-          L_H = _mm256_setzero_si256();
-          for (Index m = 0; m < rows - rows_32; m++) {
-            ptr = (QInt8*)&L_A;
-            ptr[m] = lhs(rows_32 + m, depth_8);
-            ptr = (QInt8*)&L_B;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-            ptr = (QInt8*)&L_C;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-            ptr = (QInt8*)&L_D;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-            ptr = (QInt8*)&L_E;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-            ptr = (QInt8*)&L_F;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 5);
-            ptr = (QInt8*)&L_G;
-            ptr[m] = lhs(rows_32 + m, depth_8 + 6);
-          }
-          break;
-      }
-
-      // Interleave 8-bit elements
-      __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
-      __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
-      __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
-      __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
-      // Interleave 16-bit elements
-      __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
-      __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
-      // Use permute before we store to cross 128-bit lanes
-      __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD0);
-
-      // Complete packing
-      __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
-      __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD8);
-      _mm256_store_si256(blockA_256++, L_AD16);
-      __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
-      _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
-      __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
-      __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
-      __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH0);
-      __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
-      __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH8);
-      _mm256_store_si256(blockA_256++, L_EH16);
-      __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
-      _mm256_store_si256(blockA_256++, L_EH24);
-    }
-  }
-}
-
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
-          bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr,
-                                         ColMajor, Conjugate, PanelMode>::
-operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
-           Index stride, Index offset) {
-  eigen_assert(stride == 0);
-  eigen_assert(offset == 0);
-
-  typedef typename packet_traits<QUInt8>::type Packet;
-
-  // Get vector pointer
-  __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
-
-  // Get even multiples of the dimensions
-  Index cols_32 = (cols / 32) * 32;
-  Index depth_32 = (depth / 32) * 32;
-
-  // Perform a step of the packing for 4 columns
-  __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24;
-#define PACK_STEP                                            \
-  R_AB_L = _mm256_unpacklo_epi64(R_A, R_B);                  \
-  R_CD_L = _mm256_unpacklo_epi64(R_C, R_D);                  \
-  R_AB_H = _mm256_unpackhi_epi64(R_A, R_B);                  \
-  R_CD_H = _mm256_unpackhi_epi64(R_C, R_D);                  \
-  R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20);  \
-  R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \
-  R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20);  \
-  R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
-  _mm256_store_si256(blockB_256, R_AD_0);                    \
-  _mm256_store_si256(blockB_256 + 8, R_AD_8);                \
-  _mm256_store_si256(blockB_256 + 16, R_AD_16);              \
-  _mm256_store_si256(blockB_256 + 24, R_AD_24);              \
-  blockB_256++;
-
-  // Pack cols in sets of 32
-  for (Index n = 0; n < cols_32; n += 32) {
-    // Pack depth in sets of 32
-    for (Index k = 0; k < depth_32; k += 32) {
-      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
-      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
-      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
-      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 4);
-      R_B = rhs.template loadPacket<Packet>(k, n + 5);
-      R_C = rhs.template loadPacket<Packet>(k, n + 6);
-      R_D = rhs.template loadPacket<Packet>(k, n + 7);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 8);
-      R_B = rhs.template loadPacket<Packet>(k, n + 9);
-      R_C = rhs.template loadPacket<Packet>(k, n + 10);
-      R_D = rhs.template loadPacket<Packet>(k, n + 11);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 12);
-      R_B = rhs.template loadPacket<Packet>(k, n + 13);
-      R_C = rhs.template loadPacket<Packet>(k, n + 14);
-      R_D = rhs.template loadPacket<Packet>(k, n + 15);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 16);
-      R_B = rhs.template loadPacket<Packet>(k, n + 17);
-      R_C = rhs.template loadPacket<Packet>(k, n + 18);
-      R_D = rhs.template loadPacket<Packet>(k, n + 19);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 20);
-      R_B = rhs.template loadPacket<Packet>(k, n + 21);
-      R_C = rhs.template loadPacket<Packet>(k, n + 22);
-      R_D = rhs.template loadPacket<Packet>(k, n + 23);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 24);
-      R_B = rhs.template loadPacket<Packet>(k, n + 25);
-      R_C = rhs.template loadPacket<Packet>(k, n + 26);
-      R_D = rhs.template loadPacket<Packet>(k, n + 27);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 28);
-      R_B = rhs.template loadPacket<Packet>(k, n + 29);
-      R_C = rhs.template loadPacket<Packet>(k, n + 30);
-      R_D = rhs.template loadPacket<Packet>(k, n + 31);
-      PACK_STEP;
-
-      blockB_256 += 24;
-    }
-
-    if (depth_32 < depth) {
-      QUInt8* ptr;
-      __m256i R_A = _mm256_setzero_si256();
-      __m256i R_B = _mm256_setzero_si256();
-      __m256i R_C = _mm256_setzero_si256();
-      __m256i R_D = _mm256_setzero_si256();
-      for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*)&R_A;
-        ptr[k - depth_32] = rhs(k, n);
-        ptr = (QUInt8*)&R_B;
-        ptr[k - depth_32] = rhs(k, n + 1);
-        ptr = (QUInt8*)&R_C;
-        ptr[k - depth_32] = rhs(k, n + 2);
-        ptr = (QUInt8*)&R_D;
-        ptr[k - depth_32] = rhs(k, n + 3);
-      }
-      PACK_STEP;
-
-      R_A = _mm256_setzero_si256();
-      R_B = _mm256_setzero_si256();
-      R_C = _mm256_setzero_si256();
-      R_D = _mm256_setzero_si256();
-      for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*)&R_A;
-        ptr[k - depth_32] = rhs(k, n + 4);
-        ptr = (QUInt8*)&R_B;
-        ptr[k - depth_32] = rhs(k, n + 5);
-        ptr = (QUInt8*)&R_C;
-        ptr[k - depth_32] = rhs(k, n + 6);
-        ptr = (QUInt8*)&R_D;
-        ptr[k - depth_32] = rhs(k, n + 7);
-      }
-      PACK_STEP;
-
-      R_A = _mm256_setzero_si256();
-      R_B = _mm256_setzero_si256();
-      R_C = _mm256_setzero_si256();
-      R_D = _mm256_setzero_si256();
-      for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*)&R_A;
-        ptr[k - depth_32] = rhs(k, n + 8);
-        ptr = (QUInt8*)&R_B;
-        ptr[k - depth_32] = rhs(k, n + 9);
-        ptr = (QUInt8*)&R_C;
-        ptr[k - depth_32] = rhs(k, n + 10);
-        ptr = (QUInt8*)&R_D;
-        ptr[k - depth_32] = rhs(k, n + 11);
-      }
-      PACK_STEP;
-
-      R_A = _mm256_setzero_si256();
-      R_B = _mm256_setzero_si256();
-      R_C = _mm256_setzero_si256();
-      R_D = _mm256_setzero_si256();
-      for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*)&R_A;
-        ptr[k - depth_32] = rhs(k, n + 12);
-        ptr = (QUInt8*)&R_B;
-        ptr[k - depth_32] = rhs(k, n + 13);
-        ptr = (QUInt8*)&R_C;
-        ptr[k - depth_32] = rhs(k, n + 14);
-        ptr = (QUInt8*)&R_D;
-        ptr[k - depth_32] = rhs(k, n + 15);
-      }
-      PACK_STEP;
-
-      R_A = _mm256_setzero_si256();
-      R_B = _mm256_setzero_si256();
-      R_C = _mm256_setzero_si256();
-      R_D = _mm256_setzero_si256();
-      for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*)&R_A;
-        ptr[k - depth_32] = rhs(k, n + 16);
-        ptr = (QUInt8*)&R_B;
-        ptr[k - depth_32] = rhs(k, n + 17);
-        ptr = (QUInt8*)&R_C;
-        ptr[k - depth_32] = rhs(k, n + 18);
-        ptr = (QUInt8*)&R_D;
-        ptr[k - depth_32] = rhs(k, n + 19);
-      }
-      PACK_STEP;
-
-      R_A = _mm256_setzero_si256();
-      R_B = _mm256_setzero_si256();
-      R_C = _mm256_setzero_si256();
-      R_D = _mm256_setzero_si256();
-      for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*)&R_A;
-        ptr[k - depth_32] = rhs(k, n + 20);
-        ptr = (QUInt8*)&R_B;
-        ptr[k - depth_32] = rhs(k, n + 21);
-        ptr = (QUInt8*)&R_C;
-        ptr[k - depth_32] = rhs(k, n + 22);
-        ptr = (QUInt8*)&R_D;
-        ptr[k - depth_32] = rhs(k, n + 23);
-      }
-      PACK_STEP;
-
-      R_A = _mm256_setzero_si256();
-      R_B = _mm256_setzero_si256();
-      R_C = _mm256_setzero_si256();
-      R_D = _mm256_setzero_si256();
-      for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*)&R_A;
-        ptr[k - depth_32] = rhs(k, n + 24);
-        ptr = (QUInt8*)&R_B;
-        ptr[k - depth_32] = rhs(k, n + 25);
-        ptr = (QUInt8*)&R_C;
-        ptr[k - depth_32] = rhs(k, n + 26);
-        ptr = (QUInt8*)&R_D;
-        ptr[k - depth_32] = rhs(k, n + 27);
-      }
-      PACK_STEP;
-
-      R_A = _mm256_setzero_si256();
-      R_B = _mm256_setzero_si256();
-      R_C = _mm256_setzero_si256();
-      R_D = _mm256_setzero_si256();
-      for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*)&R_A;
-        ptr[k - depth_32] = rhs(k, n + 28);
-        ptr = (QUInt8*)&R_B;
-        ptr[k - depth_32] = rhs(k, n + 29);
-        ptr = (QUInt8*)&R_C;
-        ptr[k - depth_32] = rhs(k, n + 30);
-        ptr = (QUInt8*)&R_D;
-        ptr[k - depth_32] = rhs(k, n + 31);
-      }
-      PACK_STEP;
-      blockB_256 += 24;
-    }
-  }
-
-  // Finish packing cols
-  if (cols_32 < cols) {
-    // Pack depth in sets of 32
-    for (Index k = 0; k < depth_32; k += 32) {
-      __m256i R_A, R_B, R_C, R_D;
-      Index n;
-      for (n = cols_32; n < cols; n += 4) {
-        switch (cols - n) {
-          case 1:
-            R_A = rhs.template loadPacket<Packet>(k, n);
-            R_B = _mm256_setzero_si256();
-            R_C = _mm256_setzero_si256();
-            R_D = _mm256_setzero_si256();
-            PACK_STEP;
-            break;
-          case 2:
-            R_A = rhs.template loadPacket<Packet>(k, n);
-            R_B = rhs.template loadPacket<Packet>(k, n + 1);
-            R_C = _mm256_setzero_si256();
-            R_D = _mm256_setzero_si256();
-            PACK_STEP;
-            break;
-          case 3:
-            R_A = rhs.template loadPacket<Packet>(k, n);
-            R_B = rhs.template loadPacket<Packet>(k, n + 1);
-            R_C = rhs.template loadPacket<Packet>(k, n + 2);
-            R_D = _mm256_setzero_si256();
-            PACK_STEP;
-            break;
-          default:
-            R_A = rhs.template loadPacket<Packet>(k, n);
-            R_B = rhs.template loadPacket<Packet>(k, n + 1);
-            R_C = rhs.template loadPacket<Packet>(k, n + 2);
-            R_D = rhs.template loadPacket<Packet>(k, n + 3);
-            PACK_STEP;
-            break;
-        }
-      }
-
-      // Increment the block pointer.
-      // We must pad if cols is not a multiple of 32.
-      blockB_256 += 32 - (n - cols_32) / 4;
-    }
-
-    if (depth_32 < depth) {
-      for (Index n = cols_32; n < cols; n += 4) {
-        QUInt8* ptr;
-        __m256i R_A = _mm256_setzero_si256();
-        __m256i R_B = _mm256_setzero_si256();
-        __m256i R_C = _mm256_setzero_si256();
-        __m256i R_D = _mm256_setzero_si256();
-        switch (cols - n) {
-          case 1:
-            for (Index k = depth_32; k < depth; k++) {
-              ptr = (QUInt8*)&R_A;
-              ptr[k - depth_32] = rhs(k, n);
-            }
-            PACK_STEP;
-            break;
-          case 2:
-            for (Index k = depth_32; k < depth; k++) {
-              ptr = (QUInt8*)&R_A;
-              ptr[k - depth_32] = rhs(k, n);
-              ptr = (QUInt8*)&R_B;
-              ptr[k - depth_32] = rhs(k, n + 1);
-            }
-            PACK_STEP;
-            break;
-          case 3:
-            for (Index k = depth_32; k < depth; k++) {
-              ptr = (QUInt8*)&R_A;
-              ptr[k - depth_32] = rhs(k, n);
-              ptr = (QUInt8*)&R_B;
-              ptr[k - depth_32] = rhs(k, n + 1);
-              ptr = (QUInt8*)&R_C;
-              ptr[k - depth_32] = rhs(k, n + 2);
-            }
-            PACK_STEP;
-            break;
-          default:
-            for (Index k = depth_32; k < depth; k++) {
-              ptr = (QUInt8*)&R_A;
-              ptr[k - depth_32] = rhs(k, n);
-              ptr = (QUInt8*)&R_B;
-              ptr[k - depth_32] = rhs(k, n + 1);
-              ptr = (QUInt8*)&R_C;
-              ptr[k - depth_32] = rhs(k, n + 2);
-              ptr = (QUInt8*)&R_D;
-              ptr[k - depth_32] = rhs(k, n + 3);
-            }
-            PACK_STEP;
-            break;
-        }
-      }
-    }
-  }
-#undef PACK_STEP
-}
-
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr,
-                                       ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
-           Index strideB, Index offsetA, Index offsetB) {
-  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  eigen_assert(alpha.value == 1);
-  eigen_assert(strideA == -1);
-  eigen_assert(strideB == -1);
-  eigen_assert(offsetA == 0);
-  eigen_assert(offsetB == 0);
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-  eigen_assert(depth > 0);
-  eigen_assert(blockA);
-  eigen_assert(blockB);
-
-  Index rows_32 = ((rows + 31) / 32) * 32;
-  Index cols_32 = ((cols + 31) / 32) * 32;
-  Index depth_32 = ((depth + 31) / 32) * 32;
-
-  // Create result block
-  ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0);
-  memset(blockO, 0, 32 * 32 * sizeof(QInt32));
-
-  // Get vectorized pointers
-  __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
-  const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
-  const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
-
-  // Loop over blocks of 32 columns
-  for (Index n = 0; n < cols_32; n += 32) {
-    // Reset index into blockA
-    Index indexL = 0;
-    // Loop over blocks of 32 rows
-    for (Index m = 0; m < rows_32; m += 32) {
-      // Reset index into blockB
-      Index indexR = n / 32 * depth_32;
-      // Loop over blocks of 8 on depth
-      for (Index k = 0; k < depth_32; k += 8) {
-        // Load inputs
-        __m256i L_AD0 = blockA_256[indexL++];
-        __m256i L_AD8 = blockA_256[indexL++];
-        __m256i L_AD16 = blockA_256[indexL++];
-        __m256i L_AD24 = blockA_256[indexL++];
-        __m256i L_EH0 = blockA_256[indexL++];
-        __m256i L_EH8 = blockA_256[indexL++];
-        __m256i L_EH16 = blockA_256[indexL++];
-        __m256i L_EH24 = blockA_256[indexL++];
-        __m256i R_AH0 = blockB_256[indexR++];
-        __m256i R_AH4 = blockB_256[indexR++];
-        __m256i R_AH8 = blockB_256[indexR++];
-        __m256i R_AH12 = blockB_256[indexR++];
-        __m256i R_AH16 = blockB_256[indexR++];
-        __m256i R_AH20 = blockB_256[indexR++];
-        __m256i R_AH24 = blockB_256[indexR++];
-        __m256i R_AH28 = blockB_256[indexR++];
-
-        // This constant is used with madd to convert 16 bit to 32 bit
-        const __m256i ONE = _mm256_set1_epi32(0x00010001);
-
-        // Declare variables used in COMPUTE_STEP
-        __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32;
-
-#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET)                             \
-  P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0);                             \
-  P_32_A = _mm256_madd_epi16(P_16_A, ONE);                                     \
-  P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0);                             \
-  P_32_B = _mm256_madd_epi16(P_16_B, ONE);                                     \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                     \
-  _mm256_store_si256(                                                          \
-      blockO_256 + 4 * OFFSET,                                                 \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32));     \
-                                                                               \
-  P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8);                             \
-  P_32_A = _mm256_madd_epi16(P_16_A, ONE);                                     \
-  P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8);                             \
-  P_32_B = _mm256_madd_epi16(P_16_B, ONE);                                     \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                     \
-  _mm256_store_si256(                                                          \
-      blockO_256 + 4 * OFFSET + 1,                                             \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \
-                                                                               \
-  P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16);                            \
-  P_32_A = _mm256_madd_epi16(P_16_A, ONE);                                     \
-  P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16);                            \
-  P_32_B = _mm256_madd_epi16(P_16_B, ONE);                                     \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                     \
-  _mm256_store_si256(                                                          \
-      blockO_256 + 4 * OFFSET + 2,                                             \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \
-                                                                               \
-  P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24);                            \
-  P_32_A = _mm256_madd_epi16(P_16_A, ONE);                                     \
-  P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24);                            \
-  P_32_B = _mm256_madd_epi16(P_16_B, ONE);                                     \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                     \
-  _mm256_store_si256(                                                          \
-      blockO_256 + 4 * OFFSET + 3,                                             \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32));
-
-        // Permute and shuffle to copy a single value across the entire vector
-        // Then compute the multiplication
-        __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
-        __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 0);
-        __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 1);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
-        __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 2);
-        __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 3);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 4);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 5);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 6);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 7);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 8);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 9);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 10);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 11);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 12);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 13);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 14);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 15);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 16);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 17);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 18);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 19);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 20);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 21);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 22);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 23);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 24);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 25);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 26);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 27);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 28);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 29);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 30);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 31);
-
-#undef COMPUTE_STEP
-      }
-
-      // Transfer the results to the result matrix.
-      if (m + 32 <= rows && n + 32 <= cols) {
-        Index i = 0;
-        for (Index j = n; j < n + 32; j++) {
-          LinearMapper r0 = res.getLinearMapper(m, j);
-          LinearMapper r1 = res.getLinearMapper(m + 8, j);
-          LinearMapper r2 = res.getLinearMapper(m + 16, j);
-          LinearMapper r3 = res.getLinearMapper(m + 24, j);
-          typedef typename packet_traits<QInt32>::type Packet;
-          r0.template storePacket<Packet>(
-              0, _mm256_add_epi32(blockO_256[i++],
-                                  r0.template loadPacket<Packet>(0)));
-          r1.template storePacket<Packet>(
-              0, _mm256_add_epi32(blockO_256[i++],
-                                  r1.template loadPacket<Packet>(0)));
-          r2.template storePacket<Packet>(
-              0, _mm256_add_epi32(blockO_256[i++],
-                                  r2.template loadPacket<Packet>(0)));
-          r3.template storePacket<Packet>(
-              0, _mm256_add_epi32(blockO_256[i++],
-                                  r3.template loadPacket<Packet>(0)));
-        }
-      } else {
-        for (Index j = n; j < cols; j++) {
-          for (Index i = m; i < rows; i++) {
-            res(i, j) = blockO[(j - n) * 32 + (i - m)];
-          }
-        }
-      }
-
-      // Zero the result block so it can be reused
-      memset(blockO, 0, 32 * 32 * sizeof(QInt32));
-    }
-  }
-}
-
-// Below are the fully optimized versions that are correct only for sizes that
-// are multiple of 32.  It is about a 10% performance benefit to keep these
-// implementations separate.
-
-// Arrange a block of the left input matrix in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0 E0 F0 G0 H0 ...
-// A1 B1 C1 D1 E1 F1 G1 H1 ...
-// A2 B2 C2 D2 E2 F2 G2 H2 ...
-// A3 B3 C3 D3 E3 F3 G3 H3 ...
-// A4 B4 C4 D4 E4 F4 G4 H4 ...
-// A5 B5 C5 D5 E5 F5 G5 H5 ...
-// A6 B6 C6 D6 E6 F6 G6 H6 ...
-// A7 B7 C7 D7 E7 F7 G7 H7 ...
-// A8 ...
-// ...
-//
-// Packing yields output (A0 beside B0 in memory):
-// A0 B0 C0 D0
-// A1 B1 C1 D1
-// A2 B2 C2 D2
-// A3 B3 C3 D3
-// A4 B4 C4 D4
-// A5 B5 C5 D5
-// A6 B6 C6 D6
-// A7 B7 C7 D7
-// ...
-// A31 B31 C31 D31
-// E0 F0 G0 H0
-// E1 F1 G1 H1
-// E2 F2 G2 H2
-// E3 F3 G3 H3
-// E4 F4 G4 H4
-// E5 F5 G5 H5
-// E6 F6 G6 H6
-// E7 F7 G7 H7
-// ...
-//
-// Four elements of the same row are arranged contiguously because maddubs and
-// madd both perform an adjacent addition in the kernel.
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
-          bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, QInt8, ColMajor,
-                     Conjugate, PanelMode> {
-  EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
-                                    Index depth, Index rows, Index stride = 0,
-                                    Index offset = 0);
-};
-
-template <typename Index, typename DataMapper, int Pack1, int Pack2,
-          bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2,
-                                     QInt8, ColMajor, Conjugate, PanelMode>::
-operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
-           Index stride, Index offset) {
-  eigen_assert(stride == 0);
-  eigen_assert(offset == 0);
-
-  typedef typename packet_traits<QInt8>::type Packet;
-
-  // Use alternate function for weird sizes
-  if (rows % 32 != 0 || depth % 32 != 0) {
-    gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
-                      Conjugate, PanelMode> lhs_pack;
-    return lhs_pack(blockA, lhs, depth, rows, stride, offset);
-  }
-
-  // Get vector pointer
-  __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
-
-  // Pack rows in sets of 32
-  for (Index m = 0; m < rows; m += 32) {
-    // Pack depth in sets of 8
-    for (Index k = 0; k < depth; k += 8) {
-      // Load vectors
-      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
-      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
-
-      // Interleave 8-bit elements
-      __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
-      __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
-
-      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
-      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
-      __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
-      __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
-
-      // Interleave 16-bit elements
-      __m256i L_AD0_AD16 = _mm256_unpacklo_epi16(L_AB0_AB16, L_CD0_CD16);
-      __m256i L_AD4_AD20 = _mm256_unpackhi_epi16(L_AB0_AB16, L_CD0_CD16);
-
-      // Use permute before we store to cross 128-bit lanes
-      __m256i L_AD0 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD0);
-
-      // Complete packing for 32 x 8 block
-      __m256i L_AD16 = _mm256_permute2x128_si256(L_AD0_AD16, L_AD4_AD20, 0x31);
-      __m256i L_AD8_AD24 = _mm256_unpacklo_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD12_AD28 = _mm256_unpackhi_epi16(L_AB8_AB24, L_CD8_CD24);
-      __m256i L_AD8 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x20);
-      _mm256_store_si256(blockA_256++, L_AD8);
-      _mm256_store_si256(blockA_256++, L_AD16);
-      __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
-      _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_E = lhs.template loadPacket<Packet>(m, k + 4);
-      __m256i L_F = lhs.template loadPacket<Packet>(m, k + 5);
-      __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
-      __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_G = lhs.template loadPacket<Packet>(m, k + 6);
-      __m256i L_H = lhs.template loadPacket<Packet>(m, k + 7);
-      __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
-      __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
-      __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH4_EH20 = _mm256_unpackhi_epi16(L_EF0_EF16, L_GH0_GH16);
-      __m256i L_EH0 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH0);
-      __m256i L_EH16 = _mm256_permute2x128_si256(L_EH0_EH16, L_EH4_EH20, 0x31);
-      __m256i L_EH8_EH24 = _mm256_unpacklo_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH12_EH28 = _mm256_unpackhi_epi16(L_EF8_EF24, L_GH8_GH24);
-      __m256i L_EH8 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x20);
-      _mm256_store_si256(blockA_256++, L_EH8);
-      _mm256_store_si256(blockA_256++, L_EH16);
-      __m256i L_EH24 = _mm256_permute2x128_si256(L_EH8_EH24, L_EH12_EH28, 0x31);
-      _mm256_store_si256(blockA_256++, L_EH24);
-    }
-  }
-}
-
-// Arrange a block of the right input matrix in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0 E0 F0 G0 H0 ...
-// A1 B1 C1 D1 E1 F1 G1 H1 ...
-// A2 B2 C2 D2 E2 F2 G2 H2 ...
-// A3 B3 C3 D3 E3 F3 G3 H3 ...
-// A4 B4 C4 D4 E4 F4 G4 H4 ...
-// A5 B5 C5 D5 E5 F5 G5 H5 ...
-// A6 B6 C6 D6 E6 F6 G6 H6 ...
-// A7 B7 C7 D7 E7 F7 G7 H7 ...
-// A8 ...
-// ...
-//
-// Packing yields row major output (A0 beside A1 in memory):
-// A0 A1 A2 A3 A4 A5 A6 A7
-// B0 B1 B2 B3 B4 B5 B6 B7
-// ...
-//
-// At least four elements of the same col are arranged contiguously because
-// maddubs and madd both perform an adjacent addition in the kernel.  We can
-// save work by leaving 8 adjacent elements because kr = 8.
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
-          bool PanelMode>
-struct gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
-                     PanelMode> {
-  EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0);
-};
-
-template <typename Index, typename DataMapper, int nr, bool Conjugate,
-          bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<QUInt8, Index, DataMapper, nr, ColMajor,
-                                     Conjugate, PanelMode>::
-operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
-           Index stride, Index offset) {
-  eigen_assert(stride == 0);
-  eigen_assert(offset == 0);
-
-  typedef typename packet_traits<QUInt8>::type Packet;
-
-  // Use alternate function for weird sizes
-  if (cols % 32 != 0 || depth % 32 != 0) {
-    gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
-                      PanelMode> rhs_pack;
-    return rhs_pack(blockB, rhs, depth, cols, stride, offset);
-  }
-
-  // Get vector pointer
-  __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
-
-  // Perform a step of the packing for 4 columns
-  __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_8, R_AD_16, R_AD_24;
-#define PACK_STEP                                            \
-  R_AB_L = _mm256_unpacklo_epi64(R_A, R_B);                  \
-  R_CD_L = _mm256_unpacklo_epi64(R_C, R_D);                  \
-  R_AB_H = _mm256_unpackhi_epi64(R_A, R_B);                  \
-  R_CD_H = _mm256_unpackhi_epi64(R_C, R_D);                  \
-  R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20);  \
-  R_AD_16 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31); \
-  R_AD_8 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20);  \
-  R_AD_24 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
-  _mm256_store_si256(blockB_256, R_AD_0);                    \
-  _mm256_store_si256(blockB_256 + 8, R_AD_8);                \
-  _mm256_store_si256(blockB_256 + 16, R_AD_16);              \
-  _mm256_store_si256(blockB_256 + 24, R_AD_24);              \
-  blockB_256++;
-
-  // Pack cols in sets of 32
-  for (Index n = 0; n < cols; n += 32) {
-    // Pack depth in sets of 32
-    for (Index k = 0; k < depth; k += 32) {
-      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
-      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
-      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
-      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 4);
-      R_B = rhs.template loadPacket<Packet>(k, n + 5);
-      R_C = rhs.template loadPacket<Packet>(k, n + 6);
-      R_D = rhs.template loadPacket<Packet>(k, n + 7);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 8);
-      R_B = rhs.template loadPacket<Packet>(k, n + 9);
-      R_C = rhs.template loadPacket<Packet>(k, n + 10);
-      R_D = rhs.template loadPacket<Packet>(k, n + 11);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 12);
-      R_B = rhs.template loadPacket<Packet>(k, n + 13);
-      R_C = rhs.template loadPacket<Packet>(k, n + 14);
-      R_D = rhs.template loadPacket<Packet>(k, n + 15);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 16);
-      R_B = rhs.template loadPacket<Packet>(k, n + 17);
-      R_C = rhs.template loadPacket<Packet>(k, n + 18);
-      R_D = rhs.template loadPacket<Packet>(k, n + 19);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 20);
-      R_B = rhs.template loadPacket<Packet>(k, n + 21);
-      R_C = rhs.template loadPacket<Packet>(k, n + 22);
-      R_D = rhs.template loadPacket<Packet>(k, n + 23);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 24);
-      R_B = rhs.template loadPacket<Packet>(k, n + 25);
-      R_C = rhs.template loadPacket<Packet>(k, n + 26);
-      R_D = rhs.template loadPacket<Packet>(k, n + 27);
-      PACK_STEP;
-
-      R_A = rhs.template loadPacket<Packet>(k, n + 28);
-      R_B = rhs.template loadPacket<Packet>(k, n + 29);
-      R_C = rhs.template loadPacket<Packet>(k, n + 30);
-      R_D = rhs.template loadPacket<Packet>(k, n + 31);
-      PACK_STEP;
-
-      blockB_256 += 24;
-    }
-  }
-#undef PACK_STEP
-}
-
-// Perform the actual multiplication on packed inputs
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
-                   ConjugateRhs> {
-  typedef typename DataMapper::LinearMapper LinearMapper;
-
-  EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA,
-                  const QUInt8* blockB, Index rows, Index depth, Index cols,
-                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
-                  Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
-                                   ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
-           Index strideB, Index offsetA, Index offsetB) {
-  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  eigen_assert(alpha.value == 1);
-  eigen_assert(strideA == -1);
-  eigen_assert(strideB == -1);
-  eigen_assert(offsetA == 0);
-  eigen_assert(offsetB == 0);
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-  eigen_assert(depth > 0);
-  eigen_assert(blockA);
-  eigen_assert(blockB);
-
-  // Use alternate function for weird sizes
-  if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) {
-    gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
-                    ConjugateRhs> gebp;
-    return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB,
-                offsetA, offsetB);
-  }
-
-  // Create result block
-  QInt32* blockO = aligned_new<QInt32>(32 * 32);
-  // Allocating the result block is about 5-10% faster than declaring stack
-  // space.  It is unclear why this is the case.
-  // ei_declare_aligned_stack_constructed_variable(QInt32, blockO, 32 * 32, 0);
-  memset(blockO, 0, 32 * 32 * sizeof(QInt32));
-
-  // Get vectorized pointers
-  __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
-  const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
-  const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
-
-  // Loop over blocks of 32 columns
-  for (Index n = 0; n < cols; n += 32) {
-    // Reset index into blockA
-    Index indexL = 0;
-    // Loop over blocks of 32 rows
-    for (Index m = 0; m < rows; m += 32) {
-      // Reset index into blockB
-      Index indexR = n / 32 * depth;
-      // Loop over blocks of 8 on depth
-      for (Index k = 0; k < depth; k += 8) {
-        // Load inputs
-        __m256i L_AD0 = blockA_256[indexL++];
-        __m256i L_AD8 = blockA_256[indexL++];
-        __m256i L_AD16 = blockA_256[indexL++];
-        __m256i L_AD24 = blockA_256[indexL++];
-        __m256i L_EH0 = blockA_256[indexL++];
-        __m256i L_EH8 = blockA_256[indexL++];
-        __m256i L_EH16 = blockA_256[indexL++];
-        __m256i L_EH24 = blockA_256[indexL++];
-        __m256i R_AH0 = blockB_256[indexR++];
-        __m256i R_AH4 = blockB_256[indexR++];
-        __m256i R_AH8 = blockB_256[indexR++];
-        __m256i R_AH12 = blockB_256[indexR++];
-        __m256i R_AH16 = blockB_256[indexR++];
-        __m256i R_AH20 = blockB_256[indexR++];
-        __m256i R_AH24 = blockB_256[indexR++];
-        __m256i R_AH28 = blockB_256[indexR++];
-
-        // This constant is used with madd to convert 16 bit to 32 bit
-        const __m256i ONE = _mm256_set1_epi32(0x00010001);
-
-        // Declare variables used in COMPUTE_STEP
-        __m256i P_16_A, P_16_B, P_32_A, P_32_B, P_32;
-
-#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET)                             \
-  P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD0);                             \
-  P_32_A = _mm256_madd_epi16(P_16_A, ONE);                                     \
-  P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH0);                             \
-  P_32_B = _mm256_madd_epi16(P_16_B, ONE);                                     \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                     \
-  _mm256_store_si256(                                                          \
-      blockO_256 + 4 * OFFSET,                                                 \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET), P_32));     \
-                                                                               \
-  P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD8);                             \
-  P_32_A = _mm256_madd_epi16(P_16_A, ONE);                                     \
-  P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH8);                             \
-  P_32_B = _mm256_madd_epi16(P_16_B, ONE);                                     \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                     \
-  _mm256_store_si256(                                                          \
-      blockO_256 + 4 * OFFSET + 1,                                             \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 1), P_32)); \
-                                                                               \
-  P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD16);                            \
-  P_32_A = _mm256_madd_epi16(P_16_A, ONE);                                     \
-  P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH16);                            \
-  P_32_B = _mm256_madd_epi16(P_16_B, ONE);                                     \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                     \
-  _mm256_store_si256(                                                          \
-      blockO_256 + 4 * OFFSET + 2,                                             \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 2), P_32)); \
-                                                                               \
-  P_16_A = _mm256_maddubs_epi16(R_INPUT_A, L_AD24);                            \
-  P_32_A = _mm256_madd_epi16(P_16_A, ONE);                                     \
-  P_16_B = _mm256_maddubs_epi16(R_INPUT_B, L_EH24);                            \
-  P_32_B = _mm256_madd_epi16(P_16_B, ONE);                                     \
-  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                     \
-  _mm256_store_si256(                                                          \
-      blockO_256 + 4 * OFFSET + 3,                                             \
-      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 4 * OFFSET + 3), P_32));
-
-        // Permute and shuffle to copy a single value across the entire vector
-        // Then compute the multiplication
-        __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
-        __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 0);
-        __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 1);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
-        __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 2);
-        __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 3);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 4);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 5);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 6);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 7);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 8);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 9);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 10);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 11);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 12);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 13);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 14);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 15);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 16);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 17);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH16, R_AH16, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 18);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 19);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 20);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 21);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH20, R_AH20, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 22);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 23);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 24);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 25);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH24, R_AH24, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 26);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 27);
-
-        R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x00);
-        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD0, R_EH0, 28);
-        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD1, R_EH1, 29);
-        R_AH0_ = _mm256_permute2x128_si256(R_AH28, R_AH28, 0x11);
-        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
-        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
-        COMPUTE_STEP(R_AD2, R_EH2, 30);
-        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
-        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
-        COMPUTE_STEP(R_AD3, R_EH3, 31);
-
-#undef COMPUTE_STEP
-      }
-
-      // Transfer the results to the result matrix
-      Index i = 0;
-      for (Index j = n; j < n + 32; j++) {
-        LinearMapper r0 = res.getLinearMapper(m, j);
-        LinearMapper r1 = res.getLinearMapper(m + 8, j);
-        LinearMapper r2 = res.getLinearMapper(m + 16, j);
-        LinearMapper r3 = res.getLinearMapper(m + 24, j);
-        typedef typename packet_traits<QInt32>::type Packet;
-        r0.template storePacket<Packet>(
-            0, _mm256_add_epi32(blockO_256[i++],
-                                r0.template loadPacket<Packet>(0)));
-        r1.template storePacket<Packet>(
-            0, _mm256_add_epi32(blockO_256[i++],
-                                r1.template loadPacket<Packet>(0)));
-        r2.template storePacket<Packet>(
-            0, _mm256_add_epi32(blockO_256[i++],
-                                r2.template loadPacket<Packet>(0)));
-        r3.template storePacket<Packet>(
-            0, _mm256_add_epi32(blockO_256[i++],
-                                r3.template loadPacket<Packet>(0)));
-      }
-
-      // Zero the result block so it can be reused
-      memset(blockO, 0, 32 * 32 * sizeof(QInt32));
-    }
-  }
-  aligned_delete(blockO, 32 * 32);
-}
-
-#endif  // EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-
-}  // namespace internal
-}  // namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h b/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
deleted file mode 100644
index 9e0efae..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+++ /dev/null

@@ -1,92 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_
-#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_
-
-namespace Eigen {
-namespace internal {
-
-// AVX2 optimized implementation of the case where the lhs is encoded using
-// signed 8bit
-// integers and the rhs using unsigned 8bit integers.
-#ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-
-template <bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
- public:
-  typedef QInt8 LhsScalar;
-  typedef QUInt8 RhsScalar;
-  typedef QInt32 ResScalar;
-
-  enum {
-    // register block size along the M and N directions
-    // One for the current implementation
-    nr = 1,
-    mr = 1,
-    // Progress made at each iteration of the product loop
-    // also 1 for the current implementation
-    LhsProgress = 1,
-    RhsProgress = 1
-  };
-};
-
-// Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
-                   ConjugateRhs> {
-  EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA,
-                  const QUInt8* blockB, Index rows, Index depth, Index cols,
-                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
-                  Index offsetA = 0, Index offsetB = 0);
-};
-
-template <typename Index, typename DataMapper, int mr, int nr,
-          bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
-                                   ConjugateLhs, ConjugateRhs>::
-operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
-           Index strideB, Index offsetA, Index offsetB) {
-  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  eigen_assert(alpha.value == 1);
-  eigen_assert(strideA == -1);
-  eigen_assert(strideB == -1);
-  eigen_assert(offsetA == 0);
-  eigen_assert(offsetB == 0);
-
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-  eigen_assert(depth > 0);
-  eigen_assert(blockA);
-  eigen_assert(blockB);
-
-  for (Index j = 0; j < cols; ++j) {
-    Index startB = j * depth;
-
-    for (Index i = 0; i < rows; ++i) {
-      Index startA = i * depth;
-
-      for (Index k = 0; k < depth; ++k) {
-        res(i, j) += blockA[startA + k] * blockB[startB + k];
-      }
-    }
-  }
-}
-#endif
-
-}  // namespace internal
-}  // namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
deleted file mode 100644
index f15200c..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+++ /dev/null

@@ -1,145 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_
-#define CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_
-
-namespace Eigen {
-namespace internal {
-
-// Mat-Vec product
-// Both lhs and rhs are encoded as 8bit signed integers
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
-          typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index, QInt8, LhsMapper, ColMajor,
-                                     ConjugateLhs, QInt8, RhsMapper,
-                                     ConjugateRhs, Version> {
-  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
-                                    const LhsMapper& lhs, const RhsMapper& rhs,
-                                    QInt32* res, Index resIncr, QInt8 alpha);
-};
-
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
-          typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<
-    Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper,
-    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
-                                const RhsMapper& rhs, QInt32* res,
-                                Index resIncr, QInt8 alpha) {
-  eigen_assert(alpha.value == 1);
-  eigen_assert(resIncr == 1);
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-
-  for (Index i = 0; i < rows; ++i) {
-    for (Index j = 0; j < cols; ++j) {
-      res[i] += lhs(i, j) * rhs(j, 0);
-    }
-  }
-}
-
-// Mat-Vec product
-// Both lhs and rhs are encoded as 16bit signed integers
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
-          typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index, QInt16, LhsMapper, ColMajor,
-                                     ConjugateLhs, QInt16, RhsMapper,
-                                     ConjugateRhs, Version> {
-  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
-                                    const LhsMapper& lhs, const RhsMapper& rhs,
-                                    QInt32* res, Index resIncr, QInt16 alpha);
-};
-
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
-          typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<
-    Index, QInt16, LhsMapper, ColMajor, ConjugateLhs, QInt16, RhsMapper,
-    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
-                                const RhsMapper& rhs, QInt32* res,
-                                Index resIncr, QInt16 alpha) {
-  eigen_assert(alpha.value == 1);
-  eigen_assert(resIncr == 1);
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-
-  for (Index i = 0; i < rows; ++i) {
-    for (Index j = 0; j < cols; ++j) {
-      res[i] += lhs(i, j) * rhs(j, 0);
-    }
-  }
-}
-
-// Mat-Vec product
-// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned
-// integers
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
-          typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index, QInt8, LhsMapper, ColMajor,
-                                     ConjugateLhs, QUInt8, RhsMapper,
-                                     ConjugateRhs, Version> {
-  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
-                                    const LhsMapper& lhs, const RhsMapper& rhs,
-                                    QInt32* res, Index resIncr, QUInt8 alpha);
-};
-
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
-          typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<
-    Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QUInt8, RhsMapper,
-    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
-                                const RhsMapper& rhs, QInt32* res,
-                                Index resIncr, QUInt8 alpha) {
-  eigen_assert(alpha.value == 1);
-  eigen_assert(resIncr == 1);
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-
-  for (Index i = 0; i < rows; ++i) {
-    for (Index j = 0; j < cols; ++j) {
-      res[i] += lhs(i, j) * rhs(j, 0);
-    }
-  }
-}
-
-// Mat-Vec product
-// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed
-// integers
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
-          typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index, QUInt8, LhsMapper, ColMajor,
-                                     ConjugateLhs, QInt8, RhsMapper,
-                                     ConjugateRhs, Version> {
-  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
-                                    const LhsMapper& lhs, const RhsMapper& rhs,
-                                    QInt32* res, Index resIncr, QInt8 alpha);
-};
-
-template <typename Index, typename LhsMapper, bool ConjugateLhs,
-          typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<
-    Index, QUInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper,
-    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
-                                const RhsMapper& rhs, QInt32* res,
-                                Index resIncr, QInt8 alpha) {
-  eigen_assert(alpha.value == 1);
-  eigen_assert(resIncr == 1);
-  eigen_assert(rows > 0);
-  eigen_assert(cols > 0);
-
-  for (Index i = 0; i < rows; ++i) {
-    for (Index j = 0; j < cols; ++j) {
-      res[i] += lhs(i, j) * rhs(j, 0);
-    }
-  }
-}
-
-}  // namespace internal
-}  // namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h b/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h
deleted file mode 100644
index 1a7cd03..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h
+++ /dev/null

@@ -1,149 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
-#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
-#ifdef _MSC_VER
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <smmintrin.h>
-
-#endif
-
-namespace Eigen {
-namespace internal {
-
-typedef eigen_packet_wrapper<__m256i, 10> Packet32q8i;
-typedef eigen_packet_wrapper<__m128i, 11> Packet16q8i;
-
-template <>
-struct packet_traits<QInt8> : default_packet_traits {
-  typedef Packet32q8i type;
-  typedef Packet16q8i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 32,
-  };
-  enum {
-    HasAdd = 0,
-    HasSub = 0,
-    HasMul = 0,
-    HasNegate = 0,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 0,
-    HasMax = 0,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-
-template <>
-struct unpacket_traits<Packet32q8i> {
-  typedef QInt8 type;
-  typedef Packet16q8i half;
-  enum {
-    size = 32,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-template <>
-struct unpacket_traits<Packet16q8i> {
-  typedef QInt8 type;
-  typedef Packet16q8i half;
-  enum {
-    size = 16,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
-  return _mm256_set1_epi8(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
-      reinterpret_cast<const __m128i*>(from));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
-      reinterpret_cast<const __m128i*>(from));
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
-      reinterpret_cast<__m256i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
-                                               from.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
-                                            from.m_val);
-}
-
-typedef __m256 Packet8f;
-
-template <>
-struct type_casting_traits<float, QInt8> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i
-pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b,
-                             const Packet8f& c, const Packet8f& d) {
-  const __m256i a_conv = _mm256_cvtps_epi32(a);
-  const __m256i b_conv = _mm256_cvtps_epi32(b);
-  const __m256i c_conv = _mm256_cvtps_epi32(c);
-  const __m256i d_conv = _mm256_cvtps_epi32(d);
-  __m128i low = _mm256_castsi256_si128(a_conv);
-  __m128i high = _mm256_extractf128_si256(a_conv, 1);
-  __m128i tmp = _mm_packs_epi32(low, high);
-  __m128i low2 = _mm256_castsi256_si128(b_conv);
-  __m128i high2 = _mm256_extractf128_si256(b_conv, 1);
-  __m128i tmp2 = _mm_packs_epi32(low2, high2);
-  __m128i converted_low = _mm_packs_epi16(tmp, tmp2);
-  low = _mm256_castsi256_si128(c_conv);
-  high = _mm256_extractf128_si256(c_conv, 1);
-  tmp = _mm_packs_epi32(low, high);
-  low2 = _mm256_castsi256_si128(d_conv);
-  high2 = _mm256_extractf128_si256(d_conv, 1);
-  tmp2 = _mm_packs_epi32(low2, high2);
-  __m128i converted_high = _mm_packs_epi16(tmp, tmp2);
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(converted_low),
-                                 converted_high, 1);
-}
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
deleted file mode 100644
index 385aaf8..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ /dev/null

@@ -1,545 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
-#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
-#ifdef _MSC_VER
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <smmintrin.h>
-
-#endif
-
-inline int _mm256_extract_epi16_N0(const __m256i X) {
-  return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8);
-}
-
-inline int _mm256_extract_epi16_N1(const __m256i X) {
-  return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8);
-}
-
-inline int _mm256_extract_epi8_N0(const __m256i X) {
-  return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16);
-}
-
-inline int _mm256_extract_epi8_N1(const __m256i X) {
-  return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
-}
-
-namespace Eigen {
-namespace internal {
-
-typedef eigen_packet_wrapper<__m256i, 20> Packet32q8i;
-typedef eigen_packet_wrapper<__m256i, 21> Packet16q16i;
-typedef eigen_packet_wrapper<__m256i, 22> Packet32q8u;
-typedef eigen_packet_wrapper<__m128i, 23> Packet16q8i;
-typedef eigen_packet_wrapper<__m128i, 25> Packet16q8u;
-typedef eigen_packet_wrapper<__m128i, 26> Packet8q16i;
-typedef eigen_packet_wrapper<__m256i, 27> Packet8q32i;
-typedef eigen_packet_wrapper<__m128i, 28> Packet4q32i;
-
-#ifndef EIGEN_VECTORIZE_AVX512
-template <>
-struct packet_traits<QInt8> : default_packet_traits {
-  typedef Packet32q8i type;
-  typedef Packet16q8i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 32,
-  };
-  enum {
-    HasAdd = 0,
-    HasSub = 0,
-    HasMul = 0,
-    HasNegate = 0,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-template <>
-struct packet_traits<QUInt8> : default_packet_traits {
-  typedef Packet32q8u type;
-  typedef Packet16q8u half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 32,
-  };
-  enum {
-    HasAdd = 0,
-    HasSub = 0,
-    HasMul = 0,
-    HasNegate = 0,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-template <>
-struct packet_traits<QInt16> : default_packet_traits {
-  typedef Packet16q16i type;
-  typedef Packet8q16i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 16,
-  };
-  enum {
-    HasAdd = 0,
-    HasSub = 0,
-    HasMul = 0,
-    HasNegate = 0,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-template <>
-struct packet_traits<QInt32> : default_packet_traits {
-  typedef Packet8q32i type;
-  typedef Packet4q32i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-  };
-  enum {
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasNegate = 1,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-#endif
-
-template <>
-struct unpacket_traits<Packet32q8i> {
-  typedef QInt8 type;
-  typedef Packet16q8i half;
-  enum {
-    size = 32,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-struct unpacket_traits<Packet16q8i> {
-  typedef QInt8 type;
-  typedef Packet16q8i half;
-  enum {
-    size = 16,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-struct unpacket_traits<Packet16q16i> {
-  typedef QInt16 type;
-  typedef Packet8q16i half;
-  enum {
-    size = 16,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-struct unpacket_traits<Packet8q16i> {
-  typedef QInt16 type;
-  typedef Packet8q16i half;
-  enum {
-    size = 8,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-struct unpacket_traits<Packet32q8u> {
-  typedef QUInt8 type;
-  typedef Packet16q8u half;
-  enum {
-    size = 32,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-struct unpacket_traits<Packet8q32i> {
-  typedef QInt32 type;
-  typedef Packet4q32i half;
-  enum {
-    size = 8,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-// Unaligned load
-template <>
-EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
-      reinterpret_cast<const __m128i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
-      reinterpret_cast<const __m128i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-
-// Aligned load
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
-      reinterpret_cast<const __m128i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q16i pload<Packet8q16i>(const QInt16* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
-      reinterpret_cast<const __m128i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
-      reinterpret_cast<const __m256i*>(from));
-}
-
-// Unaligned store
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
-      reinterpret_cast<__m256i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
-      reinterpret_cast<__m256i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
-      reinterpret_cast<__m256i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
-      reinterpret_cast<__m256i*>(to), from.m_val);
-}
-
-// Aligned store
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
-                                            from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
-                                            from.m_val);
-}
-
-// Extract first element.
-template <>
-EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
-  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
-  return _mm256_extract_epi16_N0(a.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
-  return static_cast<uint8_t>(_mm256_extract_epi8_N0(a.m_val));
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) {
-  return _mm256_extract_epi8_N0(a.m_val);
-}
-
-// Initialize to constant value.
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
-  return _mm256_set1_epi8(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) {
-  return _mm256_set1_epi8(static_cast<uint8_t>(from.value));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) {
-  return _mm256_set1_epi32(from.value);
-}
-
-// Basic arithmetic packet ops for QInt32.
-template <>
-EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
-                                                  const Packet8q32i& b) {
-  return _mm256_add_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
-  return _mm256_set1_epi16(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
-                                                  const Packet8q32i& b) {
-  return _mm256_sub_epi32(a.m_val, b.m_val);
-}
-// Note: mullo truncates the result to 32 bits.
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a,
-                                                  const Packet8q32i& b) {
-  return _mm256_mullo_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) {
-  return _mm256_sub_epi32(_mm256_setzero_si256(), a.m_val);
-}
-
-// Min and max.
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a,
-                                                  const Packet8q32i& b) {
-  return _mm256_min_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
-                                                  const Packet8q32i& b) {
-  return _mm256_max_epi32(a.m_val, b.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
-                                                    const Packet16q16i& b) {
-  return _mm256_min_epi16(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
-                                                    const Packet16q16i& b) {
-  return _mm256_max_epi16(a.m_val, b.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
-                                                  const Packet32q8u& b) {
-  return _mm256_min_epu8(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a,
-                                                  const Packet32q8u& b) {
-  return _mm256_max_epu8(a.m_val, b.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a,
-                                                  const Packet32q8i& b) {
-  return _mm256_min_epi8(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a,
-                                                  const Packet32q8i& b) {
-  return _mm256_max_epi8(a.m_val, b.m_val);
-}
-
-// Reductions.
-template <>
-EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) {
-  __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp =
-      _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return pfirst<Packet8q32i>(
-      _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
-}
-template <>
-EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
-  __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp =
-      _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return pfirst<Packet8q32i>(
-      _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
-  __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp =
-      _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
-  return std::min(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
-  __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp =
-      _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
-  return std::max(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
-}
-
-template <>
-EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
-  __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp =
-      _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
-  tmp = _mm256_min_epu8(tmp,
-                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return std::min(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
-                  static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
-  __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp =
-      _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
-  tmp = _mm256_max_epu8(tmp,
-                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return std::max(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
-                  static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
-}
-
-template <>
-EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
-  __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp =
-      _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
-  tmp = _mm256_min_epi8(tmp,
-                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
-  __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp =
-      _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
-  tmp = _mm256_max_epi8(tmp,
-                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
-}
-
-// Vectorized scaling of Packet32q8i by float.
-template <>
-struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> {
-  typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type;
-#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
-  scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN }
-#endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
-  operator()(const QInt32& a, const double& b) const {
-    return a * b;
-  }
-
-  EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a,
-                                                 const double& b) const {
-    __m256d scale = _mm256_set1_pd(b);
-    __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
-    __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
-    __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
-    __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
-    return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi,
-                                   1);
-  }
-};
-
-template <>
-struct functor_traits<scalar_product_op<QInt32, double>> {
-  enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true };
-};
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
deleted file mode 100644
index 5a0ae2e..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+++ /dev/null

@@ -1,516 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
-#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
-
-#include "PacketMathAVX2.h"
-
-namespace Eigen {
-namespace internal {
-
-typedef eigen_packet_wrapper<__m512i, 30> Packet64q8i;
-typedef eigen_packet_wrapper<__m512i, 31> Packet32q16i;
-typedef eigen_packet_wrapper<__m512i, 32> Packet64q8u;
-typedef eigen_packet_wrapper<__m512i, 33> Packet16q32i;
-
-template <>
-struct packet_traits<QInt8> : default_packet_traits {
-  typedef Packet64q8i type;
-  typedef Packet32q8i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 64,
-  };
-  enum {
-    HasAdd = 0,
-    HasSub = 0,
-    HasMul = 0,
-    HasNegate = 0,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-template <>
-struct packet_traits<QUInt8> : default_packet_traits {
-  typedef Packet64q8u type;
-  typedef Packet32q8u half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 64,
-  };
-  enum {
-    HasAdd = 0,
-    HasSub = 0,
-    HasMul = 0,
-    HasNegate = 0,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-template <>
-struct packet_traits<QInt16> : default_packet_traits {
-  typedef Packet32q16i type;
-  typedef Packet16q16i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 32,
-  };
-  enum {
-    HasAdd = 0,
-    HasSub = 0,
-    HasMul = 0,
-    HasNegate = 0,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-template <>
-struct packet_traits<QInt32> : default_packet_traits {
-  typedef Packet16q32i type;
-  typedef Packet8q32i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 16,
-  };
-  enum {
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasNegate = 1,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 0,
-    HasSetLinear = 0
-  };
-};
-
-template <>
-struct unpacket_traits<Packet64q8i> {
-  typedef QInt8 type;
-  typedef Packet32q8i half;
-  enum {
-    size = 64,
-    alignment = Aligned64,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-struct unpacket_traits<Packet32q16i> {
-  typedef QInt16 type;
-  typedef Packet16q16i half;
-  enum {
-    size = 32,
-    alignment = Aligned64,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-struct unpacket_traits<Packet64q8u> {
-  typedef QUInt8 type;
-  typedef Packet32q8u half;
-  enum {
-    size = 64,
-    alignment = Aligned64,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template <>
-struct unpacket_traits<Packet16q32i> {
-  typedef QInt32 type;
-  typedef Packet8q32i half;
-  enum {
-    size = 16,
-    alignment = Aligned64,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-// Unaligned load
-template <>
-EIGEN_STRONG_INLINE Packet64q8i ploadu<Packet64q8i>(const QInt8* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
-      reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i ploadu<Packet32q16i>(const QInt16* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
-      reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8u ploadu<Packet64q8u>(const QUInt8* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
-      reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i ploadu<Packet16q32i>(const QInt32* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
-      reinterpret_cast<const __m512i*>(from));
-}
-
-// Aligned load
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pload<Packet64q8i>(const QInt8* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
-      reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pload<Packet32q16i>(const QInt16* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
-      reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8u pload<Packet64q8u>(const QUInt8* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
-      reinterpret_cast<const __m512i*>(from));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
-      reinterpret_cast<const __m512i*>(from));
-}
-
-// Unaligned store
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
-      reinterpret_cast<__m512i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
-      reinterpret_cast<__m512i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
-      reinterpret_cast<__m512i*>(to), from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
-      reinterpret_cast<__m512i*>(to), from.m_val);
-}
-
-// Aligned store
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
-                                               from.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
-                                               from.m_val);
-}
-
-// Extract first element.
-template <>
-EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
-  return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0));
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
-  return static_cast<uint8_t>(
-      _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0));
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
-  return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) {
-  return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
-}
-
-// Initialize to constant value.
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pset1<Packet64q8i>(const QInt8& from) {
-  return _mm512_set1_epi8(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pset1<Packet32q16i>(const QInt16& from) {
-  return _mm512_set1_epi16(from.value);
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8u pset1<Packet64q8u>(const QUInt8& from) {
-  return _mm512_set1_epi8(static_cast<uint8_t>(from.value));
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) {
-  return _mm512_set1_epi32(from.value);
-}
-
-// Basic arithmetic packet ops for QInt32.
-template <>
-EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a,
-                                                    const Packet16q32i& b) {
-  return _mm512_add_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a,
-                                                    const Packet16q32i& b) {
-  return _mm512_sub_epi32(a.m_val, b.m_val);
-}
-// Note: mullo truncates the result to 32 bits.
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a,
-                                                    const Packet16q32i& b) {
-  return _mm512_mullo_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) {
-  return _mm512_sub_epi32(_mm512_setzero_si512(), a.m_val);
-}
-
-// Min and max.
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a,
-                                                    const Packet16q32i& b) {
-  return _mm512_min_epi32(a.m_val, b.m_val);
-}
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a,
-                                                    const Packet16q32i& b) {
-  return _mm512_max_epi32(a.m_val, b.m_val);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a,
-                                                  const Packet64q8u& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_min_epu8(a.m_val, b.m_val);
-#else
-  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
-  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
-  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
-  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
-  __m256i r0 = _mm256_min_epu8(ap0, bp0);
-  __m256i r1 = _mm256_min_epu8(ap1, bp1);
-  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a,
-                                                  const Packet64q8u& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_max_epu8(a.m_val, b.m_val);
-#else
-  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
-  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
-  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
-  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
-  __m256i r0 = _mm256_max_epu8(ap0, bp0);
-  __m256i r1 = _mm256_max_epu8(ap1, bp1);
-  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a,
-                                                  const Packet64q8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_min_epi8(a.m_val, b.m_val);
-#else
-  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
-  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
-  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
-  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
-  __m256i r0 = _mm256_min_epi8(ap0, bp0);
-  __m256i r1 = _mm256_min_epi8(ap1, bp1);
-  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a,
-                                                    const Packet32q16i& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_min_epi16(a.m_val, b.m_val);
-#else
-  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
-  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
-  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
-  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
-  __m256i r0 = _mm256_min_epi16(ap0, bp0);
-  __m256i r1 = _mm256_min_epi16(ap1, bp1);
-  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a,
-                                                  const Packet64q8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_max_epi8(a.m_val, b.m_val);
-#else
-  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
-  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
-  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
-  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
-  __m256i r0 = _mm256_max_epi8(ap0, bp0);
-  __m256i r1 = _mm256_max_epi8(ap1, bp1);
-  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
-                                                    const Packet32q16i& b) {
-#ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_max_epi16(a.m_val, b.m_val);
-#else
-  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
-  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
-  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
-  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
-  __m256i r0 = _mm256_max_epi16(ap0, bp0);
-  __m256i r1 = _mm256_max_epi16(ap1, bp1);
-  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
-#endif
-}
-
-// Reductions.
-template <>
-EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
-  Packet4i res =
-      _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
-  res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
-  return pfirst(res);
-}
-template <>
-EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
-  Packet4i res =
-      _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
-  res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
-  return pfirst(res);
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
-  Packet4i res =
-      _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
-  res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
-  std::uint32_t w = pfirst(res);
-  return std::min(
-      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
-  Packet4i res =
-      _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
-  res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
-  std::uint32_t w = pfirst(res);
-  return std::max(
-      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
-  Packet4i res =
-      _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
-  res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
-  std::uint32_t w = pfirst(res);
-  return std::min(
-      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
-       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
-  Packet4i res =
-      _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
-  res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
-  std::uint32_t w = pfirst(res);
-  return std::max(
-      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
-       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
-  Packet4i res =
-      _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
-  res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
-  std::uint32_t w = pfirst(res);
-  return std::min(
-      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
-       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
-}
-template <>
-EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
-  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
-  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
-  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
-  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
-  Packet4i res =
-      _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
-  res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
-  std::uint32_t w = pfirst(res);
-  return std::min(
-      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
-       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
-}
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
deleted file mode 100644
index 5dd2cd3..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
+++ /dev/null

@@ -1,93 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
-#define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
-
-namespace Eigen {
-namespace internal {
-
-typedef __m256 Packet8f;
-
-template <>
-struct type_casting_traits<QInt32, float> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet8f pcast<Packet8q32i>(const Packet8q32i& a) {
-  return _mm256_cvtepi32_ps(a.m_val);
-}
-
-template <>
-struct type_casting_traits<float, QInt32> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet8q32i pcast<Packet8f>(const Packet8f& a) {
-  return _mm256_cvtps_epi32(a);
-}
-
-template <>
-struct type_casting_traits<QInt32, QInt8> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i
-pcast<Packet8q32i, Packet32q8i>(const Packet8q32i& a, const Packet8q32i& b,
-                                const Packet8q32i& c, const Packet8q32i& d) {
-  __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.m_val, b.m_val),
-                                         _mm256_packs_epi32(c.m_val, d.m_val));
-  // Since packs does not cross 128 bit lane boundaries,
-  // we have to permute to properly order the final result.
-  const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-  return _mm256_permutevar8x32_epi32(converted, permute_mask);
-}
-
-template <>
-struct type_casting_traits<float, QInt8> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8i
-pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b,
-                             const Packet8f& c, const Packet8f& d) {
-  const __m256i a_conv = _mm256_cvtps_epi32(a);
-  const __m256i b_conv = _mm256_cvtps_epi32(b);
-  const __m256i c_conv = _mm256_cvtps_epi32(c);
-  const __m256i d_conv = _mm256_cvtps_epi32(d);
-  __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a_conv, b_conv),
-                                         _mm256_packs_epi32(c_conv, d_conv));
-  const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-  return _mm256_permutevar8x32_epi32(converted, permute_mask);
-}
-
-template <>
-struct type_casting_traits<QInt32, QUInt8> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q8u
-pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b,
-                                const Packet8q32i& c, const Packet8q32i& d) {
-  // _mm256_packus_epi32 trims negative numbers to 0 but we can't allow numbers
-  // that are too large because _mm256_packus_epi16 expects signed input
-  // (example of problem input: 0x11111111, which saturates to 0xffff = -1,
-  // which saturates to 0).
-  const __m256i a_clip = _mm256_min_epi32(a, _mm256_set1_epi32(255));
-  const __m256i b_clip = _mm256_min_epi32(b, _mm256_set1_epi32(255));
-  const __m256i c_clip = _mm256_min_epi32(c, _mm256_set1_epi32(255));
-  const __m256i d_clip = _mm256_min_epi32(d, _mm256_set1_epi32(255));
-  const __m256i converted = _mm256_packus_epi16(
-      _mm256_packus_epi32(a_clip, b_clip), _mm256_packus_epi32(c_clip, d_clip));
-  // Since packus does not cross 128 bit lane boundaries,
-  // we have to permute to properly order the final result.
-  const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-  return _mm256_permutevar8x32_epi32(converted, permute_mask);
-}
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_

diff --git a/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
deleted file mode 100644
index 17408d1..0000000
--- a/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+++ /dev/null

@@ -1,191 +0,0 @@
-#ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
-#define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
-
-namespace Eigen {
-namespace internal {
-
-typedef __m512 Packet16f;
-typedef __m512i Packet16i;
-
-template <>
-struct type_casting_traits<QInt32, float> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet16f pcast<Packet16q32i>(const Packet16q32i& a) {
-  return _mm512_cvtepi32_ps(a.m_val);
-}
-
-template <>
-struct type_casting_traits<float, QInt32> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet16q32i pcast<Packet16f>(const Packet16f& a) {
-  return _mm512_cvtps_epi32(a);
-}
-
-template <>
-struct type_casting_traits<float, QInt16> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16f>(const Packet16f& a,
-                                                  const Packet16f& b) {
-  Packet16i a_int = _mm512_cvtps_epi32(a);
-  Packet16i b_int = _mm512_cvtps_epi32(b);
-#ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_packs_epi32(a_int, b_int);
-#else
-  Packet8i ab_int16_low = _mm256_permute4x64_epi64(
-      _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
-                         _mm512_castsi512_si256(b_int)),
-      _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i ab_int16_high = _mm256_permute4x64_epi64(
-      _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
-                         _mm512_extracti32x8_epi32(b_int, 1)),
-      _MM_SHUFFLE(0, 2, 1, 3));
-  return _mm512_inserti32x8(_mm512_castsi256_si512(ab_int16_low), ab_int16_high,
-                            1);
-#endif
-}
-
-template <>
-struct type_casting_traits<float, QInt8> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8i pcast<Packet16f>(const Packet16f& a,
-                                                 const Packet16f& b,
-                                                 const Packet16f& c,
-                                                 const Packet16f& d) {
-  Packet16i a_int = _mm512_cvtps_epi32(a);
-  Packet16i b_int = _mm512_cvtps_epi32(b);
-  Packet16i c_int = _mm512_cvtps_epi32(c);
-  Packet16i d_int = _mm512_cvtps_epi32(d);
-#ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_packs_epi16(_mm512_packs_epi32(a_int, b_int),
-                            _mm512_packs_epi32(c_int, d_int));
-#else
-  Packet8i ab_int16_low = _mm256_permute4x64_epi64(
-      _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
-                         _mm512_castsi512_si256(b_int)),
-      _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i cd_int16_low = _mm256_permute4x64_epi64(
-      _mm256_packs_epi32(_mm512_castsi512_si256(c_int),
-                         _mm512_castsi512_si256(d_int)),
-      _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i ab_int16_high = _mm256_permute4x64_epi64(
-      _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
-                         _mm512_extracti32x8_epi32(b_int, 1)),
-      _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i cd_int16_high = _mm256_permute4x64_epi64(
-      _mm256_packs_epi32(_mm512_extracti32x8_epi32(c_int, 1),
-                         _mm512_extracti32x8_epi32(d_int, 1)),
-      _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i abcd_int8_low = _mm256_permute4x64_epi64(
-      _mm256_packs_epi16(ab_int16_low, cd_int16_low), _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i abcd_int8_high =
-      _mm256_permute4x64_epi64(_mm256_packs_epi16(ab_int16_high, cd_int16_high),
-                               _MM_SHUFFLE(0, 2, 1, 3));
-  return _mm512_inserti32x8(_mm512_castsi256_si512(abcd_int8_low),
-                            abcd_int8_high, 1);
-#endif
-}
-
-template <>
-struct type_casting_traits<QInt32, QInt8> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-struct type_casting_traits<QInt32, QInt16> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8i
-pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a, const Packet16q32i& b,
-                                 const Packet16q32i& c, const Packet16q32i& d) {
-  __m128i a_part = _mm512_cvtsepi32_epi8(a);
-  __m128i b_part = _mm512_cvtsepi32_epi8(b);
-  __m128i c_part = _mm512_cvtsepi32_epi8(c);
-  __m128i d_part = _mm512_cvtsepi32_epi8(d);
-  __m256i ab =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1);
-  __m256i cd =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1);
-  __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1);
-  return converted;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16q32i, Packet32q16i>(
-    const Packet16q32i& a, const Packet16q32i& b) {
-  __m256i a_part = _mm512_cvtsepi32_epi16(a);
-  __m256i b_part = _mm512_cvtsepi32_epi16(b);
-  __m512i converted =
-      _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1);
-  return converted;
-}
-
-template <>
-struct type_casting_traits<QInt32, QUInt8> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet64q8u
-pcast<Packet16q32i, Packet64q8u>(const Packet16q32i& a, const Packet16q32i& b,
-                                 const Packet16q32i& c, const Packet16q32i& d) {
-  // Brute-force saturation since there isn't a pack operation for unsigned
-  // numbers that keeps the elements in order.
-  __m128i a_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
-      _mm512_min_epi32(a, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
-  __m128i b_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
-      _mm512_min_epi32(b, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
-  __m128i c_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
-      _mm512_min_epi32(c, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
-  __m128i d_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
-      _mm512_min_epi32(d, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
-  __m256i ab =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1);
-  __m256i cd =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1);
-  __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1);
-  return converted;
-}
-
-#if 0
-// The type Packet32q16u does not exist for AVX-512 yet
-template <>
-struct type_casting_traits<QInt32, QUInt16> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet32q16u
-pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a,
-                                  const Packet16q32i& b) {
-  // Brute-force saturation since there isn't a pack operation for unsigned
-  // numbers that keeps the elements in order.
-  __m256i a_part =
-      _mm512_cvtepi32_epi16(_mm512_max_epi32(
-        _mm512_min_epi32(a, _mm512_set1_epi32(65535)), _mm512_setzero_si512()));
-  __m256i b_part = _mm512_cvtepi32_epi16(
-    _mm512_max_epi32(_mm512_min_epi32(b, _mm512_set1_epi32(65535)),
-                     _mm512_setzero_si512()));
-  __m512i converted =
-      _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1);
-  return converted;
-}
-#endif
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
commit	864374c4052d9d98e851a6515740163c6e86b18a	[log] [tgz]
author	Rasmus Munk Larsen <rmlarsen@google.com>	Fri Sep 30 15:45:19 2022 -0700
committer	Antonio Sanchez <cantonios@google.com>	Tue Oct 18 23:26:09 2022 -0700
tree	0cb27b14d71dba34c6a706993f5816232d94b1d6
parent	36eb2497550e758330a0c836c12cd6932a03c6c8 [diff]