Update Eigen to commit:0e187141679fdb91da33249d18cb79a011c0e2ea

CHANGELOG
=========
0e1871416 - Fix clang-tidy warnings about function definitions in headers.
8ed3b9dcd - Skip f16/bf16 bessel specializations on AVX512 if unavailable.
bc2ab8163 - Eliminate undef warnings when not compiling for AVX512.
0e083b172 - Use numext::sqrt in Householder.h.
37673ca1b - AVX512 TRSM kernels use alloca if EIGEN_NO_MALLOC requested
4d1c16eab - Fix tanh and erf to use vectorized version for EIGEN_FAST_MATH in VSX.
7ea823e82 - [SYCL-Spec] According to [SYCL-2020 spec](...
ba4d7304e - Document DiagonalBase
95463b59b - Mark `index_remap` as `EIGEN_DEVICE_FUNC` in `src/Core/Reshaped.h` (Fixes #2493)
28812d2eb - AVX512 TRSM Kernels respect EIGEN_NO_MALLOC
9960a3042 - Fix row vs column vector typo in Matrix class tutorial
8c2e0e3cb - Fix ambiguous comparisons for c++20 (again again)
14aae2947 - Provide DiagonalMatrix Product and Initializers
76cf6204f - Revert "Fix c++20 ambiguity of comparisons."
8fbb76a04 - Fix build issues with MSVC for AVX512
4f6354128 - Fix c++20 ambiguity of comparisons.
f542b0a71 - Adding an MKL adapter in FFT module.
d49ede4dc - Add AVX512 s/dgemm optimizations for compute kernel (2nd try)
510f6b9f1 - Fix integer shortening warnings in visitor tests.
705ae7064 - Add R-Bidiagonalization step to BDCSVD
e99163e73 - fix: issue 2481: LDLT produce wrong results with AutoDiffScalar
477eb7f63 - Revert "Avoid ambiguous Tensor comparison operators for C++20 compatibility"
c5a5ac680 - [SYCL] SYCL-2020 range does not have default constructor.
5c2179b6c - Avoid ambiguous Tensor comparison operators for C++20 compatibility
aa8b7e2c3 - Add subMappers to Power GEMM packing - simplifies the address calculations (10% faster)
32348091b - Avoid signed integer overflow in adjoint test.
cbe03f353 - [SYCL] Extending SYCL queue interface extension.
32a3f9ac3 - Improve plogical_shift_* implementations and fix typo in SVE/PacketMath.h
ac5c83a3f - unset executable flag
481a4a8c3 - Fix BDCSVD condition for failing with numerical issue.
a9868bd5b - Add arg() to tensor
028ab1258 - Prevent BDCSVD crash caused by index out of bounds.
798fc1c57 - Fix 'Incorrect reference code in STL_interface.hh for ata_product' eigen/isses/2425
9b9496ad9 - Revert "Add AVX512 optimizations for matrix multiply"
25db0b4a8 - Add AVX512 optimizations for matrix multiply
00b75375e - Adding PocketFFT  support in FFT module since kissfft has some flaw in accuracy and performance
73d65dbc4 - Update README.md. Remove obsolete comment about RowMajor not being fully supported.
68e03ab24 - Add `uninstall` target only if not already defined.
2c055f863 - make diagonal matrix cols() and rows() methods constexpr
c2f15edc4 - Add load vector_pairs for RHS of GEMM MMA. Improved predux GEMV.
9e026e5e2 - Removed need to supply the Symmetric flag to UpLo argument for Accelerate LLT and LDLT
44ba7a0da - Fix compiler bugs for GCC 10 & 11 for Power GEMM
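
For context, a minimal usage sketch of the DiagonalMatrix initializers and
diagonal-by-diagonal product added by 14aae2947 (see the DiagonalMatrix.h hunk
below); variable names and sizes here are illustrative, not taken from Eigen's
test suite:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      // New static initializers on DiagonalMatrix.
      Eigen::DiagonalMatrix<double, 3> id = Eigen::DiagonalMatrix<double, 3>::Identity();
      Eigen::DiagonalMatrix<double, 3> z  = Eigen::DiagonalMatrix<double, 3>::Zero();

      // Diagonal * diagonal stays a diagonal expression; it can be assigned to a
      // dense matrix or combined with dense operands as before.
      Eigen::Matrix3d dense = id * z;                             // 3x3 zero matrix
      Eigen::Vector3d w = (2.0 * id) * Eigen::Vector3d::Ones();   // (2, 2, 2)
      std::cout << dense << "\n" << w.transpose() << "\n";
      return 0;
    }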

PiperOrigin-RevId: 459518060
Change-Id: Id93ddb4dd55027304d4b8a0b347c395f7d01fdec
diff --git a/Eigen/Core b/Eigen/Core
index 7bbdee3..63b9850 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -356,6 +356,10 @@
   #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
 #endif
 
+#if defined(EIGEN_VECTORIZE_AVX512)
+  #include "src/Core/arch/AVX512/GemmKernel.h"
+#endif
+
 #include "src/Core/BooleanRedux.h"
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
diff --git a/Eigen/src/AccelerateSupport/AccelerateSupport.h b/Eigen/src/AccelerateSupport/AccelerateSupport.h
index a2c83d7..0417688 100644
--- a/Eigen/src/AccelerateSupport/AccelerateSupport.h
+++ b/Eigen/src/AccelerateSupport/AccelerateSupport.h
@@ -17,12 +17,12 @@
   * \warning Only single and double precision real scalar types are supported by Accelerate
   * 
   * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo_ additional information about the matrix structure. Default is Lower | Symmetric.
+  * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
   *
   * \sa \ref TutorialSparseSolverConcept, class AccelerateLLT
   */
-template <typename MatrixType, int UpLo = Lower | Symmetric>
-using AccelerateLLT = AccelerateImpl<MatrixType, UpLo, SparseFactorizationCholesky, true>;
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLLT = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationCholesky, true>;
 
 /** \ingroup AccelerateSupport_Module
   * \class AccelerateLDLT
@@ -31,12 +31,12 @@
   * \warning Only single and double precision real scalar types are supported by Accelerate
   * 
   * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo_ additional information about the matrix structure. Default is Lower | Symmetric.
+  * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
   *
   * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLT
   */
-template <typename MatrixType, int UpLo = Lower | Symmetric>
-using AccelerateLDLT = AccelerateImpl<MatrixType, UpLo, SparseFactorizationLDLT, true>;
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLT = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLT, true>;
 
 /** \ingroup AccelerateSupport_Module
   * \class AccelerateLDLTUnpivoted
@@ -45,12 +45,12 @@
   * \warning Only single and double precision real scalar types are supported by Accelerate
   * 
   * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo_ additional information about the matrix structure. Default is Lower | Symmetric.
+  * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
   *
   * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTUnpivoted
   */
-template <typename MatrixType, int UpLo = Lower | Symmetric>
-using AccelerateLDLTUnpivoted = AccelerateImpl<MatrixType, UpLo, SparseFactorizationLDLTUnpivoted, true>;
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTUnpivoted = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTUnpivoted, true>;
 
 /** \ingroup AccelerateSupport_Module
   * \class AccelerateLDLTSBK
@@ -59,12 +59,12 @@
   * \warning Only single and double precision real scalar types are supported by Accelerate
   * 
   * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo_ additional information about the matrix structure. Default is Lower | Symmetric.
+  * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
   *
   * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTSBK
   */
-template <typename MatrixType, int UpLo = Lower | Symmetric>
-using AccelerateLDLTSBK = AccelerateImpl<MatrixType, UpLo, SparseFactorizationLDLTSBK, true>;
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTSBK = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTSBK, true>;
 
 /** \ingroup AccelerateSupport_Module
   * \class AccelerateLDLTTPP
@@ -73,12 +73,12 @@
   * \warning Only single and double precision real scalar types are supported by Accelerate
   * 
   * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo_ additional information about the matrix structure. Default is Lower | Symmetric.
+  * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
   *
   * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTTPP
   */
-template <typename MatrixType, int UpLo = Lower | Symmetric>
-using AccelerateLDLTTPP = AccelerateImpl<MatrixType, UpLo, SparseFactorizationLDLTTPP, true>;
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTTPP = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTTPP, true>;
 
 /** \ingroup AccelerateSupport_Module
   * \class AccelerateQR
@@ -87,12 +87,11 @@
   * \warning Only single and double precision real scalar types are supported by Accelerate
   * 
   * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo_ additional information about the matrix structure. Default is 0.
   *
   * \sa \ref TutorialSparseSolverConcept, class AccelerateQR
   */
-template <typename MatrixType, int UpLo = 0>
-using AccelerateQR = AccelerateImpl<MatrixType, UpLo, SparseFactorizationQR, false>;
+template <typename MatrixType>
+using AccelerateQR = AccelerateImpl<MatrixType, 0, SparseFactorizationQR, false>;
 
 /** \ingroup AccelerateSupport_Module
   * \class AccelerateCholeskyAtA
@@ -101,12 +100,11 @@
   * \warning Only single and double precision real scalar types are supported by Accelerate
   * 
   * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo_ additional information about the matrix structure. Default is 0.
   *
   * \sa \ref TutorialSparseSolverConcept, class AccelerateCholeskyAtA
   */
-template <typename MatrixType, int UpLo = 0>
-using AccelerateCholeskyAtA = AccelerateImpl<MatrixType, UpLo, SparseFactorizationCholeskyAtA, false>;
+template <typename MatrixType>
+using AccelerateCholeskyAtA = AccelerateImpl<MatrixType, 0, SparseFactorizationCholeskyAtA, false>;
 
 namespace internal {
 template <typename T>
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index 06cfdc1..405cc71 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -13,9 +13,21 @@
 
 #include "./InternalHeaderCheck.h"
 
-namespace Eigen { 
+namespace Eigen {
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
+/** \class DiagonalBase
+ * \ingroup Core_Module
+ *
+ * \brief Base class for diagonal matrices and expressions
+ *
+ * This is the base class that is inherited by diagonal matrix and related expression
+ * types, which internally use a vector for storing the diagonal entries. Diagonal
+ * types always represent square matrices.
+ *
+ * \tparam Derived is the derived type, a DiagonalMatrix or DiagonalWrapper.
+ *
+ * \sa class DiagonalMatrix, class DiagonalWrapper
+ */
 template<typename Derived>
 class DiagonalBase : public EigenBase<Derived>
 {
@@ -39,24 +51,35 @@
     typedef DenseMatrixType DenseType;
     typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;
 
+    /** \returns a const reference to the derived object. */
     EIGEN_DEVICE_FUNC
     inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    /** \returns a reference to the derived object. */
     EIGEN_DEVICE_FUNC
     inline Derived& derived() { return *static_cast<Derived*>(this); }
 
+    /**
+     * Constructs a dense matrix from \c *this. Note that this directly returns a dense matrix type,
+     * not an expression.
+     * \returns A dense matrix with its diagonal entries set from the derived object. */
     EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const { return derived(); }
 
+    /** \returns a const reference to the derived object's vector of diagonal coefficients. */
     EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
+    /** \returns a reference to the derived object's vector of diagonal coefficients. */
     EIGEN_DEVICE_FUNC
     inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
 
-    EIGEN_DEVICE_FUNC
+    /** \returns the number of rows. */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR 
     inline Index rows() const { return diagonal().size(); }
-    EIGEN_DEVICE_FUNC
+    /** \returns the number of columns. */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR 
     inline Index cols() const { return diagonal().size(); }
 
+    /** \returns the diagonal matrix product of \c *this by the dense matrix \a matrix */
     template<typename MatrixDerived>
     EIGEN_DEVICE_FUNC
     const Product<Derived,MatrixDerived,LazyProduct>
@@ -65,66 +88,77 @@
       return Product<Derived, MatrixDerived, LazyProduct>(derived(),matrix.derived());
     }
 
-    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> > InverseReturnType;
-    EIGEN_DEVICE_FUNC
-    inline const InverseReturnType
-    inverse() const
-    {
-      return InverseReturnType(diagonal().cwiseInverse());
-    }
-    
-    EIGEN_DEVICE_FUNC
-    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
-    operator*(const Scalar& scalar) const
-    {
-      return DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >(diagonal() * scalar);
-    }
-    EIGEN_DEVICE_FUNC
-    friend inline const DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >
-    operator*(const Scalar& scalar, const DiagonalBase& other)
-    {
-      return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
+    template <typename OtherDerived>
+    using DiagonalProductReturnType = DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+        DiagonalVectorType, typename OtherDerived::DiagonalVectorType, product)>;
+
+    /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a other */
+    template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC const DiagonalProductReturnType<OtherDerived> operator*(
+        const DiagonalBase<OtherDerived>& other) const {
+      return diagonal().cwiseProduct(other.diagonal()).asDiagonal();
     }
 
-    template<typename OtherDerived>
+    using DiagonalInverseReturnType =
+        DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType>>;
+
+    /** \returns the inverse of \c *this. Computed as the coefficient-wise inverse of the diagonal. */
     EIGEN_DEVICE_FUNC
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    inline unspecified_expression_type
-    #else
-    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,sum) >
-    #endif
-    operator+(const DiagonalBase<OtherDerived>& other) const
-    {
+    inline const DiagonalInverseReturnType inverse() const { return diagonal().cwiseInverse().asDiagonal(); }
+
+    using DiagonalScaleReturnType =
+        DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType, Scalar, product)>;
+
+    /** \returns the product of \c *this by the scalar \a scalar */
+    EIGEN_DEVICE_FUNC
+    inline const DiagonalScaleReturnType operator*(const Scalar& scalar) const {
+      return (diagonal() * scalar).asDiagonal();
+    }
+
+    using ScaleDiagonalReturnType =
+        DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar, DiagonalVectorType, product)>;
+
+    /** \returns the product of a scalar and the diagonal matrix \a other */
+    EIGEN_DEVICE_FUNC
+    friend inline const ScaleDiagonalReturnType operator*(const Scalar& scalar, const DiagonalBase& other) {
+      return (scalar * other.diagonal()).asDiagonal();
+    }
+
+    template <typename OtherDerived>
+    using DiagonalSumReturnType = DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+        DiagonalVectorType, typename OtherDerived::DiagonalVectorType, sum)>;
+
+    /** \returns the sum of \c *this and the diagonal matrix \a other */
+    template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC inline const DiagonalSumReturnType<OtherDerived> operator+(
+        const DiagonalBase<OtherDerived>& other) const {
       return (diagonal() + other.diagonal()).asDiagonal();
     }
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    inline unspecified_expression_type
-    #else
-    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,difference) >
-    #endif
-    operator-(const DiagonalBase<OtherDerived>& other) const
-    {
+    template <typename OtherDerived>
+    using DiagonalDifferenceReturnType = DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+        DiagonalVectorType, typename OtherDerived::DiagonalVectorType, difference)>;
+
+    /** \returns the difference of \c *this and the diagonal matrix \a other */
+    template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC inline const DiagonalDifferenceReturnType<OtherDerived> operator-(
+        const DiagonalBase<OtherDerived>& other) const {
       return (diagonal() - other.diagonal()).asDiagonal();
     }
 };
 
-#endif
-
 /** \class DiagonalMatrix
-  * \ingroup Core_Module
-  *
-  * \brief Represents a diagonal matrix with its storage
-  *
-  * \tparam Scalar_ the type of coefficients
-  * \tparam SizeAtCompileTime the dimension of the matrix, or Dynamic
-  * \tparam MaxSizeAtCompileTime the dimension of the matrix, or Dynamic. This parameter is optional and defaults
-  *        to SizeAtCompileTime. Most of the time, you do not need to specify it.
-  *
-  * \sa class DiagonalWrapper
-  */
+ * \ingroup Core_Module
+ *
+ * \brief Represents a diagonal matrix with its storage
+ *
+ * \tparam Scalar_ the type of coefficients
+ * \tparam SizeAtCompileTime the dimension of the matrix, or Dynamic
+ * \tparam MaxSizeAtCompileTime the dimension of the matrix, or Dynamic. This parameter is optional and defaults
+ *        to SizeAtCompileTime. Most of the time, you do not need to specify it.
+ *
+ * \sa class DiagonalBase, class DiagonalWrapper
+ */
 
 namespace internal {
 template<typename Scalar_, int SizeAtCompileTime, int MaxSizeAtCompileTime>
@@ -241,6 +275,22 @@
     }
     #endif
 
+    typedef DiagonalWrapper<const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, DiagonalVectorType>>
+        InitializeReturnType;
+
+    /** Initializes a diagonal matrix of size SizeAtCompileTime with coefficients set to zero */
+    EIGEN_DEVICE_FUNC
+    static const InitializeReturnType Zero() { return DiagonalVectorType::Zero().asDiagonal(); }
+    /** Initializes a diagonal matrix of size \a size with coefficients set to zero */
+    EIGEN_DEVICE_FUNC
+    static const InitializeReturnType Zero(Index size) { return DiagonalVectorType::Zero(size).asDiagonal(); }
+    /** Initializes an identity matrix of size SizeAtCompileTime */
+    EIGEN_DEVICE_FUNC
+    static const InitializeReturnType Identity() { return DiagonalVectorType::Ones().asDiagonal(); }
+    /** Initializes an identity matrix of size \a size */
+    EIGEN_DEVICE_FUNC
+    static const InitializeReturnType Identity(Index size) { return DiagonalVectorType::Ones(size).asDiagonal(); }
+
     /** Resizes to given size. */
     EIGEN_DEVICE_FUNC
     inline void resize(Index size) { m_diagonal.resize(size); }
@@ -388,6 +438,6 @@
 
 } // namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif // EIGEN_DIAGONALMATRIX_H
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index 53889fe..c90e61f 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -325,7 +325,7 @@
 
   typedef std::pair<Index, Index> RowCol;
 
-  inline RowCol index_remap(Index rowId, Index colId) const
+  EIGEN_DEVICE_FUNC inline RowCol index_remap(Index rowId, Index colId) const
   {
     if(Order==ColMajor)
     {
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
new file mode 100644
index 0000000..477c50f
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -0,0 +1,1225 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Intel Corporation
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef GEMM_KERNEL_H
+#define GEMM_KERNEL_H
+
+#if EIGEN_COMP_MSVC
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#include <immintrin.h>
+#include <type_traits>
+
+#include "../../InternalHeaderCheck.h"
+
+#define SECOND_FETCH (32)
+#if (EIGEN_COMP_GNUC_STRICT != 0) && !defined(EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS)
+// Use fewer registers to load A elements to work around compiler spills. Lose a
+// bit of performance (less than ~2%).
+#define EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS
+#endif
+
+namespace Eigen {
+namespace internal {
+
+template <typename Scalar, bool is_unit_inc>
+class gemm_class {
+  using vec = typename packet_traits<Scalar>::type;
+  using vec_ymm = typename unpacket_traits<vec>::half;
+  using vec_xmm = typename unpacket_traits<vec_ymm>::half;
+  using umask_t = typename unpacket_traits<vec>::mask_t;
+
+  static constexpr bool is_f32 = sizeof(Scalar) == sizeof(float);
+  static constexpr bool is_f64 = sizeof(Scalar) == sizeof(double);
+
+#ifndef EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS
+  static constexpr bool use_less_a_regs = !is_unit_inc;
+#else
+  static constexpr bool use_less_a_regs = true;
+#endif
+#ifndef EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_B_REGS
+  static constexpr bool use_less_b_regs = !is_unit_inc;
+#else
+  static constexpr bool use_less_b_regs = true;
+#endif
+
+  static constexpr int a_regs[] = {0, 1, 2, use_less_a_regs ? 0 : 3, use_less_a_regs ? 1 : 4, use_less_a_regs ? 2 : 5};
+  static constexpr int b_regs[] = {6, use_less_b_regs ? 6 : 7};
+  static constexpr int c_regs[] = {
+      8, 16, 24, 9, 17, 25, 10, 18, 26, 11, 19, 27, 12, 20, 28, 13, 21, 29, 14, 22, 30, 15, 23, 31,
+  };
+
+  static constexpr int alpha_load_reg = 0;
+  static constexpr int c_load_regs[] = {1, 2, 6};
+
+  static constexpr int a_shift = 128;
+  static constexpr int b_shift = 128;
+
+  static constexpr int nelems_in_cache_line = is_f32 ? 16 : 8;
+  static constexpr int a_prefetch_size = nelems_in_cache_line * 2;
+  static constexpr int b_prefetch_size = nelems_in_cache_line * 8;
+
+  vec zmm[32];
+  umask_t mask;
+
+  // gemm arguments.
+  Index m;
+  const Index n, k, ldc;
+  const Index inc;
+  const Scalar *alpha;
+
+  const Scalar *a, *b;
+  Scalar *c;
+
+  const bool is_alpha1;
+  const bool is_beta0;
+
+  const Index a_stride, b_stride;
+  const Index a_off, b_off;
+
+  static EIGEN_ALWAYS_INLINE constexpr int div_up(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; }
+
+  EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
+    _mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
+  }
+
+  EIGEN_ALWAYS_INLINE void prefetch_b(const Scalar *b_addr) {
+    _mm_prefetch((char *)(b_prefetch_size + b_addr - b_shift), _MM_HINT_T0);
+  }
+
+  EIGEN_ALWAYS_INLINE void prefetch_x(const Scalar *x_addr) { _mm_prefetch((char *)(x_addr - a_shift), _MM_HINT_T2); }
+
+  EIGEN_ALWAYS_INLINE void prefetch_c(const Scalar *c_addr) {
+#if defined(__PRFCHW__) && __PRFCHW__ == 1
+    _m_prefetchw((void *)c_addr);
+#else
+    _mm_prefetch((char *)c_addr, _MM_HINT_T0);
+#endif
+  }
+
+  template <int nelems>
+  EIGEN_ALWAYS_INLINE void a_load(vec &a_reg, const Scalar *a_addr) {
+    switch (nelems * sizeof(*a_addr) * 8) {
+      default:
+      case 512 * 3:
+        a_reg = ploadu<vec>(a_addr);
+        break;
+      case 512 * 2:
+        a_reg = ploadu<vec>(a_addr);
+        break;
+      case 512 * 1:
+        a_reg = ploadu<vec>(a_addr);
+        break;
+      case 256 * 1:
+        a_reg = preinterpret<vec>(_mm512_broadcast_f64x4(ploadu<Packet4d>(reinterpret_cast<const double *>(a_addr))));
+        break;
+      case 128 * 1:
+        a_reg = preinterpret<vec>(_mm512_broadcast_f32x4(ploadu<Packet4f>(reinterpret_cast<const float *>(a_addr))));
+        break;
+      case 64 * 1:
+        a_reg = preinterpret<vec>(pload1<Packet8d>(reinterpret_cast<const double *>(a_addr)));
+        break;
+      case 32 * 1:
+        a_reg = pload1<vec>(a_addr);
+        break;
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE void b_load(vec &b_reg, const Scalar *b_addr) { b_reg = pload1<vec>(b_addr); }
+
+  template <int nelems>
+  EIGEN_ALWAYS_INLINE void c_store(Scalar *mem, vec &src) {
+    if (is_unit_inc) {
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          pstoreu(mem, src);
+          break;
+        case 512 * 2:
+          pstoreu(mem, src);
+          break;
+        case 512 * 1:
+          pstoreu(mem, src);
+          break;
+        case 256 * 1:
+          pstoreu(mem, preinterpret<vec_ymm>(src));
+          break;
+        case 128 * 1:
+          pstoreu(mem, preinterpret<vec_xmm>(src));
+          break;
+        case 64 * 1:
+          pstorel(mem, preinterpret<vec_xmm>(src));
+          break;
+        case 32 * 1:
+          pstores(mem, preinterpret<vec_xmm>(src));
+          break;
+      }
+    } else {
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          pscatter(mem, src, inc);
+          break;
+        case 512 * 2:
+          pscatter(mem, src, inc);
+          break;
+        case 512 * 1:
+          pscatter(mem, src, inc);
+          break;
+        case 256 * 1:
+          pscatter(mem, src, inc, mask);
+          break;
+        case 128 * 1:
+          pscatter(mem, src, inc, mask);
+          break;
+        case 64 * 1:
+          pscatter(mem, src, inc, mask);
+          break;
+        case 32 * 1:
+          pscatter(mem, src, inc, mask);
+          break;
+      }
+    }
+  }
+
+  template <int nelems>
+  EIGEN_ALWAYS_INLINE void vaddm(vec &dst, const Scalar *mem, vec &src, vec &reg) {
+    if (is_unit_inc) {
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          dst = padd(src, ploadu<vec>(mem));
+          break;
+        case 512 * 2:
+          dst = padd(src, ploadu<vec>(mem));
+          break;
+        case 512 * 1:
+          dst = padd(src, ploadu<vec>(mem));
+          break;
+        case 256 * 1:
+          dst = preinterpret<vec>(padd(preinterpret<vec_ymm>(src), ploadu<vec_ymm>(mem)));
+          break;
+        case 128 * 1:
+          dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadu<vec_xmm>(mem)));
+          break;
+        case 64 * 1:
+          dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
+          break;
+        case 32 * 1:
+          dst = preinterpret<vec>(padds(preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
+          break;
+      }
+    } else {
+      // Zero out scratch register
+      reg = pzero(reg);
+
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = padd(src, reg);
+          break;
+        case 512 * 2:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = padd(src, reg);
+          break;
+        case 512 * 1:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = padd(src, reg);
+          break;
+        case 256 * 1:
+          reg = preinterpret<vec>(pgather<Scalar, vec_ymm>(mem, inc));
+          dst = preinterpret<vec>(padd(preinterpret<vec_ymm>(src), preinterpret<vec_ymm>(reg)));
+          break;
+        case 128 * 1:
+          reg = preinterpret<vec>(pgather<Scalar, vec_xmm>(mem, inc));
+          dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
+          break;
+        case 64 * 1:
+          if (is_f32) {
+            reg = pgather(reg, mem, inc, mask);
+            dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
+          } else {
+            dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
+          }
+          break;
+        case 32 * 1:
+          dst = preinterpret<vec>(padds(preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
+          break;
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE void vfmadd(vec &dst, const vec &src1, const vec &src2) {
+    dst = pmadd(src1, src2, dst);
+
+#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
+    // Work around register spills for gcc and clang
+    __asm__("#" : [dst] "+v"(dst) : [src1] "%v"(src1), [src2] "v"(src2));
+#endif
+  }
+
+  template <int nelems>
+  EIGEN_ALWAYS_INLINE void vfmaddm(vec &dst, const Scalar *mem, vec &src, vec &scale, vec &reg) {
+    if (is_unit_inc) {
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          dst = pmadd(scale, src, ploadu<vec>(mem));
+          break;
+        case 512 * 2:
+          dst = pmadd(scale, src, ploadu<vec>(mem));
+          break;
+        case 512 * 1:
+          dst = pmadd(scale, src, ploadu<vec>(mem));
+          break;
+        case 256 * 1:
+          dst =
+              preinterpret<vec>(pmadd(preinterpret<vec_ymm>(scale), preinterpret<vec_ymm>(src), ploadu<vec_ymm>(mem)));
+          break;
+        case 128 * 1:
+          dst =
+              preinterpret<vec>(pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadu<vec_xmm>(mem)));
+          break;
+        case 64 * 1:
+          dst =
+              preinterpret<vec>(pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
+          break;
+        case 32 * 1:
+          dst =
+              preinterpret<vec>(pmadds(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
+          break;
+      }
+    } else {
+      // Zero out scratch register
+      reg = pzero(reg);
+
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = pmadd(scale, src, reg);
+          break;
+        case 512 * 2:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = pmadd(scale, src, reg);
+          break;
+        case 512 * 1:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = pmadd(scale, src, reg);
+          break;
+        case 256 * 1:
+          reg = preinterpret<vec>(pgather<Scalar, vec_ymm>(mem, inc));
+          dst = preinterpret<vec>(
+              pmadd(preinterpret<vec_ymm>(scale), preinterpret<vec_ymm>(src), preinterpret<vec_ymm>(reg)));
+          break;
+        case 128 * 1:
+          reg = preinterpret<vec>(pgather<Scalar, vec_xmm>(mem, inc));
+          dst = preinterpret<vec>(
+              pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
+          break;
+        case 64 * 1:
+          if (is_f32) {
+            reg = pgather(reg, mem, inc, mask);
+            dst = preinterpret<vec>(
+                pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
+          } else {
+            dst = preinterpret<vec>(
+                pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
+          }
+          break;
+        case 32 * 1:
+          dst =
+              preinterpret<vec>(pmadds(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
+          break;
+      }
+    }
+  }
+
+  template <int j, int endX, int i, int endY, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(j > endX) || (i > endY)> a_loads(const Scalar *ao) {
+    EIGEN_UNUSED_VARIABLE(ao);
+  }
+
+  template <int j, int endX, int i, int endY, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(j <= endX) && (i <= endY)> a_loads(const Scalar *ao) {
+    if (j < endX) {
+      if (i < endY) {
+        auto &a_reg = zmm[a_regs[i + (j % 2) * 3]];
+        const Scalar *a_addr = ao + nelems * j + nelems_in_cache_line * i - a_shift;
+        a_load<nelems>(a_reg, a_addr);
+
+        a_loads<j, endX, i + 1, endY, nelems>(ao);
+      } else {
+        a_loads<j + 1, endX, 0, endY, nelems>(ao);
+      }
+    }
+  }
+
+  template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(un > max_b_unroll) || (i > um_vecs)> prefetch_cs(const Scalar *co1,
+                                                                                         const Scalar *co2) {
+    EIGEN_UNUSED_VARIABLE(co1);
+    EIGEN_UNUSED_VARIABLE(co2);
+  }
+
+  /* C prefetch loop structure.
+   * for (int un = 0; un < 8; un++) {
+   *     if (b_unroll >= un + 1) {
+   *         if (un == 4) co2 = co1 + 4 * ldc;
+   *
+   *         for (int i = 0; i < um_vecs; i++) {
+   *             Scalar *co = (un + 1 <= 4) ? co1 : co2;
+   *             auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co;
+   *             prefetch_c(co + co_off);
+   *         }
+   *     }
+   * }
+   */
+
+  template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(un <= max_b_unroll) && (i <= um_vecs)> prefetch_cs(Scalar *&co1, Scalar *&co2) {
+    if (un < max_b_unroll) {
+      if (b_unroll >= un + 1) {
+        if (un == 4 && i == 0) co2 = co1 + 4 * ldc;
+
+        if (i < um_vecs) {
+          Scalar *co = (un + 1 <= 4) ? co1 : co2;
+          auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co;
+          prefetch_c(co + co_off);
+
+          prefetch_cs<un, max_b_unroll, i + 1, um_vecs, a_unroll, b_unroll>(co1, co2);
+        } else {
+          prefetch_cs<un + 1, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
+        }
+
+      } else {
+        prefetch_cs<un + 1, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
+      }
+    }
+  }
+
+  // load_c
+  template <int i, int um_vecs, int idx, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
+    EIGEN_UNUSED_VARIABLE(cox);
+    EIGEN_UNUSED_VARIABLE(alpha_reg);
+  }
+
+  template <int i, int um_vecs, int idx, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
+    if (i < um_vecs) {
+      auto &c_reg = zmm[c_regs[i + idx * 3]];
+      auto &c_load_reg = zmm[c_load_regs[i % 3]];
+      auto c_mem = cox;
+      if (is_unit_inc)
+        c_mem += i * nelems_in_cache_line;
+      else
+        c_mem += i * nelems_in_cache_line * inc;
+
+      if (!is_beta0 && is_alpha1)
+        vaddm<nelems>(c_reg, c_mem, c_reg, c_load_reg);
+      else if (!is_beta0 && !is_alpha1)
+        vfmaddm<nelems>(c_reg, c_mem, c_reg, alpha_reg, c_load_reg);
+      else if (is_beta0 && !is_alpha1)
+        c_reg = pmul(alpha_reg, c_reg);
+
+      scale_load_c<i + 1, um_vecs, idx, nelems>(cox, alpha_reg);
+    }
+  }
+
+  // store_c
+  template <int i, int um_vecs, int idx, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> write_c(Scalar *cox) {
+    EIGEN_UNUSED_VARIABLE(cox);
+  }
+
+  template <int i, int um_vecs, int idx, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> write_c(Scalar *cox) {
+    if (i < um_vecs) {
+      auto &c_reg = zmm[c_regs[i + idx * 3]];
+      auto c_mem = cox;
+      if (is_unit_inc)
+        c_mem += i * nelems_in_cache_line;
+      else
+        c_mem += i * nelems_in_cache_line * inc;
+
+      c_store<nelems>(c_mem, c_reg);
+      c_reg = pzero(c_reg);
+
+      write_c<i + 1, um_vecs, idx, nelems>(cox);
+    }
+  }
+
+  /*  C update loop structure.
+   *  co2 = co1 + ldc;
+   *
+   *  auto &alpha_reg = zmm[alpha_load_reg];
+   *  if (!is_alpha1) alpha_reg = pload1<vec>(alpha);
+   *
+   *  int idx = 0;
+   *  for (pow = 1; pow <= 8; pow <<= 1) {
+   *
+   *      if (b_unroll >= pow) {
+   *          for (count = 1; count < (pow + 1) / 2 + 1;  count++) {
+   *              if (pow >= 4) co2 += ldc;
+   *
+   *              const Scalar *cox = (idx == 0) ? co1 : co2;
+   *
+   *              const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+   *              scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
+   *              write_c<0, um_vecs, idx, a_unroll>(cox);
+   *
+   *              idx++;
+   *          }
+   *      }
+   *  }
+   *
+   *  if (b_unroll == 1)
+   *      co1 += ldc;
+   *  else
+   *      co1 = co2 + ldc;
+   */
+
+  template <int pow, int a_unroll, int idx>
+  EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
+    if (pow >= 4) cox += ldc;
+
+    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    auto &alpha_reg = zmm[alpha_load_reg];
+
+    scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
+    write_c<0, um_vecs, idx, a_unroll>(cox);
+  }
+
+  template <int pow, int a_unroll>
+  EIGEN_ALWAYS_INLINE void c_update_1pow(Scalar *&co1, Scalar *&co2) {
+    constexpr int idx = pow / 2;
+    Scalar *&cox = idx == 0 ? co1 : co2;
+
+    constexpr int max_count = (pow + 1) / 2;
+    static_assert(max_count <= 4, "Unsupported max_count.");
+
+    if (1 <= max_count) c_update_1count<pow, a_unroll, idx + 0>(cox);
+    if (2 <= max_count) c_update_1count<pow, a_unroll, idx + 1>(cox);
+    if (3 <= max_count) c_update_1count<pow, a_unroll, idx + 2>(cox);
+    if (4 <= max_count) c_update_1count<pow, a_unroll, idx + 3>(cox);
+  }
+
+  template <int max_b_unroll, int a_unroll, int b_unroll>
+  EIGEN_ALWAYS_INLINE void c_update(Scalar *&co1, Scalar *&co2) {
+    auto &alpha_reg = zmm[alpha_load_reg];
+
+    co2 = co1 + ldc;
+    if (!is_alpha1) alpha_reg = pload1<vec>(alpha);
+    if (!is_unit_inc && a_unroll < nelems_in_cache_line) mask = static_cast<umask_t>((1ull << a_unroll) - 1);
+
+    static_assert(max_b_unroll <= 8, "Unsupported max_b_unroll");
+
+    if (1 <= max_b_unroll && 1 <= b_unroll) c_update_1pow<1, a_unroll>(co1, co2);
+    if (2 <= max_b_unroll && 2 <= b_unroll) c_update_1pow<2, a_unroll>(co1, co2);
+    if (4 <= max_b_unroll && 4 <= b_unroll) c_update_1pow<4, a_unroll>(co1, co2);
+    if (8 <= max_b_unroll && 8 <= b_unroll) c_update_1pow<8, a_unroll>(co1, co2);
+
+    if (b_unroll == 1)
+      co1 += ldc;
+    else
+      co1 = co2 + ldc;
+  }
+
+  // compute
+  template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
+                                                               int &fetchB_idx, vec &b_reg) {
+    EIGEN_UNUSED_VARIABLE(ao);
+    EIGEN_UNUSED_VARIABLE(bo);
+    EIGEN_UNUSED_VARIABLE(fetchA_idx);
+    EIGEN_UNUSED_VARIABLE(fetchB_idx);
+    EIGEN_UNUSED_VARIABLE(b_reg);
+  }
+
+  template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
+                                                                int &fetchB_idx, vec &b_reg) {
+    if (um < um_vecs) {
+      auto &c_reg = zmm[c_regs[um + idx * 3]];
+      auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
+
+      vfmadd(c_reg, a_reg, b_reg);
+
+      if (!fetch_x && um == 0 &&
+          (((idx == 0 || idx == 6) && (uk % 2 == 0 || is_f64 || ktail)) ||
+           (idx == 3 && (uk % 2 == 1 || is_f64 || ktail)))) {
+        prefetch_a(ao + nelems_in_cache_line * fetchA_idx);
+        fetchA_idx++;
+      }
+
+      if (um == 0 && idx == 1 && (uk % 2 == 0 || is_f64 || ktail)) {
+        prefetch_b(bo + nelems_in_cache_line * fetchB_idx);
+        fetchB_idx++;
+      }
+
+      compute<um + 1, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
+    }
+  }
+
+  // load_a
+  template <int um, int um_vecs, int uk, int nelems, bool ktail>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> load_a(const Scalar *ao) {
+    EIGEN_UNUSED_VARIABLE(ao);
+  }
+
+  template <int um, int um_vecs, int uk, int nelems, bool ktail>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> load_a(const Scalar *ao) {
+    if (um < um_vecs) {
+      auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
+      const Scalar *a_addr = ao + nelems * (1 + !ktail * !use_less_a_regs + uk) + nelems_in_cache_line * um - a_shift;
+      a_load<nelems>(a_reg, a_addr);
+
+      load_a<um + 1, um_vecs, uk, nelems, ktail>(ao);
+    }
+  }
+  template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(count > (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
+                                                                                 const Scalar *const &ao,
+                                                                                 const Scalar *const &bo, Scalar *&co2,
+                                                                                 int &fetchA_idx, int &fetchB_idx) {
+    EIGEN_UNUSED_VARIABLE(aa);
+    EIGEN_UNUSED_VARIABLE(ao);
+    EIGEN_UNUSED_VARIABLE(bo);
+    EIGEN_UNUSED_VARIABLE(co2);
+    EIGEN_UNUSED_VARIABLE(fetchA_idx);
+    EIGEN_UNUSED_VARIABLE(fetchB_idx);
+  }
+
+  template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(count <= (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
+                                                                                  const Scalar *const &ao,
+                                                                                  const Scalar *const &bo, Scalar *&co2,
+                                                                                  int &fetchA_idx, int &fetchB_idx) {
+    const int idx = (pow / 2) + count;
+
+    if (count < (pow + 1) / 2) {
+      auto &b_reg = zmm[b_regs[idx % 2]];
+
+      if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa);
+      if (fetch_x && uk == 3 && idx == 4) aa += 8;
+
+      if (b_unroll >= pow) {
+        compute<0, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
+
+        const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) * !use_less_b_regs - b_shift;
+        b_load(b_reg, b_addr);
+      }
+
+      // Go to the next count.
+      innerkernel_1pow<uk, pow, count + 1, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+                                                                                       fetchB_idx);
+
+    } else {
+      // Maybe prefetch C data after count-loop.
+      if (pow == 2 && c_fetch) {
+        if (uk % 3 == 0 && uk > 0) {
+          co2 += ldc;
+        } else {
+          prefetch_c(co2 + (uk % 3) * nelems_in_cache_line);
+        }
+      }
+    }
+  }
+
+  template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
+  EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
+                                           Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
+    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+
+    if (max_b_unroll >= 1)
+      innerkernel_1pow<uk, 1, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (max_b_unroll >= 2)
+      innerkernel_1pow<uk, 2, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (max_b_unroll >= 4)
+      innerkernel_1pow<uk, 4, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (max_b_unroll >= 8)
+      innerkernel_1pow<uk, 8, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+
+    // Load A after pow-loop.
+    load_a<0, um_vecs, uk, a_unroll, ktail>(ao);
+  }
+
+  /*  Inner kernel loop structure.
+   *  for (int uk = 0; uk < kfactor; uk++) {
+   *      int idx = 0;
+   *
+   *      for (pow = 1; pow < max_b_unroll << 1; pow <<= 1) {
+   *          for (int count = 0; count < (pow + 1) / 2; count++) {
+   *              auto &b_reg = zmm[b_regs[idx % 2]];
+   *
+   *              if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa);
+   *              if (fetch_x && uk == 3 && idx == 4) aa += 8;
+   *
+   *              if (b_unroll >= pow) {
+   *                  compute<0, um_vecs, idx, uk, fetchx, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
+   *
+   *                  const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) - b_shift ;
+   *                  b_load(b_reg, b_addr);
+   *              }
+   *              idx++;
+   *          }
+   *
+   *          Maybe prefetch C data.
+   *          if (pow == 2 && c_fetch) {
+   *              if (uk % 3 == 0 && uk > 0) {
+   *                  co2 += ldc;
+   *              } else {
+   *                  prefetch_c(co2 + (uk % 3) * nelems_in_cache_line);
+   *              }
+   *          }
+   *      }
+   *
+   *      Load A.
+   *      load_a<0, um_vecs, uk, ktail, a_unroll>(ao);
+   *  }
+   *
+   *  Advance A/B pointers after uk-loop.
+   *  ao += a_unroll * kfactor;
+   *  bo += b_unroll * kfactor;
+   */
+
+  template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch>
+  EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) {
+    int fetchA_idx = 0;
+    int fetchB_idx = 0;
+
+    const bool fetch_x = k_factor == max_k_factor;
+    const bool ktail = k_factor == 1;
+
+    static_assert(k_factor <= 4 && k_factor > 0, "innerkernel maximum k_factor supported is 4");
+
+    if (k_factor > 0)
+      innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+                                                                                    fetchB_idx);
+    if (k_factor > 1)
+      innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+                                                                                    fetchB_idx);
+    if (k_factor > 2)
+      innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+                                                                                    fetchB_idx);
+    if (k_factor > 3)
+      innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+                                                                                    fetchB_idx);
+
+    // Advance A/B pointers after uk-loop.
+    ao += a_unroll * k_factor;
+    bo += b_unroll * k_factor;
+  }
+
+  template <int a_unroll, int b_unroll, int max_b_unroll>
+  EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    if (!use_less_a_regs)
+      a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
+    else
+      a_loads<0, 1, 0, um_vecs, a_unroll>(ao);
+
+    b_load(zmm[b_regs[0]], bo - b_shift + 0);
+    if (!use_less_b_regs) b_load(zmm[b_regs[1]], bo - b_shift + 1);
+
+#ifndef SECOND_FETCH
+    prefetch_cs<0, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
+#endif  // SECOND_FETCH
+
+    // Unrolling k-loop by a factor of 4.
+    const int max_k_factor = 4;
+    Index loop_count = k / max_k_factor;
+
+    if (loop_count > 0) {
+#ifdef SECOND_FETCH
+      loop_count -= SECOND_FETCH;
+#endif
+      while (loop_count > 0) {
+        innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
+        loop_count--;
+      }
+#ifdef SECOND_FETCH
+      co2 = co1 + nelems_in_cache_line - 1;
+
+      loop_count += b_unroll;
+      while (loop_count > 0) {
+        innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 1>(aa, ao, bo, co2);
+        loop_count--;
+      }
+
+      loop_count += SECOND_FETCH - b_unroll;
+      while (loop_count > 0) {
+        innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
+        loop_count--;
+      }
+#endif
+    }
+
+    // k-loop remainder handling.
+    loop_count = k % max_k_factor;
+    while (loop_count > 0) {
+      innerkernel<a_unroll, b_unroll, 1, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
+      loop_count--;
+    }
+
+    // Update C matrix.
+    c_update<max_b_unroll, a_unroll, b_unroll>(co1, co2);
+  }
+
+  template <int a_unroll, int b_unroll, int max_b_unroll>
+  EIGEN_ALWAYS_INLINE void nloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+    // Set A matrix pointer.
+    ao = a + a_off * a_unroll;
+
+    // Set B matrix pointer if needed.
+    bo += b_unroll * b_off;
+
+    kloop<a_unroll, b_unroll, max_b_unroll>(aa, ao, bo, co1, co2);
+
+    // Advance B matrix pointer if needed.
+    bo += b_unroll * (b_stride - k - b_off);
+
+    // Advance prefetch A pointer.
+    aa += 16;
+  }
+
+  template <int a_unroll, int max_a_unroll, int max_b_unroll>
+  EIGEN_ALWAYS_INLINE void mloop(const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+    // Set prefetch A pointers.
+    const Scalar *aa = a + a_unroll * a_stride;
+
+    // Set C matrix pointers.
+    co1 = c;
+    if (a_unroll >= max_a_unroll) co2 = c + 2 * ldc;
+    if (is_unit_inc)
+      c += a_unroll;
+    else
+      c += a_unroll * inc;
+
+    // Set B matrix pointer.
+    bo = b;
+
+    // Main n-loop.
+    for (Index i = n / max_b_unroll; i > 0; i--) nloop<a_unroll, max_b_unroll, max_b_unroll>(aa, ao, bo, co1, co2);
+
+    // n-remainders.
+    if (n & 4 && max_b_unroll > 4) nloop<a_unroll, 4, max_b_unroll>(aa, ao, bo, co1, co2);
+#if 0
+        if (n & 2 && max_b_unroll > 2) nloop<a_unroll, 2, max_b_unroll>(aa, ao, bo, co1, co2);
+        if (n & 1 && max_b_unroll > 1) nloop<a_unroll, 1, max_b_unroll>(aa, ao, bo, co1, co2);
+#else
+    // Copy kernels don't support tails of n = 2 for single/double precision.
+    // Loop over ones.
+    int n_rem = 2 * ((n & 2) != 0) + 1 * ((n & 1) != 0);
+    while (n_rem > 0) {
+      nloop<a_unroll, 1, max_b_unroll>(aa, ao, bo, co1, co2);
+      n_rem--;
+    }
+#endif
+
+    // Advance A matrix pointer.
+    a = ao + a_unroll * (a_stride - k - a_off);
+  }
+
+ public:
+  // Compute kernel unrolling C matrix by max_a_unroll x max_b_unroll.
+  template <int max_a_unroll, int max_b_unroll>
+  EIGEN_ALWAYS_INLINE void compute_kern() {
+    a -= -a_shift;
+    b -= -b_shift;
+
+    const Scalar *ao = nullptr;
+    const Scalar *bo = nullptr;
+    Scalar *co1 = nullptr;
+    Scalar *co2 = nullptr;
+
+    // Main m-loop.
+    for (; m >= max_a_unroll; m -= max_a_unroll) mloop<max_a_unroll, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+
+    // m-remainders.
+    if (m & 32 && max_a_unroll > 32) mloop<32, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 16 && max_a_unroll > 16) mloop<16, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 8 && max_a_unroll > 8) mloop<8, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 4 && max_a_unroll > 4) mloop<4, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 2 && max_a_unroll > 2 && is_f64) mloop<2, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 1 && max_a_unroll > 1 && is_f64) mloop<1, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+
+    // Copy kernels don't support tails of m = 2 for single precision.
+    // Loop over ones.
+    if (is_f32) {
+      int m_rem = 2 * ((m & 2) != 0) + 1 * ((m & 1) != 0);
+      while (m_rem > 0) {
+        mloop<1, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+        m_rem--;
+      }
+    }
+  }
+
+  gemm_class(Index m_, Index n_, Index k_, Index ldc_, Index inc_, const Scalar *alpha_, const Scalar *a_,
+             const Scalar *b_, Scalar *c_, bool is_alpha1_, bool is_beta0_, Index a_stride_, Index b_stride_,
+             Index a_off_, Index b_off_)
+      : m(m_),
+        n(n_),
+        k(k_),
+        ldc(ldc_),
+        inc(inc_),
+        alpha(alpha_),
+        a(a_),
+        b(b_),
+        c(c_),
+        is_alpha1(is_alpha1_),
+        is_beta0(is_beta0_),
+        a_stride(a_stride_),
+        b_stride(b_stride_),
+        a_off(a_off_),
+        b_off(b_off_) {
+    // Zero out all accumulation registers.
+    zmm[8] = pzero(zmm[8]);
+    zmm[9] = pzero(zmm[9]);
+    zmm[10] = pzero(zmm[10]);
+    zmm[11] = pzero(zmm[11]);
+    zmm[12] = pzero(zmm[12]);
+    zmm[13] = pzero(zmm[13]);
+    zmm[14] = pzero(zmm[14]);
+    zmm[15] = pzero(zmm[15]);
+    zmm[16] = pzero(zmm[16]);
+    zmm[17] = pzero(zmm[17]);
+    zmm[18] = pzero(zmm[18]);
+    zmm[19] = pzero(zmm[19]);
+    zmm[20] = pzero(zmm[20]);
+    zmm[21] = pzero(zmm[21]);
+    zmm[22] = pzero(zmm[22]);
+    zmm[23] = pzero(zmm[23]);
+    zmm[24] = pzero(zmm[24]);
+    zmm[25] = pzero(zmm[25]);
+    zmm[26] = pzero(zmm[26]);
+    zmm[27] = pzero(zmm[27]);
+    zmm[28] = pzero(zmm[28]);
+    zmm[29] = pzero(zmm[29]);
+    zmm[30] = pzero(zmm[30]);
+    zmm[31] = pzero(zmm[31]);
+  }
+};
+
+// Compute kernel with max unroll support of:
+//   Single precision:
+//     max_a_unroll: 48, 32, 16, 8, 4, 2, 1
+//     max_b_unroll: 8, 4, 2, 1
+//   Double precision:
+//     max_a_unroll: 24, 16, 8, 4, 2, 1
+//     max_b_unroll: 8, 4, 2, 1
+template <typename Scalar, int max_a_unroll, int max_b_unroll, bool is_alpha1, bool is_beta0, bool is_unit_inc>
+EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar *alpha, const Scalar *a, const Scalar *b,
+                                        Scalar *c, Index ldc, Index inc = 1, Index a_stride = -1, Index b_stride = -1,
+                                        Index a_off = 0, Index b_off = 0) {
+  if (a_stride == -1) a_stride = k;
+  if (b_stride == -1) b_stride = k;
+
+  gemm_class<Scalar, is_unit_inc> g(m, n, k, ldc, inc, alpha, a, b, c, is_alpha1, is_beta0, a_stride, b_stride, a_off,
+                                    b_off);
+  g.template compute_kern<max_a_unroll, max_b_unroll>();
+}
+
+template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
+class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
+    : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
+  using Base = gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_>;
+
+ public:
+  enum { nr = Base::Vectorizable ? 8 : 4 };
+};
+
+template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
+class gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
+    : public gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
+  using Base = gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_>;
+
+ public:
+  enum { nr = Base::Vectorizable ? 8 : 4 };
+};
+
+template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMode> {
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  enum { PacketSize = packet_traits<Scalar>::size };
+  EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0);
+};
+
+template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMode>::operator()(
+    Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset) {
+  constexpr int nr = 8;
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
+  Index count = 0;
+  const Index peeled_k = (depth / PacketSize) * PacketSize;
+  if (nr >= 8) {
+    for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+      // skip what we have before
+      if (PanelMode) count += 8 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+      const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
+      const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
+      const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
+      const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
+      Index k = 0;
+      if ((PacketSize % 8) == 0)  // TODO enable vectorized transposition for PacketSize==4
+      {
+        for (; k < peeled_k; k += PacketSize) {
+          PacketBlock<Packet, (PacketSize % 8) == 0 ? 8 : PacketSize> kernel;
+
+          kernel.packet[0] = dm0.template loadPacket<Packet>(k);
+          kernel.packet[1] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3] = dm3.template loadPacket<Packet>(k);
+          kernel.packet[4] = dm4.template loadPacket<Packet>(k);
+          kernel.packet[5] = dm5.template loadPacket<Packet>(k);
+          kernel.packet[6] = dm6.template loadPacket<Packet>(k);
+          kernel.packet[7] = dm7.template loadPacket<Packet>(k);
+
+          ptranspose(kernel);
+
+          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
+          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
+          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
+          pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel.packet[4 % PacketSize]));
+          pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel.packet[5 % PacketSize]));
+          pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel.packet[6 % PacketSize]));
+          pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel.packet[7 % PacketSize]));
+          count += 8 * PacketSize;
+        }
+      }
+      for (; k < depth; k++) {
+        blockB[count + 0] = cj(dm0(k));
+        blockB[count + 1] = cj(dm1(k));
+        blockB[count + 2] = cj(dm2(k));
+        blockB[count + 3] = cj(dm3(k));
+        blockB[count + 4] = cj(dm4(k));
+        blockB[count + 5] = cj(dm5(k));
+        blockB[count + 6] = cj(dm6(k));
+        blockB[count + 7] = cj(dm7(k));
+        count += 8;
+      }
+      // skip what we have after
+      if (PanelMode) count += 8 * (stride - offset - depth);
+    }
+  }
+
+  if (nr >= 4) {
+    for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+      // skip what we have before
+      if (PanelMode) count += 4 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if ((PacketSize % 4) == 0)  // TODO enable vectorized transposition for PacketSize==2 ??
+      {
+        for (; k < peeled_k; k += PacketSize) {
+          PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
+          kernel.packet[0] = dm0.template loadPacket<Packet>(k);
+          kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
+          ptranspose(kernel);
+          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
+          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
+          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
+          count += 4 * PacketSize;
+        }
+      }
+      for (; k < depth; k++) {
+        blockB[count + 0] = cj(dm0(k));
+        blockB[count + 1] = cj(dm1(k));
+        blockB[count + 2] = cj(dm2(k));
+        blockB[count + 3] = cj(dm3(k));
+        count += 4;
+      }
+      // skip what we have after
+      if (PanelMode) count += 4 * (stride - offset - depth);
+    }
+  }
+
+  // copy the remaining columns one at a time (nr==1)
+  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+    if (PanelMode) count += offset;
+    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
+    for (Index k = 0; k < depth; k++) {
+      blockB[count] = cj(dm0(k));
+      count += 1;
+    }
+    if (PanelMode) count += (stride - offset - depth);
+  }
+}
+
+template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, RowMajor, Conjugate, PanelMode> {
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  enum {
+    PacketSize = packet_traits<Scalar>::size,
+    HalfPacketSize = unpacket_traits<HalfPacket>::size,
+    QuarterPacketSize = unpacket_traits<QuarterPacket>::size
+  };
+  EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) {
+    constexpr int nr = 8;
+    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(offset);
+    eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
+    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
+    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+    Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
+    Index count = 0;
+
+    if (nr >= 8) {
+      for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+        // skip what we have before
+        if (PanelMode) count += 8 * offset;
+        for (Index k = 0; k < depth; k++) {
+          if (PacketSize == 8) {
+            // Packet A = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2]);
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+          } else if (HasHalf && HalfPacketSize == 8) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+          } else if (HasQuarter && QuarterPacketSize == 8) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+          } else if (PacketSize == 4) {
+            // Packet A = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2]);
+            // Packet B = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2 + PacketSize]);
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            Packet B = rhs.template loadPacket<Packet>(k, j2 + PacketSize);
+            pstoreu(blockB + count, cj.pconj(A));
+            pstoreu(blockB + count + PacketSize, cj.pconj(B));
+          } else {
+            // const Scalar* b0 = &rhs.data()[k*rhs.stride() + j2];
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count + 0] = cj(dm0(0));
+            blockB[count + 1] = cj(dm0(1));
+            blockB[count + 2] = cj(dm0(2));
+            blockB[count + 3] = cj(dm0(3));
+            blockB[count + 4] = cj(dm0(4));
+            blockB[count + 5] = cj(dm0(5));
+            blockB[count + 6] = cj(dm0(6));
+            blockB[count + 7] = cj(dm0(7));
+          }
+          count += 8;
+        }
+        // skip what we have after
+        if (PanelMode) count += 8 * (stride - offset - depth);
+      }
+    }
+
+    if (nr >= 4) {
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        // skip what we have before
+        if (PanelMode) count += 4 * offset;
+        for (Index k = 0; k < depth; k++) {
+          if (PacketSize == 4) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += PacketSize;
+          } else if (HasHalf && HalfPacketSize == 4) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += HalfPacketSize;
+          } else if (HasQuarter && QuarterPacketSize == 4) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += QuarterPacketSize;
+          } else {
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count + 0] = cj(dm0(0));
+            blockB[count + 1] = cj(dm0(1));
+            blockB[count + 2] = cj(dm0(2));
+            blockB[count + 3] = cj(dm0(3));
+            count += 4;
+          }
+        }
+        // skip what we have after
+        if (PanelMode) count += 4 * (stride - offset - depth);
+      }
+    }
+    // copy the remaining columns one at a time (nr==1)
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      if (PanelMode) count += offset;
+      for (Index k = 0; k < depth; k++) {
+        blockB[count] = cj(rhs(k, j2));
+        count += 1;
+      }
+      if (PanelMode) count += stride - offset - depth;
+    }
+  }
+};
+
+template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs> {
+  EIGEN_ALWAYS_INLINE
+  void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth,
+                  Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
+                  Index offsetB = 0);
+};
+
+template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, Index cols,
+    Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  if (res.incr() == 1) {
+    if (alpha == 1) {
+      gemm_kern_avx512<Scalar, mr, 8, true, false, true>(rows, cols, depth, &alpha, blockA, blockB,
+                                                         (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                         strideB, offsetA, offsetB);
+    } else {
+      gemm_kern_avx512<Scalar, mr, 8, false, false, true>(rows, cols, depth, &alpha, blockA, blockB,
+                                                          (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                          strideB, offsetA, offsetB);
+    }
+  } else {
+    if (alpha == 1) {
+      gemm_kern_avx512<Scalar, mr, 8, true, false, false>(rows, cols, depth, &alpha, blockA, blockB,
+                                                          (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                          strideB, offsetA, offsetB);
+    } else {
+      gemm_kern_avx512<Scalar, mr, 8, false, false, false>(rows, cols, depth, &alpha, blockA, blockB,
+                                                           (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                           strideB, offsetA, offsetB);
+    }
+  }
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // GEMM_KERNEL_H
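Note on how the new kernels are reached: user code never calls gemm_kern_avx512 directly. The gebp_traits specializations above raise nr to 8 for float/double when vectorization is available, the gemm_pack_rhs specializations pack the right-hand side in panels of 8 columns, and the gebp_kernel specialization dispatches to the matching gemm_kern_avx512 instantiation based on alpha and the output increment. A minimal sketch of code that would exercise this path, assuming a build with AVX-512 enabled (e.g. -march=skylake-avx512) so this header is active:

    #include <Eigen/Dense>

    int main() {
      // An ordinary float (or double) product is enough to reach the new
      // gebp_kernel -> gemm_kern_avx512 dispatch; no new API is involved.
      Eigen::MatrixXf A = Eigen::MatrixXf::Random(512, 512);
      Eigen::MatrixXf B = Eigen::MatrixXf::Random(512, 512);
      Eigen::MatrixXf C = A * B;
      return C.allFinite() ? 0 : 1;
    }
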
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 337001b..aab066a 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -927,6 +927,35 @@
   EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_pd(to, mask, from);
 }
 
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from,
+    Index stride, typename unpacket_traits<Packet>::mask_t umask);
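+// In the masked overloads below, bit i of `umask` enables lane i: enabled lanes
+// are gathered from from[i * stride], while disabled lanes keep the
+// corresponding value of `src`.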
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const Packet16f& src,
+                                                             const float* from,
+                                                             Index stride,
+                                                             uint16_t umask) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier =
+      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  __mmask16 mask = static_cast<__mmask16>(umask);
+
+  return _mm512_mask_i32gather_ps(src, mask, indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const Packet8d& src,
+                                                            const double* from,
+                                                            Index stride,
+                                                            uint8_t umask) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  __mmask8 mask = static_cast<__mmask8>(umask);
+
+  return _mm512_mask_i32gather_pd(src, mask, indices, from, 8);
+}
+
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
                                                              Index stride) {
@@ -956,6 +985,33 @@
   return _mm512_i32gather_epi32(indices, from, 4);
 }
 
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from,
+    Index stride, typename unpacket_traits<Packet>::mask_t umask);
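+// Masked counterpart of pscatter: bit i of `umask` enables lane i, which is
+// written to to[i * stride]; disabled lanes are left untouched in memory.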
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
+                                                         const Packet16f& from,
+                                                         Index stride,
+                                                         uint16_t umask) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier =
+      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  _mm512_mask_i32scatter_ps(to, mask, indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
+                                                         const Packet8d& from,
+                                                         Index stride,
+                                                         uint8_t umask) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  __mmask8 mask = static_cast<__mmask8>(umask);
+  _mm512_mask_i32scatter_pd(to, mask, indices, from, 8);
+}
+
 template <>
 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
                                                          const Packet16f& from,
@@ -1450,28 +1506,24 @@
   kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4),_mm512_castps_pd(T6)));
   kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5),_mm512_castps_pd(T7)));
   kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5),_mm512_castps_pd(T7)));
-  
-  T0 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[4]), 0x4E));
-  T0 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[0], T0);
-  T4 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[0]), 0x4E));
-  T4 = _mm512_mask_blend_ps(0xF0F0, T4, kernel.packet[4]);
-  T1 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[5]), 0x4E));
-  T1 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[1], T1);
-  T5 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[1]), 0x4E));
-  T5 = _mm512_mask_blend_ps(0xF0F0, T5, kernel.packet[5]);
-  T2 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[6]), 0x4E));
-  T2 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[2], T2);
-  T6 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[2]), 0x4E));
-  T6 = _mm512_mask_blend_ps(0xF0F0, T6, kernel.packet[6]);
-  T3 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[7]), 0x4E));
-  T3 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[3], T3);
-  T7 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[3]), 0x4E));
-  T7 = _mm512_mask_blend_ps(0xF0F0, T7, kernel.packet[7]);
 
-  kernel.packet[0] = T0; kernel.packet[1] = T1;
-  kernel.packet[2] = T2; kernel.packet[3] = T3;
-  kernel.packet[4] = T4; kernel.packet[5] = T5;
-  kernel.packet[6] = T6; kernel.packet[7] = T7;
+  T0 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0x44);
+  T1 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0xee);
+  T2 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0x44);
+  T3 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0xee);
+  T4 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0x44);
+  T5 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0xee);
+  T6 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0x44);
+  T7 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0xee);
+
+  kernel.packet[0] = _mm512_shuffle_f32x4(T0, T2, 0x88);
+  kernel.packet[2] = _mm512_shuffle_f32x4(T0, T2, 0xdd);
+  kernel.packet[1] = _mm512_shuffle_f32x4(T4, T6, 0x88);
+  kernel.packet[3] = _mm512_shuffle_f32x4(T4, T6, 0xdd);
+  kernel.packet[4] = _mm512_shuffle_f32x4(T1, T3, 0x88);
+  kernel.packet[6] = _mm512_shuffle_f32x4(T1, T3, 0xdd);
+  kernel.packet[5] = _mm512_shuffle_f32x4(T5, T7, 0x88);
+  kernel.packet[7] = _mm512_shuffle_f32x4(T5, T7, 0xdd);
 }
 
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
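For reference, a minimal sketch of how the new masked pgather/pscatter overloads can be invoked. This is internal API, only available in an AVX-512 build where Packet16f exists, and uint16_t is assumed here to match unpacket_traits<Packet16f>::mask_t:

    #include <Eigen/Core>
    #include <cstdint>

    // Gather 10 floats spaced `stride` elements apart into a Packet16f;
    // the six masked-off lanes keep the value of `fill`.
    void gather10(const float* col, Eigen::Index stride, float* out16) {
      using namespace Eigen::internal;
      const uint16_t umask = 0x03FF;                // enable lanes 0..9
      Packet16f fill = pset1<Packet16f>(0.0f);
      Packet16f v = pgather<float, Packet16f>(fill, col, stride, umask);
      pstoreu(out16, v);                            // out16[10..15] == 0.0f
    }
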
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index 4b81bf9..1b351ea 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -12,7 +12,21 @@
 
 #include "../../InternalHeaderCheck.h"
 
-#define EIGEN_USE_AVX512_TRSM_KERNELS // Comment out to prevent using optimized trsm kernels.
+#if !defined(EIGEN_USE_AVX512_TRSM_KERNELS)
+#define EIGEN_USE_AVX512_TRSM_KERNELS 1
+#endif
+
+#if EIGEN_USE_AVX512_TRSM_KERNELS
+#if !defined(EIGEN_USE_AVX512_TRSM_R_KERNELS)
+#define EIGEN_USE_AVX512_TRSM_R_KERNELS 1
+#endif
+#if !defined(EIGEN_USE_AVX512_TRSM_L_KERNELS)
+#define EIGEN_USE_AVX512_TRSM_L_KERNELS 1
+#endif
+#else  // EIGEN_USE_AVX512_TRSM_KERNELS == 0
+#define EIGEN_USE_AVX512_TRSM_R_KERNELS 0
+#define EIGEN_USE_AVX512_TRSM_L_KERNELS 0
+#endif
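+// These switches now default to 1; a project can pre-define any of them to 0
+// (e.g. `-DEIGEN_USE_AVX512_TRSM_KERNELS=0`, or only
+// `-DEIGEN_USE_AVX512_TRSM_L_KERNELS=0`) before including Eigen to opt out.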
 
 #if defined(EIGEN_HAS_CXX17_IFCONSTEXPR)
 #define EIGEN_IF_CONSTEXPR(X) if constexpr (X)
@@ -28,11 +42,11 @@
 namespace Eigen {
 namespace internal {
 
-#define EIGEN_AVX_MAX_NUM_ACC (24L)
-#define EIGEN_AVX_MAX_NUM_ROW (8L)  // Denoted L in code.
-#define EIGEN_AVX_MAX_K_UNROL (4L)
-#define EIGEN_AVX_B_LOAD_SETS (2L)
-#define EIGEN_AVX_MAX_A_BCAST (2L)
+#define EIGEN_AVX_MAX_NUM_ACC (int64_t(24))
+#define EIGEN_AVX_MAX_NUM_ROW (int64_t(8))  // Denoted L in code.
+#define EIGEN_AVX_MAX_K_UNROL (int64_t(4))
+#define EIGEN_AVX_B_LOAD_SETS (int64_t(2))
+#define EIGEN_AVX_MAX_A_BCAST (int64_t(2))
 typedef Packet16f vecFullFloat;
 typedef Packet8d vecFullDouble;
 typedef Packet8f vecHalfFloat;
@@ -42,8 +56,7 @@
 // Note: this depends on macros and typedefs above.
 #include "TrsmUnrolls.inc"
 
-
-#if defined(EIGEN_USE_AVX512_TRSM_KERNELS) && (EIGEN_COMP_CLANG != 0)
+#if (EIGEN_USE_AVX512_TRSM_KERNELS) && (EIGEN_COMP_CLANG != 0)
 /**
  * For smaller problem sizes, and certain compilers, using the optimized kernels trsmKernelL/R directly
  * is faster than the packed versions in TriangularSolverMatrix.h.
@@ -60,27 +73,47 @@
  *  M = Dimension of triangular matrix
  *
  */
-#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS // Comment out to disable no-copy dispatch
-template <typename Scalar>
-int64_t avx512_trsm_cutoff(int64_t L2Size, int64_t N, double L2Cap){
-  const int64_t U3 = 3*packet_traits<Scalar>::size;
-  const int64_t MaxNb = 5*U3;
-  int64_t Nb = std::min(MaxNb, N);
-  double cutoff_d = (((L2Size*L2Cap)/(sizeof(Scalar)))-(EIGEN_AVX_MAX_NUM_ROW)*Nb)/
-    ((EIGEN_AVX_MAX_NUM_ROW)+Nb);
-  int64_t cutoff_l = static_cast<int64_t>(cutoff_d);
-  return (cutoff_l/EIGEN_AVX_MAX_NUM_ROW)*EIGEN_AVX_MAX_NUM_ROW;
-}
+#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 1
 #endif
 
+#if EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS
+
+#if EIGEN_USE_AVX512_TRSM_R_KERNELS
+#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 1
+#endif  // !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS)
+#endif
+
+#if EIGEN_USE_AVX512_TRSM_L_KERNELS
+#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 1
+#endif
+#endif  // EIGEN_USE_AVX512_TRSM_L_KERNELS
+
+#else  // EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS == 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0
+#endif  // EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS
+
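+// Example: for float, PacketSize = 16, so U3 = 48 and MaxNb = 240. With, say,
+// L2Size = 512 KiB, L2Cap = 0.5 and N >= 240:
+//   ((524288 * 0.5) / 4 - 8 * 240) / (8 + 240) = 63616 / 248 ~= 256.5,
+// which truncates to 256 (already a multiple of EIGEN_AVX_MAX_NUM_ROW).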
+template <typename Scalar>
+int64_t avx512_trsm_cutoff(int64_t L2Size, int64_t N, double L2Cap) {
+  const int64_t U3 = 3 * packet_traits<Scalar>::size;
+  const int64_t MaxNb = 5 * U3;
+  int64_t Nb = std::min(MaxNb, N);
+  double cutoff_d =
+      (((L2Size * L2Cap) / (sizeof(Scalar))) - (EIGEN_AVX_MAX_NUM_ROW)*Nb) / ((EIGEN_AVX_MAX_NUM_ROW) + Nb);
+  int64_t cutoff_l = static_cast<int64_t>(cutoff_d);
+  return (cutoff_l / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
+}
+#endif
 
 /**
  * Used by gemmKernel for the case A/B row-major and C col-major.
  */
 template <typename Scalar, typename vec, int64_t unrollM, int64_t unrollN, bool remM, bool remN>
-static EIGEN_ALWAYS_INLINE
-void transStoreC(PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
-                 Scalar *C_arr, int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
+static EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                            Scalar *C_arr, int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
   EIGEN_UNUSED_VARIABLE(remN_);
   EIGEN_UNUSED_VARIABLE(remM_);
   using urolls = unrolls::trans<Scalar>;
@@ -89,76 +122,76 @@
   constexpr int64_t U2 = urolls::PacketSize * 2;
   constexpr int64_t U1 = urolls::PacketSize * 1;
 
-  static_assert( unrollN == U1 || unrollN == U2 || unrollN == U3, "unrollN should be a multiple of PacketSize");
-  static_assert( unrollM == EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW");
+  static_assert(unrollN == U1 || unrollN == U2 || unrollN == U3, "unrollN should be a multiple of PacketSize");
+  static_assert(unrollM == EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW");
 
   urolls::template transpose<unrollN, 0>(zmm);
   EIGEN_IF_CONSTEXPR(unrollN > U2) urolls::template transpose<unrollN, 2>(zmm);
   EIGEN_IF_CONSTEXPR(unrollN > U1) urolls::template transpose<unrollN, 1>(zmm);
 
-  static_assert( (remN && unrollN == U1) || !remN, "When handling N remainder set unrollN=U1");
+  static_assert((remN && unrollN == U1) || !remN, "When handling N remainder set unrollN=U1");
   EIGEN_IF_CONSTEXPR(!remN) {
-    urolls::template storeC<std::min(unrollN, U1), unrollN, 0, remM>(C_arr, LDC, zmm,  remM_);
+    urolls::template storeC<std::min(unrollN, U1), unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
     EIGEN_IF_CONSTEXPR(unrollN > U1) {
-      constexpr int64_t unrollN_ = std::min(unrollN-U1, U1);
-      urolls::template storeC<unrollN_, unrollN, 1, remM>(C_arr + U1*LDC, LDC, zmm,  remM_);
+      constexpr int64_t unrollN_ = std::min(unrollN - U1, U1);
+      urolls::template storeC<unrollN_, unrollN, 1, remM>(C_arr + U1 * LDC, LDC, zmm, remM_);
     }
     EIGEN_IF_CONSTEXPR(unrollN > U2) {
-      constexpr int64_t unrollN_ = std::min(unrollN-U2, U1);
-      urolls:: template storeC<unrollN_, unrollN, 2, remM>(C_arr + U2*LDC, LDC, zmm,  remM_);
+      constexpr int64_t unrollN_ = std::min(unrollN - U2, U1);
+      urolls::template storeC<unrollN_, unrollN, 2, remM>(C_arr + U2 * LDC, LDC, zmm, remM_);
     }
   }
   else {
-    EIGEN_IF_CONSTEXPR( (std::is_same<Scalar, float>::value) ) {
+    EIGEN_IF_CONSTEXPR((std::is_same<Scalar, float>::value)) {
       // Note: without "if constexpr" this section of code will also be
       // parsed by the compiler so each of the storeC will still be instantiated.
       // We use enable_if in aux_storeC to set it to an empty function for
       // these cases.
-      if(remN_ == 15)
+      if (remN_ == 15)
         urolls::template storeC<15, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 14)
+      else if (remN_ == 14)
         urolls::template storeC<14, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 13)
+      else if (remN_ == 13)
         urolls::template storeC<13, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 12)
+      else if (remN_ == 12)
         urolls::template storeC<12, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 11)
+      else if (remN_ == 11)
         urolls::template storeC<11, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 10)
+      else if (remN_ == 10)
         urolls::template storeC<10, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 9)
+      else if (remN_ == 9)
         urolls::template storeC<9, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 8)
+      else if (remN_ == 8)
         urolls::template storeC<8, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 7)
+      else if (remN_ == 7)
         urolls::template storeC<7, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 6)
+      else if (remN_ == 6)
         urolls::template storeC<6, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 5)
+      else if (remN_ == 5)
         urolls::template storeC<5, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 4)
+      else if (remN_ == 4)
         urolls::template storeC<4, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 3)
+      else if (remN_ == 3)
         urolls::template storeC<3, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 2)
+      else if (remN_ == 2)
         urolls::template storeC<2, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 1)
+      else if (remN_ == 1)
         urolls::template storeC<1, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
     }
     else {
-      if(remN_ == 7)
+      if (remN_ == 7)
         urolls::template storeC<7, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 6)
+      else if (remN_ == 6)
         urolls::template storeC<6, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 5)
+      else if (remN_ == 5)
         urolls::template storeC<5, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 4)
+      else if (remN_ == 4)
         urolls::template storeC<4, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 3)
+      else if (remN_ == 3)
         urolls::template storeC<3, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 2)
+      else if (remN_ == 2)
         urolls::template storeC<2, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
-      else if(remN_ == 1)
+      else if (remN_ == 1)
         urolls::template storeC<1, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
     }
   }
@@ -179,505 +212,503 @@
  * handleKRem: Handle arbitrary K? This is not needed for trsm.
  */
 template <typename Scalar, bool isARowMajor, bool isCRowMajor, bool isAdd, bool handleKRem>
-void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr,
-                int64_t M, int64_t N, int64_t K,
-                int64_t LDA, int64_t LDB, int64_t LDC) {
+void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB,
+                int64_t LDC) {
   using urolls = unrolls::gemm<Scalar, isAdd>;
   constexpr int64_t U3 = urolls::PacketSize * 3;
   constexpr int64_t U2 = urolls::PacketSize * 2;
   constexpr int64_t U1 = urolls::PacketSize * 1;
-  using vec = typename std::conditional<std::is_same<Scalar, float>::value,
-                                        vecFullFloat,
-                                        vecFullDouble>::type;
-  int64_t N_ = (N/U3)*U3;
-  int64_t M_ = (M/EIGEN_AVX_MAX_NUM_ROW)*EIGEN_AVX_MAX_NUM_ROW;
-  int64_t K_ = (K/EIGEN_AVX_MAX_K_UNROL)*EIGEN_AVX_MAX_K_UNROL;
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  int64_t N_ = (N / U3) * U3;
+  int64_t M_ = (M / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
+  int64_t K_ = (K / EIGEN_AVX_MAX_K_UNROL) * EIGEN_AVX_MAX_K_UNROL;
   int64_t j = 0;
-  for(; j < N_; j += U3) {
-    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS*3;
+  for (; j < N_; j += U3) {
+    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 3;
     int64_t i = 0;
-    for(; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)], *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<3,EIGEN_AVX_MAX_NUM_ROW>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,3,EIGEN_AVX_MAX_NUM_ROW,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<3, EIGEN_AVX_MAX_NUM_ROW>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 3, EIGEN_AVX_MAX_NUM_ROW, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,3,EIGEN_AVX_MAX_NUM_ROW,1,
-                                        EIGEN_AVX_B_LOAD_SETS*3,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 3, EIGEN_AVX_MAX_NUM_ROW, 1, EIGEN_AVX_B_LOAD_SETS * 3,
+                                       EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<3,EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<3,EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i*LDC+ j], LDC, zmm);
+        urolls::template updateC<3, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<3, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U3,false, false>(zmm, &C_arr[i + j*LDC], LDC);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U3, false, false>(zmm, &C_arr[i + j * LDC], LDC);
       }
     }
-    if(M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<3,4>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,3,4,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_B_LOAD_SETS*3,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<3, 4>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 3, 4, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_B_LOAD_SETS * 3,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,3,4,1,
-                                        EIGEN_AVX_B_LOAD_SETS*3,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 3, 4, 1, EIGEN_AVX_B_LOAD_SETS * 3, EIGEN_AVX_MAX_A_BCAST>(
+              B_t, A_t, LDB, LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<3,4>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<3,4>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<3, 4>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<3, 4>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U3,true, false>(zmm, &C_arr[i + j*LDC], LDC, 4);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U3, true, false>(zmm, &C_arr[i + j * LDC], LDC, 4);
       }
       i += 4;
     }
-    if(M - i >= 2) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<3,2>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,3,2,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_B_LOAD_SETS*3,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i >= 2) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<3, 2>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 3, 2, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_B_LOAD_SETS * 3,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,3,2,1,
-                                        EIGEN_AVX_B_LOAD_SETS*3,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 3, 2, 1, EIGEN_AVX_B_LOAD_SETS * 3, EIGEN_AVX_MAX_A_BCAST>(
+              B_t, A_t, LDB, LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<3,2>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<3,2>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<3, 2>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<3, 2>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U3,true, false>(zmm, &C_arr[i + j*LDC], LDC, 2);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U3, true, false>(zmm, &C_arr[i + j * LDC], LDC, 2);
       }
       i += 2;
     }
-    if(M - i > 0) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<3,1>(zmm);
+    if (M - i > 0) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<3, 1>(zmm);
       {
-        for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-          urolls:: template microKernel<isARowMajor,3,1,EIGEN_AVX_MAX_K_UNROL,
-                                        EIGEN_AVX_B_LOAD_SETS*3,1>(
-                                          B_t, A_t, LDB, LDA, zmm);
-          B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+        for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+          urolls::template microKernel<isARowMajor, 3, 1, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_B_LOAD_SETS * 3, 1>(
+              B_t, A_t, LDB, LDA, zmm);
+          B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+          else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
         }
         EIGEN_IF_CONSTEXPR(handleKRem) {
-          for(int64_t k = K_; k < K ; k ++) {
-            urolls:: template microKernel<isARowMajor,3,1,1,
-                                          EIGEN_AVX_B_LOAD_SETS*3,1>(B_t, A_t, LDB, LDA, zmm);
+          for (int64_t k = K_; k < K; k++) {
+            urolls::template microKernel<isARowMajor, 3, 1, 1, EIGEN_AVX_B_LOAD_SETS * 3, 1>(B_t, A_t, LDB, LDA, zmm);
             B_t += LDB;
-            EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+            EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+            else A_t += LDA;
           }
         }
         EIGEN_IF_CONSTEXPR(isCRowMajor) {
-          urolls::template updateC<3,1>(&C_arr[i*LDC + j], LDC, zmm);
-          urolls::template storeC<3,1>(&C_arr[i*LDC + j], LDC, zmm);
+          urolls::template updateC<3, 1>(&C_arr[i * LDC + j], LDC, zmm);
+          urolls::template storeC<3, 1>(&C_arr[i * LDC + j], LDC, zmm);
         }
         else {
-          transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U3,true, false>(zmm, &C_arr[i + j*LDC], LDC, 1);
+          transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U3, true, false>(zmm, &C_arr[i + j * LDC], LDC, 1);
         }
       }
     }
   }
-  if(N - j >= U2) {
-    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS*2;
+  if (N - j >= U2) {
+    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 2;
     int64_t i = 0;
-    for(; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)], *B_t = &B_arr[0*LDB + j];
-      EIGEN_IF_CONSTEXPR(isCRowMajor) B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<2,EIGEN_AVX_MAX_NUM_ROW>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,2,EIGEN_AVX_MAX_NUM_ROW,
-                                      EIGEN_AVX_MAX_K_UNROL,EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j];
+      EIGEN_IF_CONSTEXPR(isCRowMajor) B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<2, EIGEN_AVX_MAX_NUM_ROW>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 2, EIGEN_AVX_MAX_NUM_ROW, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,2,EIGEN_AVX_MAX_NUM_ROW,1,
-                                        EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 2, EIGEN_AVX_MAX_NUM_ROW, 1, EIGEN_AVX_MAX_B_LOAD,
+                                       EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<2,EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<2,EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<2, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<2, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U2,false, false>(zmm, &C_arr[i + j*LDC], LDC);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U2, false, false>(zmm, &C_arr[i + j * LDC], LDC);
       }
     }
-    if(M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<2,4>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,2,4,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<2, 4>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 2, 4, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,2,4,1,
-                                        EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 2, 4, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<2,4>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<2,4>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<2, 4>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<2, 4>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U2,true, false>(zmm, &C_arr[i + j*LDC], LDC, 4);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U2, true, false>(zmm, &C_arr[i + j * LDC], LDC, 4);
       }
       i += 4;
     }
-    if(M - i >= 2) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<2,2>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,2,2,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i >= 2) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<2, 2>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 2, 2, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,2,2,1,
-                                        EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 2, 2, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<2,2>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<2,2>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<2, 2>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<2, 2>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U2,true, false>(zmm, &C_arr[i + j*LDC], LDC, 2);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U2, true, false>(zmm, &C_arr[i + j * LDC], LDC, 2);
       }
       i += 2;
     }
-    if(M - i > 0) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<2,1>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,2,1,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,1>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i > 0) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<2, 1>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 2, 1, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD, 1>(B_t, A_t, LDB,
+                                                                                                        LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,2,1,1,
-                                        EIGEN_AVX_MAX_B_LOAD,1>(B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 2, 1, 1, EIGEN_AVX_MAX_B_LOAD, 1>(B_t, A_t, LDB, LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-          urolls::template updateC<2,1>(&C_arr[i*LDC + j], LDC, zmm);
-          urolls::template storeC<2,1>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<2, 1>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<2, 1>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U2,true, false>(zmm, &C_arr[i + j*LDC], LDC, 1);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U2, true, false>(zmm, &C_arr[i + j * LDC], LDC, 1);
       }
     }
     j += U2;
   }
-  if(N - j >= U1) {
-    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS*1;
+  if (N - j >= U1) {
+    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 1;
     int64_t i = 0;
-    for(; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)], *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<1,EIGEN_AVX_MAX_NUM_ROW>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,1,EIGEN_AVX_MAX_NUM_ROW,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, EIGEN_AVX_MAX_NUM_ROW>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, EIGEN_AVX_MAX_NUM_ROW, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,1,EIGEN_AVX_MAX_NUM_ROW,1,
-                                        EIGEN_AVX_B_LOAD_SETS*1,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, EIGEN_AVX_MAX_NUM_ROW, 1, EIGEN_AVX_B_LOAD_SETS * 1,
+                                       EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<1,EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<1,EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<1, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<1, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U1,false, false>(zmm, &C_arr[i + j*LDC], LDC);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, false, false>(zmm, &C_arr[i + j * LDC], LDC);
       }
     }
-    if(M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<1,4>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,1,4,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 4>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 4, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,1,4,1,
-                                        EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 4, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<1,4>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<1,4>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<1, 4>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<1, 4>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U1,true, false>(zmm, &C_arr[i + j*LDC], LDC, 4);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, false>(zmm, &C_arr[i + j * LDC], LDC, 4);
       }
       i += 4;
     }
-    if(M - i >= 2) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<1,2>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,1,2,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                        B_t, A_t, LDB, LDA, zmm);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i >= 2) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 2>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 2, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,1,2,1,
-                                        EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST>(
-                                          B_t, A_t, LDB, LDA, zmm);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 2, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<1,2>(&C_arr[i*LDC + j], LDC, zmm);
-        urolls::template storeC<1,2>(&C_arr[i*LDC + j], LDC, zmm);
+        urolls::template updateC<1, 2>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<1, 2>(&C_arr[i * LDC + j], LDC, zmm);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U1,true, false>(zmm, &C_arr[i + j*LDC], LDC, 2);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, false>(zmm, &C_arr[i + j * LDC], LDC, 2);
       }
       i += 2;
     }
-    if(M - i > 0) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<1,1>(zmm);
+    if (M - i > 0) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 1>(zmm);
       {
-        for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-          urolls:: template microKernel<isARowMajor,1,1,EIGEN_AVX_MAX_K_UNROL,
-                                        EIGEN_AVX_MAX_B_LOAD,1>(
-                                          B_t, A_t, LDB, LDA, zmm);
-          B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+        for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+          urolls::template microKernel<isARowMajor, 1, 1, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD, 1>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
+          B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+          else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
         }
         EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,1,1,1,EIGEN_AVX_B_LOAD_SETS*1,1>(B_t, A_t, LDB, LDA, zmm);
-          B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          for (int64_t k = K_; k < K; k++) {
+            urolls::template microKernel<isARowMajor, 1, 1, 1, EIGEN_AVX_B_LOAD_SETS * 1, 1>(B_t, A_t, LDB, LDA, zmm);
+            B_t += LDB;
+            EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+            else A_t += LDA;
+          }
         }
-      }
         EIGEN_IF_CONSTEXPR(isCRowMajor) {
-          urolls::template updateC<1,1>(&C_arr[i*LDC + j], LDC, zmm);
-          urolls::template storeC<1,1>(&C_arr[i*LDC + j], LDC, zmm);
+          urolls::template updateC<1, 1>(&C_arr[i * LDC + j], LDC, zmm);
+          urolls::template storeC<1, 1>(&C_arr[i * LDC + j], LDC, zmm);
         }
         else {
-          transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U1,true, false>(zmm, &C_arr[i + j*LDC], LDC, 1);
+          transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, false>(zmm, &C_arr[i + j * LDC], LDC, 1);
         }
       }
     }
     j += U1;
   }
-  if(N - j > 0) {
-    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS*1;
+  if (N - j > 0) {
+    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 1;
     int64_t i = 0;
-    for(; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<1,EIGEN_AVX_MAX_NUM_ROW>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,1,EIGEN_AVX_MAX_NUM_ROW,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST,true>(
-          B_t, A_t, LDB, LDA, zmm, N - j);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, EIGEN_AVX_MAX_NUM_ROW>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, EIGEN_AVX_MAX_NUM_ROW, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST, true>(B_t, A_t, LDB, LDA, zmm, N - j);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,1,EIGEN_AVX_MAX_NUM_ROW,1,
-                                        EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST,true>(
-                                          B_t, A_t, LDB, LDA, zmm, N - j);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, EIGEN_AVX_MAX_NUM_ROW, 1, EIGEN_AVX_MAX_B_LOAD,
+                                       EIGEN_AVX_MAX_A_BCAST, true>(B_t, A_t, LDB, LDA, zmm, N - j);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<1,EIGEN_AVX_MAX_NUM_ROW,true>(&C_arr[i*LDC + j], LDC, zmm, N - j);
-        urolls::template storeC<1,EIGEN_AVX_MAX_NUM_ROW,true>(&C_arr[i*LDC + j], LDC, zmm, N - j);
+        urolls::template updateC<1, EIGEN_AVX_MAX_NUM_ROW, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+        urolls::template storeC<1, EIGEN_AVX_MAX_NUM_ROW, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U1,false, true>(zmm, &C_arr[i + j*LDC], LDC, 0, N-j);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, false, true>(zmm, &C_arr[i + j * LDC], LDC, 0, N - j);
       }
     }
-    if(M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<1,4>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,1,4,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST,true>(
-                                        B_t, A_t, LDB, LDA, zmm, N - j);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 4>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 4, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST, true>(B_t, A_t, LDB, LDA, zmm, N - j);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,1,4,1,
-                                        EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST,true>(
-                                          B_t, A_t, LDB, LDA, zmm, N - j);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 4, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST, true>(
+              B_t, A_t, LDB, LDA, zmm, N - j);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<1,4,true>(&C_arr[i*LDC + j], LDC, zmm, N - j);
-        urolls::template storeC<1,4,true>(&C_arr[i*LDC + j], LDC, zmm, N - j);
+        urolls::template updateC<1, 4, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+        urolls::template storeC<1, 4, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U1,true, true>(zmm, &C_arr[i + j*LDC], LDC, 4, N-j);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, true>(zmm, &C_arr[i + j * LDC], LDC, 4, N - j);
       }
       i += 4;
     }
-    if(M - i >= 2) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<1,2>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,1,2,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST,true>(
-                                        B_t, A_t, LDB, LDA, zmm, N - j);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i >= 2) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 2>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 2, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST, true>(B_t, A_t, LDB, LDA, zmm, N - j);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,1,2,1,
-                                        EIGEN_AVX_MAX_B_LOAD,EIGEN_AVX_MAX_A_BCAST,true>(
-                                          B_t, A_t, LDB, LDA, zmm, N - j);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 2, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST, true>(
+              B_t, A_t, LDB, LDA, zmm, N - j);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<1,2,true>(&C_arr[i*LDC + j], LDC, zmm, N - j);
-        urolls::template storeC<1,2,true>(&C_arr[i*LDC + j], LDC, zmm, N - j);
+        urolls::template updateC<1, 2, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+        urolls::template storeC<1, 2, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U1,true, true>(zmm, &C_arr[i + j*LDC], LDC, 2, N-j);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, true>(zmm, &C_arr[i + j * LDC], LDC, 2, N - j);
       }
       i += 2;
     }
-    if(M - i > 0) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i,0,LDA)];
-      Scalar *B_t = &B_arr[0*LDB + j];
-      PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
-      urolls::template setzero<1,1>(zmm);
-      for(int64_t k = 0; k < K_ ; k += EIGEN_AVX_MAX_K_UNROL) {
-        urolls:: template microKernel<isARowMajor,1,1,EIGEN_AVX_MAX_K_UNROL,
-                                      EIGEN_AVX_MAX_B_LOAD,1,true>(
-                                        B_t, A_t, LDB, LDA, zmm, N - j);
-        B_t += EIGEN_AVX_MAX_K_UNROL*LDB;
-        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; else A_t += EIGEN_AVX_MAX_K_UNROL*LDA;
+    if (M - i > 0) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 1>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 1, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD, 1, true>(
+            B_t, A_t, LDB, LDA, zmm, N - j);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
       }
       EIGEN_IF_CONSTEXPR(handleKRem) {
-        for(int64_t k = K_; k < K ; k ++) {
-          urolls:: template microKernel<isARowMajor,1,1,1,
-                                        EIGEN_AVX_MAX_B_LOAD,1,true>(
-                                          B_t, A_t, LDB, LDA, zmm, N - j);
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 1, 1, EIGEN_AVX_MAX_B_LOAD, 1, true>(B_t, A_t, LDB, LDA, zmm,
+                                                                                            N - j);
           B_t += LDB;
-          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; else A_t += LDA;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
         }
       }
       EIGEN_IF_CONSTEXPR(isCRowMajor) {
-        urolls::template updateC<1,1,true>(&C_arr[i*LDC + j], LDC, zmm, N - j);
-        urolls::template storeC<1,1,true>(&C_arr[i*LDC + j], LDC, zmm, N - j);
+        urolls::template updateC<1, 1, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+        urolls::template storeC<1, 1, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
       }
       else {
-        transStoreC<Scalar,vec,EIGEN_AVX_MAX_NUM_ROW,U1,true, true>(zmm, &C_arr[i + j*LDC], LDC, 1, N-j);
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, true>(zmm, &C_arr[i + j * LDC], LDC, 1, N - j);
       }
     }
   }
@@ -690,48 +721,46 @@
  * isFWDSolve: is forward solve?
  * isUnitDiag: is the diagonal of A all ones?
  * The B matrix (RHS) is assumed to be row-major
-*/
+ */
 template <typename Scalar, typename vec, int64_t unrollM, bool isARowMajor, bool isFWDSolve, bool isUnitDiag>
-static EIGEN_ALWAYS_INLINE
-void triSolveKernel(Scalar *A_arr, Scalar *B_arr, int64_t K, int64_t LDA, int64_t LDB) {
-
-  static_assert( unrollM <= EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW" );
+static EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar *A_arr, Scalar *B_arr, int64_t K, int64_t LDA, int64_t LDB) {
+  static_assert(unrollM <= EIGEN_AVX_MAX_NUM_ROW, "unrollM should be at most EIGEN_AVX_MAX_NUM_ROW");
   using urolls = unrolls::trsm<Scalar>;
   constexpr int64_t U3 = urolls::PacketSize * 3;
   constexpr int64_t U2 = urolls::PacketSize * 2;
   constexpr int64_t U1 = urolls::PacketSize * 1;
 
-  PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> RHSInPacket;
-  PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> AInPacket;
+  PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> RHSInPacket;
+  PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> AInPacket;
 
   int64_t k = 0;
-  while(K - k >= U3) {
-    urolls:: template loadRHS<isFWDSolve, unrollM, 3>(B_arr + k, LDB, RHSInPacket);
-    urolls:: template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 3>(
-      A_arr, LDA, RHSInPacket, AInPacket);
-    urolls:: template storeRHS<isFWDSolve, unrollM, 3>(B_arr + k, LDB, RHSInPacket);
+  while (K - k >= U3) {
+    urolls::template loadRHS<isFWDSolve, unrollM, 3>(B_arr + k, LDB, RHSInPacket);
+    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 3>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+    urolls::template storeRHS<isFWDSolve, unrollM, 3>(B_arr + k, LDB, RHSInPacket);
     k += U3;
   }
-  if(K - k >= U2) {
-    urolls:: template loadRHS<isFWDSolve, unrollM, 2>(B_arr + k, LDB, RHSInPacket);
-    urolls:: template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 2>(
-      A_arr, LDA, RHSInPacket, AInPacket);
-    urolls:: template storeRHS<isFWDSolve, unrollM, 2>(B_arr + k, LDB, RHSInPacket);
+  if (K - k >= U2) {
+    urolls::template loadRHS<isFWDSolve, unrollM, 2>(B_arr + k, LDB, RHSInPacket);
+    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 2>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+    urolls::template storeRHS<isFWDSolve, unrollM, 2>(B_arr + k, LDB, RHSInPacket);
     k += U2;
   }
-  if(K - k >= U1) {
-    urolls:: template loadRHS<isFWDSolve, unrollM, 1>(B_arr + k, LDB, RHSInPacket);
-    urolls:: template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 1>(
-      A_arr, LDA, RHSInPacket, AInPacket);
-    urolls:: template storeRHS<isFWDSolve, unrollM, 1>(B_arr + k, LDB, RHSInPacket);
+  if (K - k >= U1) {
+    urolls::template loadRHS<isFWDSolve, unrollM, 1>(B_arr + k, LDB, RHSInPacket);
+    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 1>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+    urolls::template storeRHS<isFWDSolve, unrollM, 1>(B_arr + k, LDB, RHSInPacket);
     k += U1;
   }
-  if(K - k > 0) {
+  if (K - k > 0) {
     // Handle remaining number of RHS
-    urolls::template loadRHS<isFWDSolve, unrollM, 1, true>(B_arr + k, LDB, RHSInPacket, K-k);
-    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 1>(
-      A_arr, LDA, RHSInPacket, AInPacket);
-    urolls::template storeRHS<isFWDSolve, unrollM, 1, true>(B_arr + k, LDB, RHSInPacket, K-k);
+    urolls::template loadRHS<isFWDSolve, unrollM, 1, true>(B_arr + k, LDB, RHSInPacket, K - k);
+    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 1>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+    urolls::template storeRHS<isFWDSolve, unrollM, 1, true>(B_arr + k, LDB, RHSInPacket, K - k);
   }
 }
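
The kernel above walks the K right-hand-side columns in chunks of three, two, and then one packet widths before falling back to a masked remainder. A minimal standalone sketch of that traversal, using a hypothetical name chunkK that is not part of Eigen (assumes <cstdint> for int64_t):

  // Mirrors triSolveKernel's K loop structure: U3 chunks, then at most one U2, one U1,
  // and a masked tail of fewer than PacketSize columns.
  static void chunkK(int64_t K, int64_t PacketSize) {
    int64_t k = 0;
    while (K - k >= 3 * PacketSize) k += 3 * PacketSize;  // U3 chunks
    if (K - k >= 2 * PacketSize) k += 2 * PacketSize;     // one U2 chunk
    if (K - k >= 1 * PacketSize) k += 1 * PacketSize;     // one U1 chunk
    // Whatever remains (0 < K - k < PacketSize) is loaded/stored with remMask-style masking.
  }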
 
@@ -742,7 +771,7 @@
  * isFWDSolve: is forward solve?
  * isUnitDiag: is the diagonal of A all ones?
  * The B matrix (RHS) is assumed to be row-major
-*/
+ */
 template <typename Scalar, bool isARowMajor, bool isFWDSolve, bool isUnitDiag>
 void triSolveKernelLxK(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t K, int64_t LDA, int64_t LDB) {
   // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted
@@ -775,59 +804,90 @@
  *
  */
 template <typename Scalar, bool toTemp = true, bool remM = false>
-static EIGEN_ALWAYS_INLINE
-void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K,
-                 Scalar *B_temp, int64_t LDB_, int64_t remM_ = 0) {
+static EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K, Scalar *B_temp, int64_t LDB_,
+                                                int64_t remM_ = 0) {
   EIGEN_UNUSED_VARIABLE(remM_);
   using urolls = unrolls::transB<Scalar>;
   using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
-  PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> ymm;
+  PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> ymm;
   constexpr int64_t U3 = urolls::PacketSize * 3;
   constexpr int64_t U2 = urolls::PacketSize * 2;
   constexpr int64_t U1 = urolls::PacketSize * 1;
-  int64_t K_ = K/U3*U3;
+  int64_t K_ = K / U3 * U3;
   int64_t k = 0;
 
-  for(; k < K_; k += U3) {
-    urolls::template transB_kernel<U3, toTemp, remM>(B_arr + k*LDB, LDB, B_temp, LDB_, ymm, remM_);
+  for (; k < K_; k += U3) {
+    urolls::template transB_kernel<U3, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
     B_temp += U3;
   }
-  if(K - k >= U2) {
-    urolls::template transB_kernel<U2, toTemp, remM>(B_arr + k*LDB, LDB, B_temp, LDB_, ymm, remM_);
-    B_temp += U2; k += U2;
+  if (K - k >= U2) {
+    urolls::template transB_kernel<U2, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += U2;
+    k += U2;
   }
-  if(K - k >= U1) {
-    urolls::template transB_kernel<U1, toTemp, remM>(B_arr + k*LDB, LDB, B_temp, LDB_, ymm, remM_);
-    B_temp += U1; k += U1;
+  if (K - k >= U1) {
+    urolls::template transB_kernel<U1, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += U1;
+    k += U1;
   }
-  EIGEN_IF_CONSTEXPR( U1 > 8) {
+  EIGEN_IF_CONSTEXPR(U1 > 8) {
     // Note: without "if constexpr" this section of code will also be
     // parsed by the compiler so there is an additional check in {load/store}BBlock
     // to make sure the counter is non-negative.
-    if(K - k >= 8) {
-      urolls::template transB_kernel<8, toTemp, remM>(B_arr + k*LDB, LDB, B_temp, LDB_, ymm, remM_);
-      B_temp += 8; k += 8;
+    if (K - k >= 8) {
+      urolls::template transB_kernel<8, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+      B_temp += 8;
+      k += 8;
     }
   }
-  EIGEN_IF_CONSTEXPR( U1 > 4) {
+  EIGEN_IF_CONSTEXPR(U1 > 4) {
     // Note: without "if constexpr" this section of code will also be
     // parsed by the compiler so there is an additional check in {load/store}BBlock
     // to make sure the counter is non-negative.
-    if(K - k >= 4) {
-      urolls::template transB_kernel<4, toTemp, remM>(B_arr + k*LDB, LDB, B_temp, LDB_, ymm, remM_);
-      B_temp += 4; k += 4;
+    if (K - k >= 4) {
+      urolls::template transB_kernel<4, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+      B_temp += 4;
+      k += 4;
     }
   }
-  if(K - k >= 2) {
-    urolls::template transB_kernel<2, toTemp, remM>(B_arr + k*LDB, LDB, B_temp, LDB_, ymm, remM_);
-    B_temp += 2; k += 2;
+  if (K - k >= 2) {
+    urolls::template transB_kernel<2, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += 2;
+    k += 2;
   }
-  if(K - k >= 1) {
-    urolls::template transB_kernel<1, toTemp, remM>(B_arr + k*LDB, LDB, B_temp, LDB_, ymm, remM_);
-    B_temp += 1; k += 1;
+  if (K - k >= 1) {
+    urolls::template transB_kernel<1, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += 1;
+    k += 1;
   }
 }
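
To make the remainder handling above concrete, a worked example (numbers only, no new behavior): with Scalar = float, PacketSize = 16, so U3 = 48, U2 = 32, U1 = 16, a call with K = 27 copies row blocks of 16, 8, 2, and 1; with Scalar = double, PacketSize = 8, so U3 = 24, U2 = 16, U1 = 8, the U1 > 8 branch is compiled out and K = 27 is covered as 24 + 2 + 1.

  K = 27, float:  16 + 8 + 2 + 1   (both the U1 > 8 and U1 > 4 branches are live)
  K = 27, double: 24 + 2 + 1       (only the U1 > 4 branch survives, and it is not taken here)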
 
+#if (EIGEN_USE_AVX512_TRSM_L_KERNELS) && defined(EIGEN_NO_MALLOC)
+/**
+ * Reduce blocking sizes so that the size of the temporary workspace needed is less than "limit" bytes,
+ *  - kB must be at least psize
+ *  - numM must be at least EIGEN_AVX_MAX_NUM_ROW
+ */
+template <typename Scalar, bool isBRowMajor>
+constexpr std::pair<int64_t, int64_t> trsmBlocking(const int64_t limit) {
+  constexpr int64_t psize = packet_traits<Scalar>::size;
+  int64_t kB = 15 * psize;
+  int64_t numM = 8 * EIGEN_AVX_MAX_NUM_ROW;
+  // If B is rowmajor, no temp workspace needed, so use default blocking sizes.
+  if (isBRowMajor) return {kB, numM};
+
+  // Very simple heuristic, prefer keeping kB as large as possible to fully use vector registers.
+  for (int64_t k = kB; k > psize; k -= psize) {
+    for (int64_t m = numM; m > EIGEN_AVX_MAX_NUM_ROW; m -= EIGEN_AVX_MAX_NUM_ROW) {
+      if ((((k + psize - 1) / psize + 4) * psize) * m * sizeof(Scalar) < limit) {
+        return {k, m};
+      }
+    }
+  }
+  return {psize, EIGEN_AVX_MAX_NUM_ROW};  // Minimum blocking size required
+}
+#endif  // (EIGEN_USE_AVX512_TRSM_L_KERNELS) && defined(EIGEN_NO_MALLOC)
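
A worked check of the bound used by trsmBlocking above, assuming the default EIGEN_STACK_ALLOCATION_LIMIT of 131072 bytes and EIGEN_AVX_MAX_NUM_ROW = 8 (arithmetic only, nothing beyond the code above):

  double: psize = 8,  kB = 15 * 8  = 120, numM = 8 * 8 = 64
          (((120 + 7) / 8 + 4) * 8) * 64 * sizeof(double) = 152 * 64 * 8 = 77824 < 131072
  float:  psize = 16, kB = 15 * 16 = 240, numM = 8 * 8 = 64
          (((240 + 15) / 16 + 4) * 16) * 64 * sizeof(float) = 304 * 64 * 4 = 77824 < 131072

So with the default limit the search returns the full-sized blocking on its first iteration; the loops only shrink kB and numM when a smaller user-defined limit is in force.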
+
 /**
  * Main triangular solve driver
  *
@@ -854,9 +914,11 @@
  *
  * Note: For RXX cases M,numRHS should be swapped.
  *
-*/
-template <typename Scalar, bool isARowMajor = true, bool isBRowMajor = true, bool isFWDSolve = true, bool isUnitDiag = false>
+ */
+template <typename Scalar, bool isARowMajor = true, bool isBRowMajor = true, bool isFWDSolve = true,
+          bool isUnitDiag = false>
 void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t LDA, int64_t LDB) {
+  constexpr int64_t psize = packet_traits<Scalar>::size;
   /**
    * The values for kB, numM were determined experimentally.
    * kB: Number of RHS we process at a time.
@@ -870,8 +932,30 @@
    * large enough to allow GEMM updates to have larger "K"s (see below.) No benchmarking has been done so far to
    * determine optimal values for numM.
    */
-  const int64_t kB = (3*packet_traits<Scalar>::size)*5; // 5*U3
-  const int64_t numM = 64;
+#if (EIGEN_USE_AVX512_TRSM_L_KERNELS) && defined(EIGEN_NO_MALLOC)
+  /**
+   * If EIGEN_NO_MALLOC is requested, we try to reduce kB and numM so the maximum temp workspace required is less
+   * than EIGEN_STACK_ALLOCATION_LIMIT. Actual workspace size may be less, depending on the number of vectors to
+   * solve.
+   *  - kB must be at least psize
+   *  - numM must be at least EIGEN_AVX_MAX_NUM_ROW
+   *
+   * If B is row-major, the blocking sizes are not reduced (no temp workspace needed).
+   */
+  constexpr std::pair<int64_t, int64_t> blocking_ = trsmBlocking<Scalar, isBRowMajor>(EIGEN_STACK_ALLOCATION_LIMIT);
+  constexpr int64_t kB = blocking_.first;
+  constexpr int64_t numM = blocking_.second;
+  /**
+   * If the temp workspace size exceeds EIGEN_STACK_ALLOCATION_LIMIT even with the minimum blocking sizes,
+   * compilation fails with a static_assert. Use -DEIGEN_USE_AVX512_TRSM_L_KERNELS=0 if necessary.
+   */
+  static_assert(!(((((kB + psize - 1) / psize + 4) * psize) * numM * sizeof(Scalar) >= EIGEN_STACK_ALLOCATION_LIMIT) &&
+                  !isBRowMajor),
+                "Temp workspace required is too large.");
+#else
+  constexpr int64_t kB = (3 * psize) * 5;  // 5*U3
+  constexpr int64_t numM = 8 * EIGEN_AVX_MAX_NUM_ROW;
+#endif
 
   int64_t sizeBTemp = 0;
   Scalar *B_temp = NULL;
@@ -881,42 +965,50 @@
      * transpose it to row-major. Call the solve routine, and copy+transpose it back to the original array.
      * The updated row-major copy of B is reused in the GEMM updates.
      */
-    sizeBTemp = (((std::min(kB, numRHS) + 15)/16+ 4)*16)*numM;
-    B_temp = (Scalar*) aligned_alloc(4096,sizeof(Scalar)*sizeBTemp);
+    sizeBTemp = (((std::min(kB, numRHS) + psize - 1) / psize + 4) * psize) * numM;
   }
-  for(int64_t k = 0; k < numRHS; k += kB) {
+
+#if !defined(EIGEN_NO_MALLOC)
+  EIGEN_IF_CONSTEXPR(!isBRowMajor) B_temp = (Scalar *)handmade_aligned_malloc(sizeof(Scalar) * sizeBTemp, 64);
+#elif (EIGEN_USE_AVX512_TRSM_L_KERNELS) && defined(EIGEN_NO_MALLOC)
+  // Use alloca if malloc not allowed, requested temp workspace size should be less than EIGEN_STACK_ALLOCATION_LIMIT
+  ei_declare_aligned_stack_constructed_variable(Scalar, B_temp_alloca, sizeBTemp, 0);
+  B_temp = B_temp_alloca;
+#endif
+
+  for (int64_t k = 0; k < numRHS; k += kB) {
     int64_t bK = numRHS - k > kB ? kB : numRHS - k;
-    int64_t M_ = (M/EIGEN_AVX_MAX_NUM_ROW)*EIGEN_AVX_MAX_NUM_ROW, gemmOff = 0;
+    int64_t M_ = (M / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW, gemmOff = 0;
 
     // bK rounded up to next multiple of L=EIGEN_AVX_MAX_NUM_ROW. When B_temp is used, we solve for bkL RHS
     // instead of bK RHS in triSolveKernelLxK.
-    int64_t bkL = ((bK + (EIGEN_AVX_MAX_NUM_ROW-1))/EIGEN_AVX_MAX_NUM_ROW)*EIGEN_AVX_MAX_NUM_ROW;
-    const int64_t numScalarPerCache = 64/sizeof(Scalar);
+    int64_t bkL = ((bK + (EIGEN_AVX_MAX_NUM_ROW - 1)) / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
+    const int64_t numScalarPerCache = 64 / sizeof(Scalar);
     // Leading dimension of B_temp, will be a multiple of the cache line size.
-    int64_t LDT = ((bkL+(numScalarPerCache-1))/numScalarPerCache)*numScalarPerCache;
+    int64_t LDT = ((bkL + (numScalarPerCache - 1)) / numScalarPerCache) * numScalarPerCache;
     int64_t offsetBTemp = 0;
-    for(int64_t i = 0; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+    for (int64_t i = 0; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
       EIGEN_IF_CONSTEXPR(!isBRowMajor) {
-        int64_t indA_i  = isFWDSolve ? i           : M - 1 - i;
-        int64_t indB_i  = isFWDSolve ? i           : M - (i + EIGEN_AVX_MAX_NUM_ROW);
-        int64_t offB_1  = isFWDSolve ? offsetBTemp : sizeBTemp - EIGEN_AVX_MAX_NUM_ROW*LDT - offsetBTemp;
-        int64_t offB_2  = isFWDSolve ? offsetBTemp : sizeBTemp - LDT - offsetBTemp;
+        int64_t indA_i = isFWDSolve ? i : M - 1 - i;
+        int64_t indB_i = isFWDSolve ? i : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+        int64_t offB_1 = isFWDSolve ? offsetBTemp : sizeBTemp - EIGEN_AVX_MAX_NUM_ROW * LDT - offsetBTemp;
+        int64_t offB_2 = isFWDSolve ? offsetBTemp : sizeBTemp - LDT - offsetBTemp;
         // Copy values from B to B_temp.
-        copyBToRowMajor<Scalar, true, false>(B_arr + indB_i + k*LDB, LDB, bK, B_temp + offB_1, LDT);
+        copyBToRowMajor<Scalar, true, false>(B_arr + indB_i + k * LDB, LDB, bK, B_temp + offB_1, LDT);
         // Triangular solve with a small block of A and long horizontal blocks of B (or B_temp if B col-major)
         triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(
-          &A_arr[idA<isARowMajor>(indA_i, indA_i, LDA)], B_temp + offB_2, EIGEN_AVX_MAX_NUM_ROW, bkL, LDA, LDT);
+            &A_arr[idA<isARowMajor>(indA_i, indA_i, LDA)], B_temp + offB_2, EIGEN_AVX_MAX_NUM_ROW, bkL, LDA, LDT);
         // Copy values from B_temp back to B. B_temp will be reused in gemm call below.
-        copyBToRowMajor<Scalar, false, false>(B_arr + indB_i + k*LDB, LDB, bK, B_temp + offB_1, LDT);
+        copyBToRowMajor<Scalar, false, false>(B_arr + indB_i + k * LDB, LDB, bK, B_temp + offB_1, LDT);
 
-        offsetBTemp += EIGEN_AVX_MAX_NUM_ROW*LDT;
+        offsetBTemp += EIGEN_AVX_MAX_NUM_ROW * LDT;
       }
       else {
         int64_t ind = isFWDSolve ? i : M - 1 - i;
         triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(
-          &A_arr[idA<isARowMajor>(ind, ind, LDA)], B_arr + k + ind*LDB, EIGEN_AVX_MAX_NUM_ROW, bK, LDA, LDB);
+            &A_arr[idA<isARowMajor>(ind, ind, LDA)], B_arr + k + ind * LDB, EIGEN_AVX_MAX_NUM_ROW, bK, LDA, LDB);
       }
-      if(i+EIGEN_AVX_MAX_NUM_ROW < M_) {
+      if (i + EIGEN_AVX_MAX_NUM_ROW < M_) {
         /**
          * For the GEMM updates, we want "K" (K=i+8 in this case) to be large as soon as possible
          * to reuse the accumulators in GEMM as much as possible. So we only update 8xbK blocks of
@@ -930,19 +1022,16 @@
          *    |********|__|    |**|
          */
         EIGEN_IF_CONSTEXPR(isBRowMajor) {
-          int64_t indA_i  = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2*EIGEN_AVX_MAX_NUM_ROW);
-          int64_t indA_j  = isFWDSolve ? 0                         : M - (i + EIGEN_AVX_MAX_NUM_ROW);
-          int64_t indB_i  = isFWDSolve ? 0                         : M - (i + EIGEN_AVX_MAX_NUM_ROW);
-          int64_t indB_i2 = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2*EIGEN_AVX_MAX_NUM_ROW);
-          gemmKernel<Scalar,isARowMajor, isBRowMajor,false,false>(
-            &A_arr[idA<isARowMajor>(indA_i,indA_j,LDA)],
-            B_arr + k + indB_i*LDB,
-            B_arr + k + indB_i2*LDB,
-            EIGEN_AVX_MAX_NUM_ROW, bK, i + EIGEN_AVX_MAX_NUM_ROW,
-            LDA, LDB, LDB);
+          int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indA_j = isFWDSolve ? 0 : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indB_i = isFWDSolve ? 0 : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indB_i2 = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+          gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(
+              &A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)], B_arr + k + indB_i * LDB, B_arr + k + indB_i2 * LDB,
+              EIGEN_AVX_MAX_NUM_ROW, bK, i + EIGEN_AVX_MAX_NUM_ROW, LDA, LDB, LDB);
         }
         else {
-          if(offsetBTemp + EIGEN_AVX_MAX_NUM_ROW*LDT > sizeBTemp) {
+          if (offsetBTemp + EIGEN_AVX_MAX_NUM_ROW * LDT > sizeBTemp) {
             /**
              * Similar idea as mentioned above, but here we are limited by the number of updated values of B
              * that can be stored (row-major) in B_temp.
@@ -951,157 +1040,148 @@
              * update and partially update the remaining old values of B which depends on the new values
              * of B stored in B_temp. These values are then no longer needed and can be overwritten.
              */
-            int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW   : 0;
-            int64_t indA_j = isFWDSolve ? gemmOff                     : M - (i + EIGEN_AVX_MAX_NUM_ROW);
-            int64_t indB_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW   : 0;
-            int64_t offB_1 = isFWDSolve ? 0                           : sizeBTemp - offsetBTemp;
-            gemmKernel<Scalar,isARowMajor, isBRowMajor,false,false>(
-              &A_arr[idA<isARowMajor>(indA_i, indA_j,LDA)],
-              B_temp + offB_1,
-              B_arr + indB_i + (k)*LDB,
-              M - (i + EIGEN_AVX_MAX_NUM_ROW), bK, i + EIGEN_AVX_MAX_NUM_ROW - gemmOff,
-              LDA, LDT, LDB);
-            offsetBTemp = 0; gemmOff = i + EIGEN_AVX_MAX_NUM_ROW;
-          }
-          else {
+            int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : 0;
+            int64_t indA_j = isFWDSolve ? gemmOff : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+            int64_t indB_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : 0;
+            int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+            gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(
+                &A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)], B_temp + offB_1, B_arr + indB_i + (k)*LDB,
+                M - (i + EIGEN_AVX_MAX_NUM_ROW), bK, i + EIGEN_AVX_MAX_NUM_ROW - gemmOff, LDA, LDT, LDB);
+            offsetBTemp = 0;
+            gemmOff = i + EIGEN_AVX_MAX_NUM_ROW;
+          } else {
             /**
              * If there is enough space in B_temp, we only update the next 8xbK values of B.
              */
-            int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW   : M - (i + 2*EIGEN_AVX_MAX_NUM_ROW);
-            int64_t indA_j = isFWDSolve ? gemmOff                     : M - (i + EIGEN_AVX_MAX_NUM_ROW);
-            int64_t indB_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW   : M - (i + 2*EIGEN_AVX_MAX_NUM_ROW);
-            int64_t offB_1 = isFWDSolve ? 0                           : sizeBTemp - offsetBTemp;
-            gemmKernel<Scalar,isARowMajor, isBRowMajor,false,false>(
-              &A_arr[idA<isARowMajor>(indA_i,indA_j,LDA)],
-              B_temp + offB_1,
-              B_arr + indB_i + (k)*LDB,
-              EIGEN_AVX_MAX_NUM_ROW, bK, i + EIGEN_AVX_MAX_NUM_ROW - gemmOff,
-              LDA, LDT, LDB);
+            int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+            int64_t indA_j = isFWDSolve ? gemmOff : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+            int64_t indB_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+            int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+            gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(
+                &A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)], B_temp + offB_1, B_arr + indB_i + (k)*LDB,
+                EIGEN_AVX_MAX_NUM_ROW, bK, i + EIGEN_AVX_MAX_NUM_ROW - gemmOff, LDA, LDT, LDB);
           }
         }
       }
     }
     // Handle M remainder..
-    int64_t bM = M-M_;
-    if (bM > 0){
-      if( M_ > 0) {
+    int64_t bM = M - M_;
+    if (bM > 0) {
+      if (M_ > 0) {
         EIGEN_IF_CONSTEXPR(isBRowMajor) {
-          int64_t indA_i  = isFWDSolve ? M_ : 0;
-          int64_t indA_j  = isFWDSolve ? 0  : bM;
-          int64_t indB_i  = isFWDSolve ? 0  : bM;
+          int64_t indA_i = isFWDSolve ? M_ : 0;
+          int64_t indA_j = isFWDSolve ? 0 : bM;
+          int64_t indB_i = isFWDSolve ? 0 : bM;
           int64_t indB_i2 = isFWDSolve ? M_ : 0;
-          gemmKernel<Scalar,isARowMajor, isBRowMajor,false,false>(
-            &A_arr[idA<isARowMajor>(indA_i,indA_j,LDA)],
-            B_arr + k +indB_i*LDB,
-            B_arr + k + indB_i2*LDB,
-            bM , bK, M_,
-            LDA, LDB, LDB);
+          gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(
+              &A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)], B_arr + k + indB_i * LDB, B_arr + k + indB_i2 * LDB, bM,
+              bK, M_, LDA, LDB, LDB);
         }
         else {
-          int64_t indA_i = isFWDSolve ? M_      : 0;
+          int64_t indA_i = isFWDSolve ? M_ : 0;
           int64_t indA_j = isFWDSolve ? gemmOff : bM;
-          int64_t indB_i = isFWDSolve ? M_      : 0;
-          int64_t offB_1 = isFWDSolve ? 0       : sizeBTemp - offsetBTemp;
-          gemmKernel<Scalar,isARowMajor, isBRowMajor,false,false>(
-            &A_arr[idA<isARowMajor>(indA_i,indA_j,LDA)],
-            B_temp + offB_1,
-            B_arr + indB_i + (k)*LDB,
-            bM , bK, M_ - gemmOff,
-            LDA, LDT, LDB);
+          int64_t indB_i = isFWDSolve ? M_ : 0;
+          int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+          gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(&A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)],
+                                                                     B_temp + offB_1, B_arr + indB_i + (k)*LDB, bM, bK,
+                                                                     M_ - gemmOff, LDA, LDT, LDB);
         }
       }
       EIGEN_IF_CONSTEXPR(!isBRowMajor) {
-        int64_t indA_i  = isFWDSolve ? M_ : M - 1 - M_;
-        int64_t indB_i  = isFWDSolve ? M_ : 0;
-        int64_t offB_1  = isFWDSolve ? 0  : (bM-1)*bkL;
-        copyBToRowMajor<Scalar, true,  true>(B_arr + indB_i + k*LDB, LDB, bK, B_temp, bkL, bM);
-        triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(
-          &A_arr[idA<isARowMajor>(indA_i, indA_i, LDA)], B_temp + offB_1, bM, bkL, LDA, bkL);
-        copyBToRowMajor<Scalar, false, true>(B_arr + indB_i + k*LDB, LDB, bK, B_temp, bkL, bM);
+        int64_t indA_i = isFWDSolve ? M_ : M - 1 - M_;
+        int64_t indB_i = isFWDSolve ? M_ : 0;
+        int64_t offB_1 = isFWDSolve ? 0 : (bM - 1) * bkL;
+        copyBToRowMajor<Scalar, true, true>(B_arr + indB_i + k * LDB, LDB, bK, B_temp, bkL, bM);
+        triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(&A_arr[idA<isARowMajor>(indA_i, indA_i, LDA)],
+                                                                       B_temp + offB_1, bM, bkL, LDA, bkL);
+        copyBToRowMajor<Scalar, false, true>(B_arr + indB_i + k * LDB, LDB, bK, B_temp, bkL, bM);
       }
       else {
         int64_t ind = isFWDSolve ? M_ : M - 1 - M_;
-        triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(
-          &A_arr[idA<isARowMajor>(ind, ind, LDA)], B_arr + k + ind*LDB, bM, bK, LDA, LDB);
+        triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(&A_arr[idA<isARowMajor>(ind, ind, LDA)],
+                                                                       B_arr + k + ind * LDB, bM, bK, LDA, LDB);
       }
     }
   }
-  EIGEN_IF_CONSTEXPR(!isBRowMajor) free(B_temp);
-}
 
-template <typename Scalar, bool isARowMajor = true, bool isCRowMajor = true>
-void gemmKer(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr,
-                int64_t M, int64_t N, int64_t K,
-                int64_t LDA, int64_t LDB, int64_t LDC) {
-  gemmKernel<Scalar, isARowMajor, isCRowMajor, true, true>(B_arr, A_arr, C_arr, N, M, K, LDB, LDA, LDC);
+#if !defined(EIGEN_NO_MALLOC)
+  EIGEN_IF_CONSTEXPR(!isBRowMajor) handmade_aligned_free(B_temp);
+#endif
 }
 
-
 // Template specializations of trsmKernelL/R for float/double and inner strides of 1.
-#if defined(EIGEN_USE_AVX512_TRSM_KERNELS)
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
-struct trsm_kernels;
+#if (EIGEN_USE_AVX512_TRSM_KERNELS)
+#if (EIGEN_USE_AVX512_TRSM_R_KERNELS)
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct trsmKernelR;
 
 template <typename Index, int Mode, int TriStorageOrder>
-struct trsm_kernels<float, Index, Mode, false, TriStorageOrder, 1>{
-  static void trsmKernelL(Index size, Index otherSize, const float* _tri, Index triStride,
-    float* _other, Index otherIncr, Index otherStride);
-  static void trsmKernelR(Index size, Index otherSize, const float* _tri, Index triStride,
-    float* _other, Index otherIncr, Index otherStride);
+struct trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1> {
+  static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+                     Index otherStride);
 };
 
 template <typename Index, int Mode, int TriStorageOrder>
-struct trsm_kernels<double, Index, Mode, false, TriStorageOrder, 1>{
-  static void trsmKernelL(Index size, Index otherSize, const double* _tri, Index triStride,
-    double* _other, Index otherIncr, Index otherStride);
-  static void trsmKernelR(Index size, Index otherSize, const double* _tri, Index triStride,
-    double* _other, Index otherIncr, Index otherStride);
+struct trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1> {
+  static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+                     Index otherStride);
 };
 
 template <typename Index, int Mode, int TriStorageOrder>
-EIGEN_DONT_INLINE void trsm_kernels<float, Index, Mode, false, TriStorageOrder, 1>::trsmKernelL(
-  Index size, Index otherSize,
-  const float* _tri, Index triStride,
-  float* _other, Index otherIncr, Index otherStride)
-{
+EIGEN_DONT_INLINE void trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1>::kernel(
+    Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+    Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
-  triSolve<float, TriStorageOrder==RowMajor, false, (Mode&Lower)==Lower, (Mode & UnitDiag)!=0>(
-    const_cast<float*>(_tri), _other, size, otherSize, triStride, otherStride);
+  triSolve<float, TriStorageOrder != RowMajor, true, (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(
+      const_cast<float *>(_tri), _other, size, otherSize, triStride, otherStride);
 }
 
 template <typename Index, int Mode, int TriStorageOrder>
-EIGEN_DONT_INLINE void trsm_kernels<float, Index, Mode, false, TriStorageOrder, 1>::trsmKernelR(
-    Index size, Index otherSize,
-    const float* _tri, Index triStride,
-    float* _other, Index otherIncr, Index otherStride)
-{
+EIGEN_DONT_INLINE void trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1>::kernel(
+    Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+    Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
-  triSolve<float, TriStorageOrder!=RowMajor, true, (Mode&Lower)!=Lower, (Mode & UnitDiag)!=0>(
-    const_cast<float*>(_tri), _other, size, otherSize, triStride, otherStride);
+  triSolve<double, TriStorageOrder != RowMajor, true, (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(
+      const_cast<double *>(_tri), _other, size, otherSize, triStride, otherStride);
+}
+#endif  // (EIGEN_USE_AVX512_TRSM_R_KERNELS)
+
+// These trsm kernels require temporary memory allocation
+#if (EIGEN_USE_AVX512_TRSM_L_KERNELS)
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct trsmKernelL;
+
+template <typename Index, int Mode, int TriStorageOrder>
+struct trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1> {
+  static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+                     Index otherStride);
+};
+
+template <typename Index, int Mode, int TriStorageOrder>
+struct trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1> {
+  static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+                     Index otherStride);
+};
+
+template <typename Index, int Mode, int TriStorageOrder>
+EIGEN_DONT_INLINE void trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1>::kernel(
+    Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+    Index otherStride) {
+  EIGEN_UNUSED_VARIABLE(otherIncr);
+  triSolve<float, TriStorageOrder == RowMajor, false, (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(
+      const_cast<float *>(_tri), _other, size, otherSize, triStride, otherStride);
 }
 
 template <typename Index, int Mode, int TriStorageOrder>
-EIGEN_DONT_INLINE void trsm_kernels<double, Index, Mode, false, TriStorageOrder, 1>::trsmKernelL(
-  Index size, Index otherSize,
-  const double* _tri, Index triStride,
-  double* _other, Index otherIncr, Index otherStride)
-{
+EIGEN_DONT_INLINE void trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1>::kernel(
+    Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+    Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
-  triSolve<double, TriStorageOrder==RowMajor, false, (Mode&Lower)==Lower, (Mode & UnitDiag)!=0>(
-    const_cast<double*>(_tri), _other, size, otherSize, triStride, otherStride);
+  triSolve<double, TriStorageOrder == RowMajor, false, (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(
+      const_cast<double *>(_tri), _other, size, otherSize, triStride, otherStride);
 }
-
-template <typename Index, int Mode, int TriStorageOrder>
-EIGEN_DONT_INLINE void trsm_kernels<double, Index, Mode, false, TriStorageOrder, 1>::trsmKernelR(
-    Index size, Index otherSize,
-    const double* _tri, Index triStride,
-    double* _other, Index otherIncr, Index otherStride)
-{
-  EIGEN_UNUSED_VARIABLE(otherIncr);
-  triSolve<double, TriStorageOrder!=RowMajor, true, (Mode&Lower)!=Lower, (Mode & UnitDiag)!=0>(
-    const_cast<double*>(_tri), _other, size, otherSize, triStride, otherStride);
-}
-#endif //EIGEN_USE_AVX512_TRSM_KERNELS
-}
-}
-#endif //EIGEN_TRSM_KERNEL_IMPL_H
+#endif  // EIGEN_USE_AVX512_TRSM_L_KERNELS
+#endif  // EIGEN_USE_AVX512_TRSM_KERNELS
+}  // namespace internal
+}  // namespace Eigen
+#endif  // EIGEN_TRSM_KERNEL_IMPL_H
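
The template arguments chosen in the specializations above can be read as follows (a summary of the code, not added behavior): a right-side solve X * T = B is performed as the transposed left-side solve T^T * X^T = B^T, so relative to trsmKernelL the triangular storage order flips, the right-hand side is treated as row-major, and forward substitution is selected when T is upper triangular (since T^T is then lower triangular).

  // Left  (trsmKernelL): triSolve<Scalar, TriStorageOrder == RowMajor, /*isBRowMajor*/ false,
  //                               /*isFWDSolve*/ (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(...)
  // Right (trsmKernelR): triSolve<Scalar, TriStorageOrder != RowMajor, /*isBRowMajor*/ true,
  //                               /*isFWDSolve*/ (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(...)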
diff --git a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
index 22cb1c9..032937c 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
+++ b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
@@ -11,8 +11,7 @@
 #define EIGEN_UNROLLS_IMPL_H
 
 template <bool isARowMajor = true>
-static EIGEN_ALWAYS_INLINE
-int64_t idA(int64_t i, int64_t j, int64_t LDA) {
+static EIGEN_ALWAYS_INLINE int64_t idA(int64_t i, int64_t j, int64_t LDA) {
   EIGEN_IF_CONSTEXPR(isARowMajor) return i * LDA + j;
   else return i + j * LDA;
 }
@@ -59,23 +58,81 @@
 
 template <int64_t N>
 EIGEN_ALWAYS_INLINE auto remMask(int64_t m) {
-  EIGEN_IF_CONSTEXPR( N == 16) { return 0xFFFF >> (16 - m); }
-  else EIGEN_IF_CONSTEXPR( N == 8) { return 0xFF >> (8 - m); }
-  else EIGEN_IF_CONSTEXPR( N == 4) { return 0x0F >> (4 - m); }
+  EIGEN_IF_CONSTEXPR(N == 16) { return 0xFFFF >> (16 - m); }
+  else EIGEN_IF_CONSTEXPR(N == 8) {
+    return 0xFF >> (8 - m);
+  }
+  else EIGEN_IF_CONSTEXPR(N == 4) {
+    return 0x0F >> (4 - m);
+  }
   return 0;
 }
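
For reference, the masks this produces for m active lanes follow directly from the shifts above:

  remMask<16>(5) == 0xFFFF >> 11 == 0x001F   // low 5 lanes enabled
  remMask<8>(3)  == 0xFF >> 5    == 0x07     // low 3 lanes enabled
  remMask<4>(2)  == 0x0F >> 2    == 0x03     // low 2 lanes enabled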
 
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet, 8> &kernel);
+
+template <>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet16f, 8> &kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+
+  kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+  kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+
+  T0 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[4]), 0x4E));
+  T0 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[0], T0);
+  T4 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[0]), 0x4E));
+  T4 = _mm512_mask_blend_ps(0xF0F0, T4, kernel.packet[4]);
+  T1 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[5]), 0x4E));
+  T1 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[1], T1);
+  T5 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[1]), 0x4E));
+  T5 = _mm512_mask_blend_ps(0xF0F0, T5, kernel.packet[5]);
+  T2 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[6]), 0x4E));
+  T2 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[2], T2);
+  T6 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[2]), 0x4E));
+  T6 = _mm512_mask_blend_ps(0xF0F0, T6, kernel.packet[6]);
+  T3 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[7]), 0x4E));
+  T3 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[3], T3);
+  T7 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[3]), 0x4E));
+  T7 = _mm512_mask_blend_ps(0xF0F0, T7, kernel.packet[7]);
+
+  kernel.packet[0] = T0;
+  kernel.packet[1] = T1;
+  kernel.packet[2] = T2;
+  kernel.packet[3] = T3;
+  kernel.packet[4] = T4;
+  kernel.packet[5] = T5;
+  kernel.packet[6] = T6;
+  kernel.packet[7] = T7;
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet8d, 8> &kernel) {
+  ptranspose(kernel);
+}
+
 /***
  * Unrolls for transposed C stores
  */
 template <typename Scalar>
 class trans {
-public:
+ public:
   using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
   using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
-
   /***********************************
   * Auxiliary Functions for:
    *  - storeC
@@ -91,70 +148,67 @@
    * (endN <= PacketSize) is required to handle the fp32 case, see comments in transStoreC
    *
    **/
-  template<int64_t endN, int64_t counter, int64_t unrollN, int64_t packetIndexOffset, bool remM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 && endN <= PacketSize)>
-  aux_storeC(Scalar *C_arr, int64_t LDC,
-             PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t remM_ = 0) {
-    constexpr int64_t counterReverse = endN-counter;
+  template <int64_t endN, int64_t counter, int64_t unrollN, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 && endN <= PacketSize)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
     constexpr int64_t startN = counterReverse;
 
     EIGEN_IF_CONSTEXPR(startN < EIGEN_AVX_MAX_NUM_ROW) {
       EIGEN_IF_CONSTEXPR(remM) {
         pstoreu<Scalar>(
-          C_arr + LDC*startN,
-          padd(ploadu<vecHalf>((const Scalar*)C_arr + LDC*startN, remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
-               preinterpret<vecHalf>(zmm.packet[packetIndexOffset + (unrollN/PacketSize)*startN]),
-               remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
-          remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
+            C_arr + LDC * startN,
+            padd(ploadu<vecHalf>((const Scalar *)C_arr + LDC * startN, remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
+                 preinterpret<vecHalf>(zmm.packet[packetIndexOffset + (unrollN / PacketSize) * startN]),
+                 remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
+            remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
       }
       else {
-        pstoreu<Scalar>(
-          C_arr + LDC*startN,
-          padd(ploadu<vecHalf>((const Scalar*)C_arr + LDC*startN),
-               preinterpret<vecHalf>(zmm.packet[packetIndexOffset + (unrollN/PacketSize)*startN])));
+        pstoreu<Scalar>(C_arr + LDC * startN,
+                        padd(ploadu<vecHalf>((const Scalar *)C_arr + LDC * startN),
+                             preinterpret<vecHalf>(zmm.packet[packetIndexOffset + (unrollN / PacketSize) * startN])));
       }
     }
-    else { // This block is only needed for fp32 case
+    else {  // This block is only needed for fp32 case
       // Reinterpret as __m512 for _mm512_shuffle_f32x4
       vecFullFloat zmm2vecFullFloat = preinterpret<vecFullFloat>(
-        zmm.packet[packetIndexOffset + (unrollN/PacketSize)*(startN - EIGEN_AVX_MAX_NUM_ROW)]);
+          zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)]);
       // Swap lower and upper half of avx register.
-      zmm.packet[packetIndexOffset + (unrollN/PacketSize)*(startN - EIGEN_AVX_MAX_NUM_ROW)] =
-        preinterpret<vec>(_mm512_shuffle_f32x4(zmm2vecFullFloat, zmm2vecFullFloat, 0b01001110));
+      zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)] =
+          preinterpret<vec>(_mm512_shuffle_f32x4(zmm2vecFullFloat, zmm2vecFullFloat, 0b01001110));
 
       EIGEN_IF_CONSTEXPR(remM) {
         pstoreu<Scalar>(
-          C_arr + LDC*startN,
-          padd(ploadu<vecHalf>((const Scalar*)C_arr + LDC*startN,
-                               remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
-               preinterpret<vecHalf>(zmm.packet[packetIndexOffset + (unrollN/PacketSize)*(startN-EIGEN_AVX_MAX_NUM_ROW)])),
-          remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
+            C_arr + LDC * startN,
+            padd(ploadu<vecHalf>((const Scalar *)C_arr + LDC * startN, remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
+                 preinterpret<vecHalf>(
+                     zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)])),
+            remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
       }
       else {
         pstoreu<Scalar>(
-          C_arr + LDC*startN,
-          padd(ploadu<vecHalf>((const Scalar*)C_arr + LDC*startN),
-               preinterpret<vecHalf>(zmm.packet[packetIndexOffset + (unrollN/PacketSize)*(startN-EIGEN_AVX_MAX_NUM_ROW)])));
+            C_arr + LDC * startN,
+            padd(ploadu<vecHalf>((const Scalar *)C_arr + LDC * startN),
+                 preinterpret<vecHalf>(
+                     zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)])));
       }
     }
     aux_storeC<endN, counter - 1, unrollN, packetIndexOffset, remM>(C_arr, LDC, zmm, remM_);
   }
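// Illustrative sketch, not from the patch: the immediate 0b01001110 (0x4E) used above selects
// the 128-bit lanes in the order [2,3,0,1], so _mm512_shuffle_f32x4(x, x, 0x4E) swaps the
// lower and upper 256-bit halves of a zmm register; the fp32-only block relies on this so the
// upper eight floats become reachable through the half-width (vecHalf) store.
#include <immintrin.h>

inline __m512 swap_256bit_halves(__m512 x) {
  return _mm512_shuffle_f32x4(x, x, 0x4E);  // result lanes = [x2, x3, x0, x1]
}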
 
-  template<int64_t endN, int64_t counter, int64_t unrollN, int64_t packetIndexOffset, bool remM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<!(counter > 0 && endN <= PacketSize)>
-  aux_storeC(Scalar *C_arr, int64_t LDC,
-             PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t remM_ = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(C_arr);
-      EIGEN_UNUSED_VARIABLE(LDC);
-      EIGEN_UNUSED_VARIABLE(zmm);
-      EIGEN_UNUSED_VARIABLE(remM_);
-    }
+  template <int64_t endN, int64_t counter, int64_t unrollN, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<!(counter > 0 && endN <= PacketSize)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t remM_ = 0) {
+    EIGEN_UNUSED_VARIABLE(C_arr);
+    EIGEN_UNUSED_VARIABLE(LDC);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(remM_);
+  }
 
-  template<int64_t endN, int64_t unrollN, int64_t packetIndexOffset, bool remM>
-  static EIGEN_ALWAYS_INLINE
-  void storeC(Scalar *C_arr, int64_t LDC,
-              PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t remM_ = 0){
+  template <int64_t endN, int64_t unrollN, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE void storeC(Scalar *C_arr, int64_t LDC,
+                                         PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                         int64_t remM_ = 0) {
     aux_storeC<endN, endN, unrollN, packetIndexOffset, remM>(C_arr, LDC, zmm, remM_);
   }
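// Illustrative sketch, not from the patch (hypothetical names): aux_storeC and its empty
// overload form a compile-time loop -- the enable_if pair recurses while counter > 0, and
// counterReverse = endN - counter makes the iterations run in increasing order. A minimal
// standalone version of the same pattern:
#include <cstdint>
#include <type_traits>

template <int64_t end, int64_t counter, typename Op>
inline std::enable_if_t<(counter > 0)> unroll(Op op) {
  constexpr int64_t index = end - counter;            // 0, 1, ..., end - 1
  op(std::integral_constant<int64_t, index>{});
  unroll<end, counter - 1>(op);
}

template <int64_t end, int64_t counter, typename Op>
inline std::enable_if_t<(counter <= 0)> unroll(Op) {} // base case: nothing left to do

// Usage: unroll<4, 4>(op) invokes op with integral_constant<int64_t, 0> ... <int64_t, 3>.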
 
@@ -183,30 +237,29 @@
    * EIGEN_AVX_MAX_NUM_ROW. packetIndexOffset is used to select which "block" of
    * avx registers are being transposed.
    */
-  template<int64_t unrollN, int64_t packetIndexOffset>
-  static EIGEN_ALWAYS_INLINE
-  void transpose(PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+  template <int64_t unrollN, int64_t packetIndexOffset>
+  static EIGEN_ALWAYS_INLINE void transpose(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
     // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted
     // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller.
-    constexpr int64_t zmmStride = unrollN/PacketSize;
-    PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> r;
-    r.packet[0] = zmm.packet[packetIndexOffset + zmmStride*0];
-    r.packet[1] = zmm.packet[packetIndexOffset + zmmStride*1];
-    r.packet[2] = zmm.packet[packetIndexOffset + zmmStride*2];
-    r.packet[3] = zmm.packet[packetIndexOffset + zmmStride*3];
-    r.packet[4] = zmm.packet[packetIndexOffset + zmmStride*4];
-    r.packet[5] = zmm.packet[packetIndexOffset + zmmStride*5];
-    r.packet[6] = zmm.packet[packetIndexOffset + zmmStride*6];
-    r.packet[7] = zmm.packet[packetIndexOffset + zmmStride*7];
-    ptranspose(r);
-    zmm.packet[packetIndexOffset + zmmStride*0] = r.packet[0];
-    zmm.packet[packetIndexOffset + zmmStride*1] = r.packet[1];
-    zmm.packet[packetIndexOffset + zmmStride*2] = r.packet[2];
-    zmm.packet[packetIndexOffset + zmmStride*3] = r.packet[3];
-    zmm.packet[packetIndexOffset + zmmStride*4] = r.packet[4];
-    zmm.packet[packetIndexOffset + zmmStride*5] = r.packet[5];
-    zmm.packet[packetIndexOffset + zmmStride*6] = r.packet[6];
-    zmm.packet[packetIndexOffset + zmmStride*7] = r.packet[7];
+    constexpr int64_t zmmStride = unrollN / PacketSize;
+    PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> r;
+    r.packet[0] = zmm.packet[packetIndexOffset + zmmStride * 0];
+    r.packet[1] = zmm.packet[packetIndexOffset + zmmStride * 1];
+    r.packet[2] = zmm.packet[packetIndexOffset + zmmStride * 2];
+    r.packet[3] = zmm.packet[packetIndexOffset + zmmStride * 3];
+    r.packet[4] = zmm.packet[packetIndexOffset + zmmStride * 4];
+    r.packet[5] = zmm.packet[packetIndexOffset + zmmStride * 5];
+    r.packet[6] = zmm.packet[packetIndexOffset + zmmStride * 6];
+    r.packet[7] = zmm.packet[packetIndexOffset + zmmStride * 7];
+    trans8x8blocks(r);
+    zmm.packet[packetIndexOffset + zmmStride * 0] = r.packet[0];
+    zmm.packet[packetIndexOffset + zmmStride * 1] = r.packet[1];
+    zmm.packet[packetIndexOffset + zmmStride * 2] = r.packet[2];
+    zmm.packet[packetIndexOffset + zmmStride * 3] = r.packet[3];
+    zmm.packet[packetIndexOffset + zmmStride * 4] = r.packet[4];
+    zmm.packet[packetIndexOffset + zmmStride * 5] = r.packet[5];
+    zmm.packet[packetIndexOffset + zmmStride * 6] = r.packet[6];
+    zmm.packet[packetIndexOffset + zmmStride * 7] = r.packet[7];
   }
 };
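// Illustrative sketch, not from the patch (hypothetical helper name): remMask<N>(m)
// conceptually builds a mask selecting the first m lanes, and the masked ploadu/pstoreu
// calls above correspond to AVX512 masked loads/stores such as:
#include <immintrin.h>
#include <cstdint>

// Accumulate into the first `rem` floats of C only (1 <= rem <= 16); lanes outside the mask
// keep their old contents in memory because the store is masked.
inline void masked_tail_accumulate(float *C, __m512 acc, int64_t rem) {
  const __mmask16 k = static_cast<__mmask16>((1u << rem) - 1u);  // low `rem` bits set
  const __m512 c = _mm512_maskz_loadu_ps(k, C);                  // masked-off lanes read as 0
  _mm512_mask_storeu_ps(C, k, _mm512_add_ps(c, acc));
}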
 
@@ -226,7 +279,7 @@
  */
 template <typename Scalar>
 class transB {
-public:
+ public:
   using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
   using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
@@ -246,33 +299,31 @@
    * 1-D unroll
    *      for(startN = 0; startN < endN; startN++)
    **/
-  template<int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_loadB(Scalar *B_arr, int64_t LDB,
-            PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
-    constexpr int64_t counterReverse = endN-counter;
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadB(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+      int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
     constexpr int64_t startN = counterReverse;
 
     EIGEN_IF_CONSTEXPR(remM) {
-      ymm.packet[packetIndexOffset + startN] = ploadu<vecHalf>(
-        (const Scalar*)&B_arr[startN*LDB], remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
+      ymm.packet[packetIndexOffset + startN] =
+          ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB], remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
     }
-    else
-      ymm.packet[packetIndexOffset + startN] = ploadu<vecHalf>((const Scalar*)&B_arr[startN*LDB]);
+    else ymm.packet[packetIndexOffset + startN] = ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB]);
 
-    aux_loadB<endN, counter-1, packetIndexOffset, remM>(B_arr, LDB, ymm, remM_);
+    aux_loadB<endN, counter - 1, packetIndexOffset, remM>(B_arr, LDB, ymm, remM_);
   }
 
-  template<int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_loadB(Scalar *B_arr, int64_t LDB,
-            PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(B_arr);
-      EIGEN_UNUSED_VARIABLE(LDB);
-      EIGEN_UNUSED_VARIABLE(ymm);
-      EIGEN_UNUSED_VARIABLE(remM_);
-    }
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadB(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+      int64_t remM_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(ymm);
+    EIGEN_UNUSED_VARIABLE(remM_);
+  }
 
   /**
    * aux_storeB
@@ -280,36 +331,31 @@
    * 1-D unroll
    *      for(startN = 0; startN < endN; startN++)
    **/
-  template<int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remK, bool remM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_storeB(Scalar *B_arr, int64_t LDB,
-             PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t rem_ = 0) {
-    constexpr int64_t counterReverse = endN-counter;
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remK, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeB(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t rem_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
     constexpr int64_t startN = counterReverse;
 
-    EIGEN_IF_CONSTEXPR( remK || remM) {
-      pstoreu<Scalar>(
-        &B_arr[startN*LDB],
-             ymm.packet[packetIndexOffset + startN],
-             remMask<EIGEN_AVX_MAX_NUM_ROW>(rem_));
+    EIGEN_IF_CONSTEXPR(remK || remM) {
+      pstoreu<Scalar>(&B_arr[startN * LDB], ymm.packet[packetIndexOffset + startN],
+                      remMask<EIGEN_AVX_MAX_NUM_ROW>(rem_));
     }
     else {
-      pstoreu<Scalar>(&B_arr[startN*LDB], ymm.packet[packetIndexOffset + startN]);
+      pstoreu<Scalar>(&B_arr[startN * LDB], ymm.packet[packetIndexOffset + startN]);
     }
 
-    aux_storeB<endN, counter-1, packetIndexOffset, remK, remM>(B_arr, LDB, ymm, rem_);
+    aux_storeB<endN, counter - 1, packetIndexOffset, remK, remM>(B_arr, LDB, ymm, rem_);
   }
 
-  template<int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remK, bool remM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_storeB(Scalar *B_arr, int64_t LDB,
-             PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t rem_ = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(B_arr);
-      EIGEN_UNUSED_VARIABLE(LDB);
-      EIGEN_UNUSED_VARIABLE(ymm);
-      EIGEN_UNUSED_VARIABLE(rem_);
-    }
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remK, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeB(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(ymm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
 
   /**
    * aux_loadBBlock
@@ -317,32 +363,27 @@
    * 1-D unroll
    *      for(startN = 0; startN < endN; startN += EIGEN_AVX_MAX_NUM_ROW)
    **/
-  template<int64_t endN, int64_t counter, bool toTemp, bool remM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_loadBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
-                 PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
-                 int64_t remM_ = 0) {
-    constexpr int64_t counterReverse = endN-counter;
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadBBlock(
+      Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+      PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
     constexpr int64_t startN = counterReverse;
-    transB::template loadB<EIGEN_AVX_MAX_NUM_ROW,startN, false>(&B_temp[startN], LDB_, ymm);
-    aux_loadBBlock<endN, counter-EIGEN_AVX_MAX_NUM_ROW, toTemp, remM>(
-      B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    transB::template loadB<EIGEN_AVX_MAX_NUM_ROW, startN, false>(&B_temp[startN], LDB_, ymm);
+    aux_loadBBlock<endN, counter - EIGEN_AVX_MAX_NUM_ROW, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
   }
 
-  template<int64_t endN, int64_t counter, bool toTemp, bool remM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_loadBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
-                 PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
-                 int64_t remM_ = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(B_arr);
-      EIGEN_UNUSED_VARIABLE(LDB);
-      EIGEN_UNUSED_VARIABLE(B_temp);
-      EIGEN_UNUSED_VARIABLE(LDB_);
-      EIGEN_UNUSED_VARIABLE(ymm);
-      EIGEN_UNUSED_VARIABLE(remM_);
-    }
-
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadBBlock(
+      Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+      PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(B_temp);
+    EIGEN_UNUSED_VARIABLE(LDB_);
+    EIGEN_UNUSED_VARIABLE(ymm);
+    EIGEN_UNUSED_VARIABLE(remM_);
+  }
 
   /**
    * aux_storeBBlock
@@ -350,88 +391,75 @@
    * 1-D unroll
    *      for(startN = 0; startN < endN; startN += EIGEN_AVX_MAX_NUM_ROW)
    **/
-  template<int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remK_>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_storeBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
-                  PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
-                  int64_t remM_ = 0) {
-    constexpr int64_t counterReverse = endN-counter;
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remK_>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeBBlock(
+      Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+      PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
     constexpr int64_t startN = counterReverse;
 
     EIGEN_IF_CONSTEXPR(toTemp) {
-      transB::template storeB<EIGEN_AVX_MAX_NUM_ROW,startN, remK_ != 0, false>(
-        &B_temp[startN], LDB_, ymm, remK_);
+      transB::template storeB<EIGEN_AVX_MAX_NUM_ROW, startN, remK_ != 0, false>(&B_temp[startN], LDB_, ymm, remK_);
     }
     else {
-      transB::template storeB<std::min(EIGEN_AVX_MAX_NUM_ROW,endN),startN, false, remM>(
-        &B_arr[0 + startN*LDB], LDB, ymm, remM_);
+      transB::template storeB<std::min(EIGEN_AVX_MAX_NUM_ROW, endN), startN, false, remM>(&B_arr[0 + startN * LDB], LDB,
+                                                                                          ymm, remM_);
     }
-    aux_storeBBlock<endN, counter-EIGEN_AVX_MAX_NUM_ROW, toTemp, remM, remK_>(
-      B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    aux_storeBBlock<endN, counter - EIGEN_AVX_MAX_NUM_ROW, toTemp, remM, remK_>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
   }
 
-  template<int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remK_>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_storeBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
-                  PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
-                  int64_t remM_ = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(B_arr);
-      EIGEN_UNUSED_VARIABLE(LDB);
-      EIGEN_UNUSED_VARIABLE(B_temp);
-      EIGEN_UNUSED_VARIABLE(LDB_);
-      EIGEN_UNUSED_VARIABLE(ymm);
-      EIGEN_UNUSED_VARIABLE(remM_);
-    }
-
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remK_>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeBBlock(
+      Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+      PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(B_temp);
+    EIGEN_UNUSED_VARIABLE(LDB_);
+    EIGEN_UNUSED_VARIABLE(ymm);
+    EIGEN_UNUSED_VARIABLE(remM_);
+  }
 
   /********************************************************
    * Wrappers for aux_XXXX to hide counter parameter
    ********************************************************/
 
-  template<int64_t endN, int64_t packetIndexOffset, bool remM>
-  static EIGEN_ALWAYS_INLINE
-  void loadB(Scalar *B_arr, int64_t LDB,
-             PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+  template <int64_t endN, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE void loadB(Scalar *B_arr, int64_t LDB,
+                                        PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                        int64_t remM_ = 0) {
     aux_loadB<endN, endN, packetIndexOffset, remM>(B_arr, LDB, ymm, remM_);
   }
 
-  template<int64_t endN, int64_t packetIndexOffset, bool remK, bool remM>
-  static EIGEN_ALWAYS_INLINE
-  void storeB(Scalar *B_arr, int64_t LDB,
-              PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t rem_ = 0) {
+  template <int64_t endN, int64_t packetIndexOffset, bool remK, bool remM>
+  static EIGEN_ALWAYS_INLINE void storeB(Scalar *B_arr, int64_t LDB,
+                                         PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                         int64_t rem_ = 0) {
     aux_storeB<endN, endN, packetIndexOffset, remK, remM>(B_arr, LDB, ymm, rem_);
   }
 
-  template<int64_t unrollN, bool toTemp, bool remM>
-  static EIGEN_ALWAYS_INLINE
-  void loadBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
-                  PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
-                  int64_t remM_ = 0) {
-    EIGEN_IF_CONSTEXPR(toTemp) {
-      transB::template loadB<unrollN,0,remM>(&B_arr[0],LDB, ymm, remM_);
-    }
+  template <int64_t unrollN, bool toTemp, bool remM>
+  static EIGEN_ALWAYS_INLINE void loadBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+                                             PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                             int64_t remM_ = 0) {
+    EIGEN_IF_CONSTEXPR(toTemp) { transB::template loadB<unrollN, 0, remM>(&B_arr[0], LDB, ymm, remM_); }
     else {
-      aux_loadBBlock<unrollN, unrollN, toTemp, remM>(
-        B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      aux_loadBBlock<unrollN, unrollN, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
   }
 
-  template<int64_t unrollN, bool toTemp, bool remM, int64_t remK_>
-  static EIGEN_ALWAYS_INLINE
-  void storeBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
-                   PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
-                   int64_t remM_ = 0) {
-    aux_storeBBlock<unrollN, unrollN, toTemp, remM, remK_>(
-      B_arr, LDB, B_temp, LDB_, ymm, remM_);
+  template <int64_t unrollN, bool toTemp, bool remM, int64_t remK_>
+  static EIGEN_ALWAYS_INLINE void storeBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+                                              PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                              int64_t remM_ = 0) {
+    aux_storeBBlock<unrollN, unrollN, toTemp, remM, remK_>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
   }
 
-  template<int64_t packetIndexOffset>
-  static EIGEN_ALWAYS_INLINE
-  void transposeLxL(PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm){
+  template <int64_t packetIndexOffset>
+  static EIGEN_ALWAYS_INLINE void transposeLxL(PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm) {
     // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted
     // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller.
-    PacketBlock<vecHalf,EIGEN_AVX_MAX_NUM_ROW> r;
+    PacketBlock<vecHalf, EIGEN_AVX_MAX_NUM_ROW> r;
     r.packet[0] = ymm.packet[packetIndexOffset + 0];
     r.packet[1] = ymm.packet[packetIndexOffset + 1];
     r.packet[2] = ymm.packet[packetIndexOffset + 2];
@@ -451,10 +479,10 @@
     ymm.packet[packetIndexOffset + 7] = r.packet[7];
   }
 
-  template<int64_t unrollN, bool toTemp, bool remM>
-  static EIGEN_ALWAYS_INLINE
-  void transB_kernel(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
-                     PacketBlock<vecHalf,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+  template <int64_t unrollN, bool toTemp, bool remM>
+  static EIGEN_ALWAYS_INLINE void transB_kernel(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+                                                PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                                int64_t remM_ = 0) {
     constexpr int64_t U3 = PacketSize * 3;
     constexpr int64_t U2 = PacketSize * 2;
     constexpr int64_t U1 = PacketSize * 1;
@@ -467,70 +495,70 @@
      */
     EIGEN_IF_CONSTEXPR(unrollN == U3) {
       // load LxU3 B col major, transpose LxU3 row major
-      constexpr int64_t maxUBlock = std::min(3*EIGEN_AVX_MAX_NUM_ROW, U3);
-      transB::template loadBBlock<maxUBlock,toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
-      transB::template transposeLxL<0*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-      transB::template transposeLxL<1*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-      transB::template transposeLxL<2*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-      transB::template storeBBlock<maxUBlock,toTemp, remM,0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      constexpr int64_t maxUBlock = std::min(3 * EIGEN_AVX_MAX_NUM_ROW, U3);
+      transB::template loadBBlock<maxUBlock, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template storeBBlock<maxUBlock, toTemp, remM, 0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
 
-      EIGEN_IF_CONSTEXPR( maxUBlock < U3) {
-        transB::template loadBBlock<maxUBlock,toTemp, remM>(&B_arr[maxUBlock*LDB], LDB, &B_temp[maxUBlock], LDB_, ymm, remM_);
-        transB::template transposeLxL<0*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-        transB::template transposeLxL<1*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-        transB::template transposeLxL<2*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-        transB::template storeBBlock<maxUBlock,toTemp, remM,0>(&B_arr[maxUBlock*LDB], LDB, &B_temp[maxUBlock], LDB_, ymm, remM_);
+      EIGEN_IF_CONSTEXPR(maxUBlock < U3) {
+        transB::template loadBBlock<maxUBlock, toTemp, remM>(&B_arr[maxUBlock * LDB], LDB, &B_temp[maxUBlock], LDB_,
+                                                             ymm, remM_);
+        transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+        transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+        transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+        transB::template storeBBlock<maxUBlock, toTemp, remM, 0>(&B_arr[maxUBlock * LDB], LDB, &B_temp[maxUBlock], LDB_,
+                                                                 ymm, remM_);
       }
     }
     else EIGEN_IF_CONSTEXPR(unrollN == U2) {
       // load LxU2 B col major, transpose LxU2 row major
-      constexpr int64_t maxUBlock = std::min(3*EIGEN_AVX_MAX_NUM_ROW, U2);
-      transB::template loadBBlock<maxUBlock,toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
-      transB::template transposeLxL<0*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-      transB::template transposeLxL<1*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-      EIGEN_IF_CONSTEXPR(maxUBlock < U2) transB::template transposeLxL<2*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-      transB::template storeBBlock<maxUBlock,toTemp,remM,0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      constexpr int64_t maxUBlock = std::min(3 * EIGEN_AVX_MAX_NUM_ROW, U2);
+      transB::template loadBBlock<maxUBlock, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      EIGEN_IF_CONSTEXPR(maxUBlock < U2) transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template storeBBlock<maxUBlock, toTemp, remM, 0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
 
-      EIGEN_IF_CONSTEXPR( maxUBlock < U2) {
-        transB::template loadBBlock<EIGEN_AVX_MAX_NUM_ROW,toTemp, remM>(
-          &B_arr[maxUBlock*LDB], LDB, &B_temp[maxUBlock], LDB_, ymm, remM_);
+      EIGEN_IF_CONSTEXPR(maxUBlock < U2) {
+        transB::template loadBBlock<EIGEN_AVX_MAX_NUM_ROW, toTemp, remM>(&B_arr[maxUBlock * LDB], LDB,
+                                                                         &B_temp[maxUBlock], LDB_, ymm, remM_);
         transB::template transposeLxL<0>(ymm);
-        transB::template storeBBlock<EIGEN_AVX_MAX_NUM_ROW,toTemp,remM,0>(
-          &B_arr[maxUBlock*LDB], LDB, &B_temp[maxUBlock], LDB_, ymm, remM_);
+        transB::template storeBBlock<EIGEN_AVX_MAX_NUM_ROW, toTemp, remM, 0>(&B_arr[maxUBlock * LDB], LDB,
+                                                                             &B_temp[maxUBlock], LDB_, ymm, remM_);
       }
     }
     else EIGEN_IF_CONSTEXPR(unrollN == U1) {
       // load LxU1 B col major, transpose LxU1 row major
-      transB::template loadBBlock<U1,toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template loadBBlock<U1, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
       transB::template transposeLxL<0>(ymm);
-      EIGEN_IF_CONSTEXPR(EIGEN_AVX_MAX_NUM_ROW < U1) {
-        transB::template transposeLxL<1*EIGEN_AVX_MAX_NUM_ROW>(ymm);
-      }
-      transB::template storeBBlock<U1,toTemp,remM,0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      EIGEN_IF_CONSTEXPR(EIGEN_AVX_MAX_NUM_ROW < U1) { transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm); }
+      transB::template storeBBlock<U1, toTemp, remM, 0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
     else EIGEN_IF_CONSTEXPR(unrollN == 8 && U1 > 8) {
       // load Lx8 B col major, transpose Lx8 row major
-      transB::template loadBBlock<8,toTemp,remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template loadBBlock<8, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
       transB::template transposeLxL<0>(ymm);
-      transB::template storeBBlock<8,toTemp,remM,8>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template storeBBlock<8, toTemp, remM, 8>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
     else EIGEN_IF_CONSTEXPR(unrollN == 4 && U1 > 4) {
       // load Lx4 B col major, transpose Lx4 row major
-      transB::template loadBBlock<4,toTemp,remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template loadBBlock<4, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
       transB::template transposeLxL<0>(ymm);
-      transB::template storeBBlock<4,toTemp,remM,4>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template storeBBlock<4, toTemp, remM, 4>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
     else EIGEN_IF_CONSTEXPR(unrollN == 2) {
       // load Lx2 B col major, transpose Lx2 row major
-      transB::template loadBBlock<2,toTemp,remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template loadBBlock<2, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
       transB::template transposeLxL<0>(ymm);
-      transB::template storeBBlock<2,toTemp,remM,2>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template storeBBlock<2, toTemp, remM, 2>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
     else EIGEN_IF_CONSTEXPR(unrollN == 1) {
       // load Lx1 B col major, transpose Lx1 row major
-      transB::template loadBBlock<1,toTemp,remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template loadBBlock<1, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
       transB::template transposeLxL<0>(ymm);
-      transB::template storeBBlock<1,toTemp,remM,1>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template storeBBlock<1, toTemp, remM, 1>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
   }
 };
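// Illustrative sketch, not from the patch (hypothetical names): transB_kernel loads an
// L x unrollN panel of column-major B (L = EIGEN_AVX_MAX_NUM_ROW) and writes it back
// transposed, either to the B_temp buffer or in place. A scalar reference of that reordering:
#include <cstdint>

template <typename Scalar>
inline void transpose_block_reference(const Scalar *src, int64_t LDS, Scalar *dst, int64_t LDD,
                                      int64_t rows, int64_t cols) {
  for (int64_t j = 0; j < cols; ++j)        // column j of src starts at src + j * LDS
    for (int64_t i = 0; i < rows; ++i)
      dst[i * LDD + j] = src[j * LDS + i];  // element (i, j) of src lands at (j, i) of dst
}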
@@ -549,10 +577,8 @@
  */
 template <typename Scalar>
 class trsm {
-public:
-  using vec = typename std::conditional<std::is_same<Scalar, float>::value,
-                                        vecFullFloat,
-                                        vecFullDouble>::type;
+ public:
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
@@ -570,35 +596,33 @@
    *      for(startM = 0; startM < endM; startM++)
    *        for(startK = 0; startK < endK; startK++)
    **/
-  template<bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_loadRHS(Scalar* B_arr, int64_t LDB, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+  template <bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadRHS(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    constexpr int64_t counterReverse = endM * endK - counter;
+    constexpr int64_t startM = counterReverse / (endK);
+    constexpr int64_t startK = counterReverse % endK;
 
-    constexpr int64_t counterReverse = endM*endK-counter;
-    constexpr int64_t startM = counterReverse/(endK);
-    constexpr int64_t startK = counterReverse%endK;
-
-    constexpr int64_t packetIndex = startM*endK + startK;
+    constexpr int64_t packetIndex = startM * endK + startK;
     constexpr int64_t startM_ = isFWDSolve ? startM : -startM;
-    const int64_t rhsIndex = (startK*PacketSize) + startM_*LDB;
+    const int64_t rhsIndex = (startK * PacketSize) + startM_ * LDB;
     EIGEN_IF_CONSTEXPR(krem) {
       RHSInPacket.packet[packetIndex] = ploadu<vec>(&B_arr[rhsIndex], remMask<PacketSize>(rem));
     }
     else {
       RHSInPacket.packet[packetIndex] = ploadu<vec>(&B_arr[rhsIndex]);
     }
-    aux_loadRHS<isFWDSolve,endM, endK, counter-1, krem>(B_arr, LDB, RHSInPacket, rem);
+    aux_loadRHS<isFWDSolve, endM, endK, counter - 1, krem>(B_arr, LDB, RHSInPacket, rem);
   }
 
-  template<bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_loadRHS(Scalar* B_arr, int64_t LDB, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(B_arr);
-      EIGEN_UNUSED_VARIABLE(LDB);
-      EIGEN_UNUSED_VARIABLE(RHSInPacket);
-      EIGEN_UNUSED_VARIABLE(rem);
-    }
+  template <bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadRHS(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(rem);
+  }
 
   /**
    * aux_storeRHS
@@ -607,34 +631,33 @@
    *      for(startM = 0; startM < endM; startM++)
    *        for(startK = 0; startK < endK; startK++)
    **/
-  template<bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_storeRHS(Scalar* B_arr, int64_t LDB, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
-    constexpr int64_t counterReverse = endM*endK-counter;
-    constexpr int64_t startM = counterReverse/(endK);
-    constexpr int64_t startK = counterReverse%endK;
+  template <bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeRHS(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    constexpr int64_t counterReverse = endM * endK - counter;
+    constexpr int64_t startM = counterReverse / (endK);
+    constexpr int64_t startK = counterReverse % endK;
 
-    constexpr int64_t packetIndex = startM*endK + startK;
+    constexpr int64_t packetIndex = startM * endK + startK;
     constexpr int64_t startM_ = isFWDSolve ? startM : -startM;
-    const int64_t rhsIndex = (startK*PacketSize) + startM_*LDB;
+    const int64_t rhsIndex = (startK * PacketSize) + startM_ * LDB;
     EIGEN_IF_CONSTEXPR(krem) {
       pstoreu<Scalar>(&B_arr[rhsIndex], RHSInPacket.packet[packetIndex], remMask<PacketSize>(rem));
     }
     else {
       pstoreu<Scalar>(&B_arr[rhsIndex], RHSInPacket.packet[packetIndex]);
     }
-    aux_storeRHS<isFWDSolve,endM, endK, counter-1, krem>(B_arr, LDB, RHSInPacket, rem);
+    aux_storeRHS<isFWDSolve, endM, endK, counter - 1, krem>(B_arr, LDB, RHSInPacket, rem);
   }
 
-  template<bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_storeRHS(Scalar* B_arr, int64_t LDB, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(B_arr);
-      EIGEN_UNUSED_VARIABLE(LDB);
-      EIGEN_UNUSED_VARIABLE(RHSInPacket);
-      EIGEN_UNUSED_VARIABLE(rem);
-    }
+  template <bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeRHS(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(rem);
+  }
 
   /**
    * aux_divRHSByDiag
@@ -644,20 +667,20 @@
    * 1-D unroll
    *      for(startK = 0; startK < endK; startK++)
    **/
-  template<int64_t currM, int64_t endK, int64_t counter>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 && currM >= 0)>
-  aux_divRHSByDiag(PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
-    constexpr int64_t counterReverse = endK-counter;
+  template <int64_t currM, int64_t endK, int64_t counter>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 && currM >= 0)> aux_divRHSByDiag(
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    constexpr int64_t counterReverse = endK - counter;
     constexpr int64_t startK = counterReverse;
 
-    constexpr int64_t packetIndex = currM*endK + startK;
+    constexpr int64_t packetIndex = currM * endK + startK;
     RHSInPacket.packet[packetIndex] = pmul(AInPacket.packet[currM], RHSInPacket.packet[packetIndex]);
-    aux_divRHSByDiag<currM, endK, counter-1>(RHSInPacket, AInPacket);
+    aux_divRHSByDiag<currM, endK, counter - 1>(RHSInPacket, AInPacket);
   }
 
-  template<int64_t currM, int64_t endK, int64_t counter>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<!(counter > 0 && currM >= 0)>
-  aux_divRHSByDiag(PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+  template <int64_t currM, int64_t endK, int64_t counter>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<!(counter > 0 && currM >= 0)> aux_divRHSByDiag(
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
     EIGEN_UNUSED_VARIABLE(RHSInPacket);
     EIGEN_UNUSED_VARIABLE(AInPacket);
   }
@@ -669,52 +692,53 @@
    *      for(startM = initM; startM < endM; startM++)
    *        for(startK = 0; startK < endK; startK++)
    **/
-  template<bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t initM, int64_t endM, int64_t endK, int64_t counter, int64_t currentM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_updateRHS(Scalar *A_arr, int64_t LDA, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t initM, int64_t endM, int64_t endK,
+            int64_t counter, int64_t currentM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_updateRHS(
+      Scalar *A_arr, int64_t LDA, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    constexpr int64_t counterReverse = (endM - initM) * endK - counter;
+    constexpr int64_t startM = initM + counterReverse / (endK);
+    constexpr int64_t startK = counterReverse % endK;
 
-      constexpr int64_t counterReverse = (endM-initM)*endK-counter;
-      constexpr int64_t startM = initM + counterReverse/(endK);
-      constexpr int64_t startK = counterReverse%endK;
-
-      // For each row of A, first update all corresponding RHS
-      constexpr int64_t packetIndex = startM*endK + startK;
-      EIGEN_IF_CONSTEXPR(currentM > 0) {
-        RHSInPacket.packet[packetIndex] =
-          pnmadd(AInPacket.packet[startM],
-                 RHSInPacket.packet[(currentM-1)*endK+startK],
+    // For each row of A, first update all corresponding RHS
+    constexpr int64_t packetIndex = startM * endK + startK;
+    EIGEN_IF_CONSTEXPR(currentM > 0) {
+      RHSInPacket.packet[packetIndex] =
+          pnmadd(AInPacket.packet[startM], RHSInPacket.packet[(currentM - 1) * endK + startK],
                  RHSInPacket.packet[packetIndex]);
-      }
+    }
 
-      EIGEN_IF_CONSTEXPR(startK == endK - 1) {
-        // Once all RHS for previous row of A is updated, we broadcast the next element in the column A_{i, currentM}.
-        EIGEN_IF_CONSTEXPR(startM == currentM && !isUnitDiag) {
-          // If diagonal is not unit, we broadcast reciprocals of diagonals AinPacket.packet[currentM].
-          // This will be used in divRHSByDiag
-          EIGEN_IF_CONSTEXPR(isFWDSolve)
-            AInPacket.packet[currentM] = pset1<vec>(Scalar(1)/A_arr[idA<isARowMajor>(currentM,currentM,LDA)]);
-          else
-            AInPacket.packet[currentM] = pset1<vec>(Scalar(1)/A_arr[idA<isARowMajor>(-currentM,-currentM,LDA)]);
-        }
-        else {
-          // Broadcast next off diagonal element of A
-          EIGEN_IF_CONSTEXPR(isFWDSolve)
-            AInPacket.packet[startM] = pset1<vec>(A_arr[idA<isARowMajor>(startM,currentM,LDA)]);
-          else
-            AInPacket.packet[startM] = pset1<vec>(A_arr[idA<isARowMajor>(-startM,-currentM,LDA)]);
-        }
+    EIGEN_IF_CONSTEXPR(startK == endK - 1) {
+      // Once all RHS for the previous row of A are updated, we broadcast the next element in the column A_{i, currentM}.
+      EIGEN_IF_CONSTEXPR(startM == currentM && !isUnitDiag) {
+        // If the diagonal is not unit, we broadcast the reciprocal of the diagonal into AInPacket.packet[currentM].
+        // This will be used in divRHSByDiag.
+        EIGEN_IF_CONSTEXPR(isFWDSolve)
+        AInPacket.packet[currentM] = pset1<vec>(Scalar(1) / A_arr[idA<isARowMajor>(currentM, currentM, LDA)]);
+        else AInPacket.packet[currentM] = pset1<vec>(Scalar(1) / A_arr[idA<isARowMajor>(-currentM, -currentM, LDA)]);
       }
+      else {
+        // Broadcast next off diagonal element of A
+        EIGEN_IF_CONSTEXPR(isFWDSolve)
+        AInPacket.packet[startM] = pset1<vec>(A_arr[idA<isARowMajor>(startM, currentM, LDA)]);
+        else AInPacket.packet[startM] = pset1<vec>(A_arr[idA<isARowMajor>(-startM, -currentM, LDA)]);
+      }
+    }
 
-      aux_updateRHS<isARowMajor, isFWDSolve, isUnitDiag, initM, endM, endK, counter - 1, currentM>(A_arr, LDA, RHSInPacket, AInPacket);
+    aux_updateRHS<isARowMajor, isFWDSolve, isUnitDiag, initM, endM, endK, counter - 1, currentM>(
+        A_arr, LDA, RHSInPacket, AInPacket);
   }
 
-  template<bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t initM, int64_t endM, int64_t endK, int64_t counter, int64_t currentM>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_updateRHS(Scalar *A_arr, int64_t LDA, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
-        EIGEN_UNUSED_VARIABLE(A_arr);
-        EIGEN_UNUSED_VARIABLE(LDA);
-        EIGEN_UNUSED_VARIABLE(RHSInPacket);
-        EIGEN_UNUSED_VARIABLE(AInPacket);
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t initM, int64_t endM, int64_t endK,
+            int64_t counter, int64_t currentM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_updateRHS(
+      Scalar *A_arr, int64_t LDA, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    EIGEN_UNUSED_VARIABLE(A_arr);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(AInPacket);
   }
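// Illustrative sketch, not from the patch: per packet, the update in aux_updateRHS is
// rhs_i <- rhs_i - A(i, m) * rhs_m; with raw AVX512 intrinsics (double precision shown)
// the pset1/pnmadd pair corresponds roughly to:
#include <immintrin.h>

inline __m512d update_rhs_lane(double a_im, __m512d rhs_m, __m512d rhs_i) {
  const __m512d a = _mm512_set1_pd(a_im);    // pset1: broadcast A(i, m) to all lanes
  return _mm512_fnmadd_pd(a, rhs_m, rhs_i);  // pnmadd: rhs_i - a * rhs_m
}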
 
   /**
@@ -724,10 +748,10 @@
    *      for(startM = 0; startM < endM; startM++)
    **/
   template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t endM, int64_t counter, int64_t numK>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_triSolveMicroKernel(Scalar *A_arr, int64_t LDA, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
-
-    constexpr int64_t counterReverse = endM-counter;
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_triSolveMicroKernel(
+      Scalar *A_arr, int64_t LDA, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    constexpr int64_t counterReverse = endM - counter;
     constexpr int64_t startM = counterReverse;
 
     constexpr int64_t currentM = startM;
@@ -738,31 +762,31 @@
     // this is handled with enable_if to prevent out-of-bound warnings
     // from the compiler
     EIGEN_IF_CONSTEXPR(!isUnitDiag && startM > 0)
-      trsm::template divRHSByDiag<startM-1, numK>(RHSInPacket, AInPacket);
+    trsm::template divRHSByDiag<startM - 1, numK>(RHSInPacket, AInPacket);
 
     // After division, the rhs corresponding to subsequent rows of A can be partially updated
     // We also broadcast the reciprocal of the next diagonal to AInPacket.packet[currentM] (if needed)
     // to be used in the next iteration.
-    trsm::template
-      updateRHS<isARowMajor, isFWDSolve, isUnitDiag, startM, endM, numK, currentM>(
-        A_arr, LDA, RHSInPacket, AInPacket);
+    trsm::template updateRHS<isARowMajor, isFWDSolve, isUnitDiag, startM, endM, numK, currentM>(A_arr, LDA, RHSInPacket,
+                                                                                                AInPacket);
 
     // Handle division for the RHS corresponding to the final row of A.
-    EIGEN_IF_CONSTEXPR(!isUnitDiag && startM == endM-1)
-      trsm::template divRHSByDiag<startM, numK>(RHSInPacket, AInPacket);
+    EIGEN_IF_CONSTEXPR(!isUnitDiag && startM == endM - 1)
+    trsm::template divRHSByDiag<startM, numK>(RHSInPacket, AInPacket);
 
-    aux_triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, endM, counter - 1, numK>(A_arr, LDA, RHSInPacket, AInPacket);
+    aux_triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, endM, counter - 1, numK>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
   }
 
   template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t endM, int64_t counter, int64_t numK>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_triSolveMicroKernel(Scalar *A_arr, int64_t LDA, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket)
-    {
-      EIGEN_UNUSED_VARIABLE(A_arr);
-      EIGEN_UNUSED_VARIABLE(LDA);
-      EIGEN_UNUSED_VARIABLE(RHSInPacket);
-      EIGEN_UNUSED_VARIABLE(AInPacket);
-    }
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_triSolveMicroKernel(
+      Scalar *A_arr, int64_t LDA, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    EIGEN_UNUSED_VARIABLE(A_arr);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(AInPacket);
+  }
 
   /********************************************************
    * Wrappers for aux_XXXX to hide counter parameter
@@ -772,40 +796,42 @@
    * Load endMxendK block of B to RHSInPacket
    * Masked loads are used for cases where endK is not a multiple of PacketSize
    */
-  template<bool isFWDSolve, int64_t endM, int64_t endK, bool krem = false>
-  static EIGEN_ALWAYS_INLINE
-  void loadRHS(Scalar* B_arr, int64_t LDB, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
-    aux_loadRHS<isFWDSolve, endM, endK, endM*endK, krem>(B_arr, LDB, RHSInPacket, rem);
+  template <bool isFWDSolve, int64_t endM, int64_t endK, bool krem = false>
+  static EIGEN_ALWAYS_INLINE void loadRHS(Scalar *B_arr, int64_t LDB,
+                                          PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    aux_loadRHS<isFWDSolve, endM, endK, endM * endK, krem>(B_arr, LDB, RHSInPacket, rem);
   }
 
   /**
   * Store RHSInPacket to the endM x endK block of B
   * Masked stores are used for cases where endK is not a multiple of PacketSize
    */
-  template<bool isFWDSolve, int64_t endM, int64_t endK, bool krem = false>
-  static EIGEN_ALWAYS_INLINE
-  void storeRHS(Scalar* B_arr, int64_t LDB, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
-    aux_storeRHS<isFWDSolve, endM, endK, endM*endK, krem>(B_arr, LDB, RHSInPacket, rem);
+  template <bool isFWDSolve, int64_t endM, int64_t endK, bool krem = false>
+  static EIGEN_ALWAYS_INLINE void storeRHS(Scalar *B_arr, int64_t LDB,
+                                           PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    aux_storeRHS<isFWDSolve, endM, endK, endM * endK, krem>(B_arr, LDB, RHSInPacket, rem);
   }
 
   /**
   * Only used if the triangular matrix has non-unit diagonal values
    */
-  template<int64_t currM, int64_t endK>
-  static EIGEN_ALWAYS_INLINE
-  void divRHSByDiag(PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+  template <int64_t currM, int64_t endK>
+  static EIGEN_ALWAYS_INLINE void divRHSByDiag(PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+                                               PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
     aux_divRHSByDiag<currM, endK, endK>(RHSInPacket, AInPacket);
   }
 
   /**
    * Update right-hand sides (stored in avx registers)
    * Traversing along the column A_{i,currentM}, where currentM <= i <= endM, and broadcasting each value to AInPacket.
-  **/
-  template<bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t startM, int64_t endM, int64_t endK, int64_t currentM>
-  static EIGEN_ALWAYS_INLINE
-  void updateRHS(Scalar *A_arr, int64_t LDA, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
-    aux_updateRHS<isARowMajor, isFWDSolve, isUnitDiag, startM, endM, endK, (endM-startM)*endK, currentM>(
-      A_arr, LDA, RHSInPacket, AInPacket);
+   **/
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t startM, int64_t endM, int64_t endK,
+            int64_t currentM>
+  static EIGEN_ALWAYS_INLINE void updateRHS(Scalar *A_arr, int64_t LDA,
+                                            PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+                                            PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    aux_updateRHS<isARowMajor, isFWDSolve, isUnitDiag, startM, endM, endK, (endM - startM) * endK, currentM>(
+        A_arr, LDA, RHSInPacket, AInPacket);
   }
 
   /**
@@ -815,11 +841,11 @@
    * isUnitDiag: true => triangular matrix has unit diagonal.
    */
   template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t endM, int64_t numK>
-  static EIGEN_ALWAYS_INLINE
-  void triSolveMicroKernel(Scalar *A_arr, int64_t LDA, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec,EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
-    static_assert( numK >= 1 && numK <= 3, "numK out of range" );
-    aux_triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, endM, endM, numK>(
-      A_arr, LDA, RHSInPacket, AInPacket);
+  static EIGEN_ALWAYS_INLINE void triSolveMicroKernel(Scalar *A_arr, int64_t LDA,
+                                                      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+                                                      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    static_assert(numK >= 1 && numK <= 3, "numK out of range");
+    aux_triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, endM, endM, numK>(A_arr, LDA, RHSInPacket, AInPacket);
   }
 };
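// Illustrative sketch, not from the patch (hypothetical names, layouts are my assumption):
// the micro-kernel above is a register-blocked triangular solve. For the forward-solve,
// non-unit-diagonal case it computes the same result as this scalar forward substitution
// over `nrhs` right-hand sides, with the division replaced in the kernel by a broadcast
// multiply with the precomputed reciprocal 1/A(m, m):
#include <cstdint>

template <typename Scalar>
void trisolve_reference(const Scalar *A, int64_t LDA,  // lower-triangular, row-major
                        Scalar *B, int64_t LDB,        // n x nrhs right-hand sides, row-major
                        int64_t n, int64_t nrhs) {
  for (int64_t m = 0; m < n; ++m) {
    for (int64_t k = 0; k < nrhs; ++k)
      B[m * LDB + k] /= A[m * LDA + m];                     // divRHSByDiag
    for (int64_t i = m + 1; i < n; ++i)                     // updateRHS for the rows below
      for (int64_t k = 0; k < nrhs; ++k)
        B[i * LDB + k] -= A[i * LDA + m] * B[m * LDB + k];  // pnmadd analogue
  }
}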
 
@@ -830,7 +856,7 @@
  */
 template <typename Scalar, bool isAdd>
 class gemm {
-public:
+ public:
   using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
@@ -850,23 +876,22 @@
    *      for(startM = 0; startM < endM; startM++)
    *        for(startN = 0; startN < endN; startN++)
    **/
-  template<int64_t endM, int64_t endN, int64_t counter>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_setzero(PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
-    constexpr int64_t counterReverse = endM*endN-counter;
-    constexpr int64_t startM = counterReverse/(endN);
-    constexpr int64_t startN = counterReverse%endN;
+  template <int64_t endM, int64_t endN, int64_t counter>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_setzero(
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    constexpr int64_t counterReverse = endM * endN - counter;
+    constexpr int64_t startM = counterReverse / (endN);
+    constexpr int64_t startN = counterReverse % endN;
 
-    zmm.packet[startN*endM + startM] = pzero(zmm.packet[startN*endM + startM]);
-    aux_setzero<endM, endN, counter-1>(zmm);
+    zmm.packet[startN * endM + startM] = pzero(zmm.packet[startN * endM + startM]);
+    aux_setzero<endM, endN, counter - 1>(zmm);
   }
 
-  template<int64_t endM, int64_t endN, int64_t counter>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_setzero(PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm)
-    {
-      EIGEN_UNUSED_VARIABLE(zmm);
-    }
+  template <int64_t endM, int64_t endN, int64_t counter>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_setzero(
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    EIGEN_UNUSED_VARIABLE(zmm);
+  }
 
   /**
    * aux_updateC
@@ -875,34 +900,31 @@
    *      for(startM = 0; startM < endM; startM++)
    *        for(startN = 0; startN < endN; startN++)
    **/
-  template<int64_t endM, int64_t endN, int64_t counter, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_updateC(Scalar *C_arr, int64_t LDC, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+  template <int64_t endM, int64_t endN, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_updateC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
-    constexpr int64_t counterReverse = endM*endN-counter;
-    constexpr int64_t startM = counterReverse/(endN);
-    constexpr int64_t startN = counterReverse%endN;
+    constexpr int64_t counterReverse = endM * endN - counter;
+    constexpr int64_t startM = counterReverse / (endN);
+    constexpr int64_t startN = counterReverse % endN;
 
     EIGEN_IF_CONSTEXPR(rem)
-      zmm.packet[startN*endM + startM] =
-      padd(ploadu<vec>(&C_arr[(startN) * LDC + startM*PacketSize], remMask<PacketSize>(rem_)),
-           zmm.packet[startN*endM + startM],
-           remMask<PacketSize>(rem_));
-    else
-      zmm.packet[startN*endM + startM] =
-        padd(ploadu<vec>(&C_arr[(startN) * LDC + startM*PacketSize]), zmm.packet[startN*endM + startM]);
-    aux_updateC<endM, endN, counter-1, rem>(C_arr, LDC, zmm, rem_);
+    zmm.packet[startN * endM + startM] =
+        padd(ploadu<vec>(&C_arr[(startN)*LDC + startM * PacketSize], remMask<PacketSize>(rem_)),
+             zmm.packet[startN * endM + startM], remMask<PacketSize>(rem_));
+    else zmm.packet[startN * endM + startM] =
+        padd(ploadu<vec>(&C_arr[(startN)*LDC + startM * PacketSize]), zmm.packet[startN * endM + startM]);
+    aux_updateC<endM, endN, counter - 1, rem>(C_arr, LDC, zmm, rem_);
   }
 
-  template<int64_t endM, int64_t endN, int64_t counter, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_updateC(Scalar *C_arr, int64_t LDC, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(C_arr);
-      EIGEN_UNUSED_VARIABLE(LDC);
-      EIGEN_UNUSED_VARIABLE(zmm);
-      EIGEN_UNUSED_VARIABLE(rem_);
-    }
+  template <int64_t endM, int64_t endN, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_updateC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(C_arr);
+    EIGEN_UNUSED_VARIABLE(LDC);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
 
   /**
    * aux_storeC
@@ -911,30 +933,29 @@
    *      for(startM = 0; startM < endM; startM++)
    *        for(startN = 0; startN < endN; startN++)
    **/
-  template<int64_t endM, int64_t endN, int64_t counter, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_storeC(Scalar *C_arr, int64_t LDC, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+  template <int64_t endM, int64_t endN, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
-    constexpr int64_t counterReverse = endM*endN-counter;
-    constexpr int64_t startM = counterReverse/(endN);
-    constexpr int64_t startN = counterReverse%endN;
+    constexpr int64_t counterReverse = endM * endN - counter;
+    constexpr int64_t startM = counterReverse / (endN);
+    constexpr int64_t startN = counterReverse % endN;
 
     EIGEN_IF_CONSTEXPR(rem)
-      pstoreu<Scalar>(&C_arr[(startN) * LDC + startM*PacketSize], zmm.packet[startN*endM + startM], remMask<PacketSize>(rem_));
-    else
-      pstoreu<Scalar>(&C_arr[(startN) * LDC + startM*PacketSize], zmm.packet[startN*endM + startM]);
-    aux_storeC<endM, endN, counter-1, rem>(C_arr, LDC, zmm, rem_);
+    pstoreu<Scalar>(&C_arr[(startN)*LDC + startM * PacketSize], zmm.packet[startN * endM + startM],
+                    remMask<PacketSize>(rem_));
+    else pstoreu<Scalar>(&C_arr[(startN)*LDC + startM * PacketSize], zmm.packet[startN * endM + startM]);
+    aux_storeC<endM, endN, counter - 1, rem>(C_arr, LDC, zmm, rem_);
   }
 
-  template<int64_t endM, int64_t endN, int64_t counter, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_storeC(Scalar *C_arr, int64_t LDC, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0)
-    {
-      EIGEN_UNUSED_VARIABLE(C_arr);
-      EIGEN_UNUSED_VARIABLE(LDC);
-      EIGEN_UNUSED_VARIABLE(zmm);
-      EIGEN_UNUSED_VARIABLE(rem_);
-    }
+  template <int64_t endM, int64_t endN, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(C_arr);
+    EIGEN_UNUSED_VARIABLE(LDC);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
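// Illustrative standalone sketch (not part of this patch): the aux_updateC /
// aux_storeC pairs above all use the same compile-time unrolling idiom -- one
// `counter` template parameter is peeled off per recursive instantiation,
// decoded into (startM, startN), and SFINAE on `counter <= 0` ends the
// recursion. The names aux_visit / visitAll below are hypothetical.
#include <cstdint>
#include <type_traits>

template <int64_t endM, int64_t endN, int64_t counter, typename Visitor>
std::enable_if_t<(counter > 0)> aux_visit(Visitor visit) {
  constexpr int64_t counterReverse = endM * endN - counter;
  constexpr int64_t startM = counterReverse / endN;  // outer index
  constexpr int64_t startN = counterReverse % endN;  // inner index
  visit(startM, startN);                             // unrolled loop body
  aux_visit<endM, endN, counter - 1>(visit);         // next "iteration"
}

template <int64_t endM, int64_t endN, int64_t counter, typename Visitor>
std::enable_if_t<(counter <= 0)> aux_visit(Visitor) {}  // recursion end

template <int64_t endM, int64_t endN, typename Visitor>
void visitAll(Visitor visit) {  // wrapper hides the counter parameter, as below
  aux_visit<endM, endN, endM * endN>(visit);
}
// Usage: visitAll<2, 3>([](int64_t m, int64_t n) { /* 6 unrolled bodies */ });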
 
   /**
    * aux_startLoadB
@@ -942,28 +963,25 @@
    * 1-D unroll
    *      for(startL = 0; startL < endL; startL++)
    **/
-  template<int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_startLoadB(Scalar *B_t, int64_t LDB, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+  template <int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_startLoadB(
+      Scalar *B_t, int64_t LDB, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
-    constexpr int64_t counterReverse = endL-counter;
+    constexpr int64_t counterReverse = endL - counter;
     constexpr int64_t startL = counterReverse;
 
     EIGEN_IF_CONSTEXPR(rem)
-      zmm.packet[unrollM*unrollN+startL] =
-      ploadu<vec>(&B_t[(startL/unrollM)*LDB + (startL%unrollM)*PacketSize], remMask<PacketSize>(rem_));
-    else
-      zmm.packet[unrollM*unrollN+startL] = ploadu<vec>(&B_t[(startL/unrollM)*LDB + (startL%unrollM)*PacketSize]);
+    zmm.packet[unrollM * unrollN + startL] =
+        ploadu<vec>(&B_t[(startL / unrollM) * LDB + (startL % unrollM) * PacketSize], remMask<PacketSize>(rem_));
+    else zmm.packet[unrollM * unrollN + startL] =
+        ploadu<vec>(&B_t[(startL / unrollM) * LDB + (startL % unrollM) * PacketSize]);
 
-    aux_startLoadB<unrollM, unrollN, endL, counter-1, rem>(B_t, LDB, zmm, rem_);
+    aux_startLoadB<unrollM, unrollN, endL, counter - 1, rem>(B_t, LDB, zmm, rem_);
   }
 
-  template<int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_startLoadB(
-    Scalar *B_t, int64_t LDB,
-    PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0)
-  {
+  template <int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_startLoadB(
+      Scalar *B_t, int64_t LDB, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(B_t);
     EIGEN_UNUSED_VARIABLE(LDB);
     EIGEN_UNUSED_VARIABLE(zmm);
@@ -976,21 +994,20 @@
    * 1-D unroll
    *      for(startB = 0; startB < endB; startB++)
    **/
-  template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_startBCastA(Scalar *A_t, int64_t LDA, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
-    constexpr int64_t counterReverse = endB-counter;
+  template <bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_startBCastA(
+      Scalar *A_t, int64_t LDA, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    constexpr int64_t counterReverse = endB - counter;
     constexpr int64_t startB = counterReverse;
 
-    zmm.packet[unrollM*unrollN+numLoad+startB] = pload1<vec>(&A_t[idA<isARowMajor>(startB, 0,LDA)]);
+    zmm.packet[unrollM * unrollN + numLoad + startB] = pload1<vec>(&A_t[idA<isARowMajor>(startB, 0, LDA)]);
 
-    aux_startBCastA<isARowMajor, unrollM, unrollN, endB, counter-1, numLoad>(A_t, LDA, zmm);
+    aux_startBCastA<isARowMajor, unrollM, unrollN, endB, counter - 1, numLoad>(A_t, LDA, zmm);
   }
 
-  template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_startBCastA(Scalar *A_t, int64_t LDA, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm)
-  {
+  template <bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_startBCastA(
+      Scalar *A_t, int64_t LDA, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
     EIGEN_UNUSED_VARIABLE(A_t);
     EIGEN_UNUSED_VARIABLE(LDA);
     EIGEN_UNUSED_VARIABLE(zmm);
@@ -1003,33 +1020,32 @@
    * 1-D unroll
    *      for(startM = 0; startM < endM; startM++)
    **/
-  template<int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_loadB(Scalar *B_t, int64_t LDB, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+  template <int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad,
+            int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadB(
+      Scalar *B_t, int64_t LDB, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
-    if ((numLoad/endM + currK < unrollK)) {
-      constexpr int64_t counterReverse = endM-counter;
+    if ((numLoad / endM + currK < unrollK)) {
+      constexpr int64_t counterReverse = endM - counter;
       constexpr int64_t startM = counterReverse;
 
       EIGEN_IF_CONSTEXPR(rem) {
-        zmm.packet[endM*unrollN+(startM+currK*endM)%numLoad] =
-          ploadu<vec>(&B_t[(numLoad/endM + currK)*LDB + startM*PacketSize], remMask<PacketSize>(rem_));
+        zmm.packet[endM * unrollN + (startM + currK * endM) % numLoad] =
+            ploadu<vec>(&B_t[(numLoad / endM + currK) * LDB + startM * PacketSize], remMask<PacketSize>(rem_));
       }
       else {
-        zmm.packet[endM*unrollN+(startM+currK*endM)%numLoad] =
-          ploadu<vec>(&B_t[(numLoad/endM + currK)*LDB + startM*PacketSize]);
+        zmm.packet[endM * unrollN + (startM + currK * endM) % numLoad] =
+            ploadu<vec>(&B_t[(numLoad / endM + currK) * LDB + startM * PacketSize]);
       }
 
-      aux_loadB<endM, counter-1, unrollN, currK, unrollK, numLoad, numBCast, rem>(B_t, LDB, zmm, rem_);
+      aux_loadB<endM, counter - 1, unrollN, currK, unrollK, numLoad, numBCast, rem>(B_t, LDB, zmm, rem_);
     }
   }
 
-  template<int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_loadB(
-    Scalar *B_t, int64_t LDB,
-    PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0)
-  {
+  template <int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad,
+            int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadB(
+      Scalar *B_t, int64_t LDB, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(B_t);
     EIGEN_UNUSED_VARIABLE(LDB);
     EIGEN_UNUSED_VARIABLE(zmm);
@@ -1044,58 +1060,53 @@
    *        for(startN = 0; startN < endN; startN++)
    *          for(startK = 0; startK < endK; startK++)
    **/
-  template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad, int64_t numBCast, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)>
-  aux_microKernel(
-    Scalar *B_t, Scalar* A_t, int64_t LDB, int64_t LDA,
-    PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+  template <bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad,
+            int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_microKernel(
+      Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+      int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
-    constexpr int64_t counterReverse = endM*endN*endK-counter;
-    constexpr int startK = counterReverse/(endM*endN);
-    constexpr int startN = (counterReverse/(endM))%endN;
-    constexpr int startM = counterReverse%endM;
+    constexpr int64_t counterReverse = endM * endN * endK - counter;
+    constexpr int startK = counterReverse / (endM * endN);
+    constexpr int startN = (counterReverse / (endM)) % endN;
+    constexpr int startM = counterReverse % endM;
 
     EIGEN_IF_CONSTEXPR(startK == 0 && startM == 0 && startN == 0) {
-      gemm:: template
-        startLoadB<endM, endN, numLoad, rem>(B_t, LDB, zmm, rem_);
-      gemm:: template
-        startBCastA<isARowMajor, endM, endN, numBCast, numLoad>(A_t, LDA, zmm);
+      gemm::template startLoadB<endM, endN, numLoad, rem>(B_t, LDB, zmm, rem_);
+      gemm::template startBCastA<isARowMajor, endM, endN, numBCast, numLoad>(A_t, LDA, zmm);
     }
 
     {
       // Interleave FMA and Bcast
       EIGEN_IF_CONSTEXPR(isAdd) {
-        zmm.packet[startN*endM + startM] =
-          pmadd(zmm.packet[endM*endN+numLoad+(startN+startK*endN)%numBCast],
-                zmm.packet[endM*endN+(startM+startK*endM)%numLoad], zmm.packet[startN*endM + startM]);
+        zmm.packet[startN * endM + startM] =
+            pmadd(zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast],
+                  zmm.packet[endM * endN + (startM + startK * endM) % numLoad], zmm.packet[startN * endM + startM]);
       }
       else {
-        zmm.packet[startN*endM + startM] =
-          pnmadd(zmm.packet[endM*endN+numLoad+(startN+startK*endN)%numBCast],
-                 zmm.packet[endM*endN+(startM+startK*endM)%numLoad], zmm.packet[startN*endM + startM]);
+        zmm.packet[startN * endM + startM] =
+            pnmadd(zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast],
+                   zmm.packet[endM * endN + (startM + startK * endM) % numLoad], zmm.packet[startN * endM + startM]);
       }
       // Bcast
-      EIGEN_IF_CONSTEXPR(startM == endM - 1 && (numBCast + startN + startK*endN < endK*endN)) {
-        zmm.packet[endM*endN+numLoad+(startN+startK*endN)%numBCast] =
-          pload1<vec>(&A_t[idA<isARowMajor>((numBCast + startN + startK*endN)%endN,
-                                            (numBCast + startN + startK*endN)/endN, LDA)]);
+      EIGEN_IF_CONSTEXPR(startM == endM - 1 && (numBCast + startN + startK * endN < endK * endN)) {
+        zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast] = pload1<vec>(&A_t[idA<isARowMajor>(
+            (numBCast + startN + startK * endN) % endN, (numBCast + startN + startK * endN) / endN, LDA)]);
       }
     }
 
     // We have updated all accumulators, time to load next set of B's
-    EIGEN_IF_CONSTEXPR( (startN == endN - 1) && (startM == endM - 1) ) {
+    EIGEN_IF_CONSTEXPR((startN == endN - 1) && (startM == endM - 1)) {
       gemm::template loadB<endM, endN, startK, endK, numLoad, numBCast, rem>(B_t, LDB, zmm, rem_);
     }
-    aux_microKernel<isARowMajor, endM, endN, endK, counter-1, numLoad, numBCast, rem>(B_t, A_t, LDB, LDA, zmm, rem_);
-
+    aux_microKernel<isARowMajor, endM, endN, endK, counter - 1, numLoad, numBCast, rem>(B_t, A_t, LDB, LDA, zmm, rem_);
   }
 
-  template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad, int64_t numBCast, bool rem>
-  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)>
-  aux_microKernel(
-    Scalar *B_t, Scalar* A_t, int64_t LDB, int64_t LDA,
-    PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0)
-  {
+  template <bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad,
+            int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_microKernel(
+      Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+      int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(B_t);
     EIGEN_UNUSED_VARIABLE(A_t);
     EIGEN_UNUSED_VARIABLE(LDB);
@@ -1108,55 +1119,57 @@
    * Wrappers for aux_XXXX to hide counter parameter
    ********************************************************/
 
-  template<int64_t endM, int64_t endN>
-  static EIGEN_ALWAYS_INLINE
-  void setzero(PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm){
-    aux_setzero<endM, endN, endM*endN>(zmm);
+  template <int64_t endM, int64_t endN>
+  static EIGEN_ALWAYS_INLINE void setzero(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    aux_setzero<endM, endN, endM * endN>(zmm);
   }
 
   /**
    * Ideally the compiler folds these into vaddp{s,d} with an embedded memory load.
    */
-  template<int64_t endM, int64_t endN, bool rem = false>
-  static EIGEN_ALWAYS_INLINE
-  void updateC(Scalar *C_arr, int64_t LDC, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0){
+  template <int64_t endM, int64_t endN, bool rem = false>
+  static EIGEN_ALWAYS_INLINE void updateC(Scalar *C_arr, int64_t LDC,
+                                          PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                          int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
-    aux_updateC<endM, endN, endM*endN, rem>(C_arr, LDC, zmm, rem_);
+    aux_updateC<endM, endN, endM * endN, rem>(C_arr, LDC, zmm, rem_);
   }
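// Illustrative sketch (not part of this patch), assuming AVX-512F and float:
// the padd(ploadu(C), acc) pattern used by updateC, written with raw
// intrinsics. This is the shape the comment above updateC refers to -- the
// compiler typically folds the load into the add, emitting
// `vaddps zmm, zmm, [mem]`.
#include <immintrin.h>
static inline __m512 update_one_packet(const float* C_ptr, __m512 acc) {
  // Unaligned load of 16 floats from C, then accumulate into the register tile.
  return _mm512_add_ps(_mm512_loadu_ps(C_ptr), acc);
}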
 
-  template<int64_t endM, int64_t endN, bool rem = false>
-  static EIGEN_ALWAYS_INLINE
-  void storeC(Scalar *C_arr, int64_t LDC, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0){
+  template <int64_t endM, int64_t endN, bool rem = false>
+  static EIGEN_ALWAYS_INLINE void storeC(Scalar *C_arr, int64_t LDC,
+                                         PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                         int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
-    aux_storeC<endM, endN, endM*endN, rem>(C_arr, LDC, zmm, rem_);
+    aux_storeC<endM, endN, endM * endN, rem>(C_arr, LDC, zmm, rem_);
   }
 
   /**
    * Use numLoad registers for loading B at start of microKernel
-  */
-  template<int64_t unrollM, int64_t unrollN, int64_t endL, bool rem>
-  static EIGEN_ALWAYS_INLINE
-  void startLoadB(Scalar *B_t, int64_t LDB, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0){
+   */
+  template <int64_t unrollM, int64_t unrollN, int64_t endL, bool rem>
+  static EIGEN_ALWAYS_INLINE void startLoadB(Scalar *B_t, int64_t LDB,
+                                             PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                             int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
     aux_startLoadB<unrollM, unrollN, endL, endL, rem>(B_t, LDB, zmm, rem_);
   }
 
   /**
    * Use numBCast registers for broadcasting A at start of microKernel
-  */
-  template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t numLoad>
-  static EIGEN_ALWAYS_INLINE
-  void startBCastA(Scalar *A_t, int64_t LDA, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm){
+   */
+  template <bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t numLoad>
+  static EIGEN_ALWAYS_INLINE void startBCastA(Scalar *A_t, int64_t LDA,
+                                              PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
     aux_startBCastA<isARowMajor, unrollM, unrollN, endB, endB, numLoad>(A_t, LDA, zmm);
   }
 
   /**
    * Loads next set of B into vector registers between each K unroll.
-  */
-  template<int64_t endM, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
-  static EIGEN_ALWAYS_INLINE
-  void loadB(
-    Scalar *B_t, int64_t LDB, PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0){
+   */
+  template <int64_t endM, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE void loadB(Scalar *B_t, int64_t LDB,
+                                        PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                        int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
     aux_loadB<endM, endM, unrollN, currK, unrollK, numLoad, numBCast, rem>(B_t, LDB, zmm, rem_);
   }
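// Illustrative register accounting (not part of this patch), implied by the
// packet indexing above: zmm.packet[0 .. endM*endN) holds the C accumulators,
// the next numLoad slots hold B loads, and the next numBCast slots hold A
// broadcasts. The unroll parameters are chosen so the sum fits the physical
// register file; the 32-register bound below is an assumption for AVX-512 zmm.
#include <cstdint>
template <int64_t endM, int64_t endN, int64_t numLoad, int64_t numBCast>
struct RegisterBudget {
  static constexpr int64_t accumulators = endM * endN;  // C tile
  static constexpr int64_t total = accumulators + numLoad + numBCast;
  static_assert(total <= 32, "unroll parameters would spill zmm registers");
};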
@@ -1184,18 +1197,16 @@
    * From testing, there are no register spills with clang. There are register spills with GNU, which
    * causes a performance hit.
    */
-  template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t numLoad, int64_t numBCast, bool rem = false>
-  static EIGEN_ALWAYS_INLINE
-  void microKernel(
-    Scalar *B_t, Scalar* A_t, int64_t LDB, int64_t LDA,
-    PacketBlock<vec,EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0){
+  template <bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t numLoad, int64_t numBCast,
+            bool rem = false>
+  static EIGEN_ALWAYS_INLINE void microKernel(Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA,
+                                              PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                              int64_t rem_ = 0) {
     EIGEN_UNUSED_VARIABLE(rem_);
-    aux_microKernel<isARowMajor,endM, endN, endK, endM*endN*endK, numLoad, numBCast, rem>(
-      B_t, A_t, LDB, LDA, zmm, rem_);
+    aux_microKernel<isARowMajor, endM, endN, endK, endM * endN * endK, numLoad, numBCast, rem>(B_t, A_t, LDB, LDA, zmm,
+                                                                                               rem_);
   }
-
 };
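// Illustrative standalone sketch (not part of this patch): the scalar loop nest
// that microKernel unrolls completely. `acc` plays the role of the endN x endM
// accumulator tile (zmm.packet[startN*endM + startM] above); A_at and B_at are
// hypothetical accessors standing in for the idA / (LDB, PacketSize) address
// arithmetic, and isAdd selects between the pmadd and pnmadd paths.
template <int endM, int endN, int endK, typename AccA, typename AccB>
void micro_kernel_scalar(AccA A_at, AccB B_at, double acc[endN][endM], bool isAdd) {
  for (int k = 0; k < endK; ++k)        // startK varies slowest
    for (int n = 0; n < endN; ++n)      // then startN
      for (int m = 0; m < endM; ++m) {  // startM varies fastest
        const double update = A_at(n, k) * B_at(k, m);
        acc[n][m] += isAdd ? update : -update;
      }
}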
-} // namespace unrolls
+}  // namespace unrolls
 
-
-#endif //EIGEN_UNROLLS_IMPL_H
+#endif  // EIGEN_UNROLLS_IMPL_H
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index 8baced1..d28cca2 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -44,6 +44,34 @@
   return _mm512_castps512_ps256(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet16f>(const Packet16f& a) {
+  return _mm512_castps512_ps128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet8d>(const Packet8d& a) {
+  return _mm512_castpd512_pd256(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet8d>(const Packet8d& a) {
+  return _mm512_castpd512_pd128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8f>(const Packet8f& a) {
+  return _mm512_castps256_ps512(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet4f>(const Packet4f& a) {
+  return _mm512_castps128_ps512(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet4d>(const Packet4d& a) {
+  return _mm512_castpd256_pd512(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet2d>(const Packet2d& a) {
+  return _mm512_castpd128_pd512(a);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16f>(const Packet16f& a) {
   return a;
 }
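// Illustrative standalone sketch (not part of this patch): the new preinterpret
// specializations above wrap zero-cost register-width casts. In raw intrinsics
// terms (AVX-512F assumed), nothing is moved -- only the register view changes,
// and the upper lanes after widening are undefined.
#include <immintrin.h>
static inline __m128 low128_of(__m512 z) { return _mm512_castps512_ps128(z); }
static inline __m512 widen_to512(__m128 x) { return _mm512_castps128_ps512(x); }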
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index e24b5d5..4cc0a94 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -104,12 +104,6 @@
                                             12, 13, 14, 15,
                                             20, 21, 22, 23,
                                             28, 29, 30, 31};
-const static Packet16uc p16uc_GETREAL64 = {  0,  1,  2,  3,  4,  5,  6,  7,
-                                            16, 17, 18, 19, 20, 21, 22, 23};
-
-//[a,ai],[b,bi] = [ai,bi]
-const static Packet16uc p16uc_GETIMAG64 = {  8,  9, 10, 11, 12, 13, 14, 15,
-                                            24, 25, 26, 27, 28, 29, 30, 31};
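// Note on the removal above (illustrative, not part of this patch): the
// float64 complex packers further down now use vec_mergeh / vec_mergel instead
// of vec_perm with these masks. For two 2-element double vectors holding
// [a_re, a_im] and [b_re, b_im], merging the first elements gives [a_re, b_re]
// and merging the second elements gives [a_im, b_im] -- the same lanes the
// GETREAL64 / GETIMAG64 byte masks selected. Scalar analogue, with a
// hypothetical helper name:
#include <complex>
static inline void deinterleave2(const std::complex<double> src[2],
                                 double reals[2], double imags[2]) {
  reals[0] = src[0].real(); reals[1] = src[1].real();  // vec_mergeh analogue
  imags[0] = src[0].imag(); imags[1] = src[1].imag();  // vec_mergel analogue
}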
 
 /*********************************************
  * Single precision real and complex packing *
@@ -129,7 +123,7 @@
  * reason why packing for complex is broken down into several different parts, also the reason why we end up having a
  * float32/64 and complex float32/64 version.
  **/
-template<typename Scalar, typename Index, int StorageOrder>
+template<typename Scalar, int StorageOrder>
 EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
 {
   std::complex<Scalar> v;
@@ -148,7 +142,7 @@
   return v;
 }
 
-template<typename Scalar, typename Index, int StorageOrder, int N>
+template<typename Scalar, int StorageOrder, int N>
 EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
 {
   const Index depth = k2 + rows;
@@ -166,7 +160,7 @@
     {
       for(Index k = 0; k < vectorSize; k++)
       {
-        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
+        std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j + k, rhs);
 
         blockBf[rir + k] = v.real();
         blockBf[rii + k] = v.imag();
@@ -184,7 +178,7 @@
 
     for(Index i = k2; i < depth; i++)
     {
-      std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j, rhs);
+      std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j, rhs);
 
       blockBf[rir] = v.real();
       blockBf[rii] = v.imag();
@@ -197,7 +191,7 @@
   }
 }
 
-template<typename Scalar, typename Index, int StorageOrder>
+template<typename Scalar, int StorageOrder>
 EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
 {
   const Index depth = cols;
@@ -215,7 +209,7 @@
     {
       for(Index k = 0; k < vectorSize; k++)
       {
-        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
+        std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(j+k, i, lhs);
 
         blockAf[rir + k] = v.real();
         blockAf[rii + k] = v.imag();
@@ -236,7 +230,7 @@
       Index k = j;
       for(; k < rows; k++)
       {
-        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
+        std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(k, i, lhs);
 
         blockAf[rir] = v.real();
         blockAf[rii] = v.imag();
@@ -248,7 +242,7 @@
   }
 }
 
-template<typename Scalar, typename Index, int StorageOrder, int N>
+template<typename Scalar, int StorageOrder, int N>
 EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
 {
   const Index depth = k2 + rows;
@@ -285,7 +279,7 @@
   }
 }
 
-template<typename Scalar, typename Index, int StorageOrder>
+template<typename Scalar, int StorageOrder>
 EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
 {
   const Index depth = cols;
@@ -332,7 +326,7 @@
 {
   void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
   {
-    symm_pack_complex_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+    symm_pack_complex_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
@@ -341,7 +335,7 @@
 {
   void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
   {
-    symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+    symm_pack_complex_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
@@ -352,7 +346,7 @@
 {
   void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
   {
-    symm_pack_complex_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
+    symm_pack_complex_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
@@ -361,7 +355,7 @@
 {
   void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
   {
-    symm_pack_complex_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+    symm_pack_complex_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
@@ -371,7 +365,7 @@
 {
   void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
   {
-    symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+    symm_pack_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
@@ -380,7 +374,7 @@
 {
   void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
   {
-    symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+    symm_pack_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
@@ -390,7 +384,7 @@
 {
   void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
   {
-    symm_pack_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
+    symm_pack_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
@@ -399,7 +393,7 @@
 {
   void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
   {
-    symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+    symm_pack_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
@@ -414,7 +408,7 @@
  * and offset and behaves accordingly.
  **/
 
-template<typename Scalar, typename Packet, typename Index, int N>
+template<typename Scalar, typename Packet, int N>
 EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,N>& block)
 {
   const Index size = 16 / sizeof(Scalar);
@@ -429,7 +423,7 @@
 }
 
 // General template for lhs & rhs complex packing.
-template<typename Scalar, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
+template<typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
 struct dhs_cpack {
   EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
@@ -441,6 +435,7 @@
 
     for(; j + vectorSize <= rows; j+=vectorSize)
     {
+      const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
       Index i = 0;
 
       rii = rir + vectorDelta;
@@ -451,9 +446,9 @@
         PacketBlock<PacketC,8> cblock;
 
         if (UseLhs) {
-          bload<DataMapper, PacketC, Index, 2, StorageOrder, true, 4>(cblock, lhs, j, i);
+          bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
         } else {
-          bload<DataMapper, PacketC, Index, 2, StorageOrder, true, 4>(cblock, lhs, i, j);
+          bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
         }
 
         blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
@@ -480,8 +475,8 @@
           ptranspose(blocki);
         }
 
-        storeBlock<Scalar, Packet, Index, 4>(blockAt + rir, blockr);
-        storeBlock<Scalar, Packet, Index, 4>(blockAt + rii, blocki);
+        storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
+        storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
 
         rir += 4*vectorSize;
         rii += 4*vectorSize;
@@ -494,19 +489,19 @@
         if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
         {
           if (UseLhs) {
-            cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
-            cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
+            cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
+            cblock.packet[1] = lhs2.template loadPacket<PacketC>(2, i);
           } else {
-            cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
-            cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
+            cblock.packet[0] = lhs2.template loadPacket<PacketC>(i, 0);
+            cblock.packet[1] = lhs2.template loadPacket<PacketC>(i, 2);
           }
         } else {
           if (UseLhs) {
-            cblock.packet[0] = pload2(lhs(j + 0, i), lhs(j + 1, i));
-            cblock.packet[1] = pload2(lhs(j + 2, i), lhs(j + 3, i));
+            cblock.packet[0] = pload2(lhs2(0, i), lhs2(1, i));
+            cblock.packet[1] = pload2(lhs2(2, i), lhs2(3, i));
           } else {
-            cblock.packet[0] = pload2(lhs(i, j + 0), lhs(i, j + 1));
-            cblock.packet[1] = pload2(lhs(i, j + 2), lhs(i, j + 3));
+            cblock.packet[0] = pload2(lhs2(i, 0), lhs2(i, 1));
+            cblock.packet[1] = pload2(lhs2(i, 2), lhs2(i, 3));
           }
         }
 
@@ -534,16 +529,17 @@
 
       for(; j < rows; j++)
       {
+        const DataMapper lhs2 = lhs.getSubMapper(0, j);
         rii = rir + ((PanelMode) ? stride : depth);
 
         for(Index i = 0; i < depth; i++)
         {
-          blockAt[rir] = lhs(i, j).real();
+          blockAt[rir] = lhs2(i, 0).real();
 
           if(Conjugate)
-            blockAt[rii] = -lhs(i, j).imag();
+            blockAt[rii] = -lhs2(i, 0).imag();
           else
-            blockAt[rii] =  lhs(i, j).imag();
+            blockAt[rii] =  lhs2(i, 0).imag();
 
           rir += 1;
           rii += 1;
@@ -579,7 +575,7 @@
 };
 
 // General template for lhs & rhs packing.
-template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
+template<typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
 struct dhs_pack{
   EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
@@ -588,6 +584,7 @@
 
     for(; j + vectorSize <= rows; j+=vectorSize)
     {
+      const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
       Index i = 0;
 
       if(PanelMode) ri += vectorSize*offset;
@@ -597,16 +594,16 @@
         PacketBlock<Packet,4> block;
 
         if (UseLhs) {
-          bload<DataMapper, Packet, Index, 4, StorageOrder, false, 4>(block, lhs, j, i);
+          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block, lhs2, 0, i);
         } else {
-          bload<DataMapper, Packet, Index, 4, StorageOrder, false, 4>(block, lhs, i, j);
+          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block, lhs2, i, 0);
         }
         if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
         {
           ptranspose(block);
         }
 
-        storeBlock<Scalar, Packet, Index, 4>(blockA + ri, block);
+        storeBlock<Scalar, Packet, 4>(blockA + ri, block);
 
         ri += 4*vectorSize;
       }
@@ -615,22 +612,22 @@
         if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
         {
           if (UseLhs) {
-            blockA[ri+0] = lhs(j+0, i);
-            blockA[ri+1] = lhs(j+1, i);
-            blockA[ri+2] = lhs(j+2, i);
-            blockA[ri+3] = lhs(j+3, i);
+            blockA[ri+0] = lhs2(0, i);
+            blockA[ri+1] = lhs2(1, i);
+            blockA[ri+2] = lhs2(2, i);
+            blockA[ri+3] = lhs2(3, i);
           } else {
-            blockA[ri+0] = lhs(i, j+0);
-            blockA[ri+1] = lhs(i, j+1);
-            blockA[ri+2] = lhs(i, j+2);
-            blockA[ri+3] = lhs(i, j+3);
+            blockA[ri+0] = lhs2(i, 0);
+            blockA[ri+1] = lhs2(i, 1);
+            blockA[ri+2] = lhs2(i, 2);
+            blockA[ri+3] = lhs2(i, 3);
           }
         } else {
           Packet lhsV;
           if (UseLhs) {
-            lhsV = lhs.template loadPacket<Packet>(j, i);
+            lhsV = lhs2.template loadPacket<Packet>(0, i);
           } else {
-            lhsV = lhs.template loadPacket<Packet>(i, j);
+            lhsV = lhs2.template loadPacket<Packet>(i, 0);
           }
           pstore<Scalar>(blockA + ri, lhsV);
         }
@@ -647,9 +644,10 @@
 
       for(; j < rows; j++)
       {
+        const DataMapper lhs2 = lhs.getSubMapper(0, j);
         for(Index i = 0; i < depth; i++)
         {
-          blockA[ri] = lhs(i, j);
+          blockA[ri] = lhs2(i, 0);
           ri += 1;
         }
 
@@ -675,8 +673,8 @@
 };
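// Illustrative sketch (not part of Eigen) of the sub-mapper rebasing used
// throughout these packing hunks: instead of re-deriving `j`-dependent offsets
// on every element access (lhs(j + 1, i), ...), the offset is folded into a
// rebased view once per panel via getSubMapper, and all accesses inside the
// panel use small constant indices (lhs2(1, i), ...). PlainMapper below is a
// hypothetical stand-in for the real DataMapper, assuming column-major storage.
struct PlainMapper {
  const double* data;
  long stride;
  const double& operator()(long r, long c) const { return data[r + c * stride]; }
  PlainMapper getSubMapper(long r, long c) const {  // rebase the view at (r, c)
    return PlainMapper{data + r + c * stride, stride};
  }
};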
 
 // General template for lhs packing, float64 specialization.
-template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, true>
+template<typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true>
 {
   EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
@@ -685,6 +683,7 @@
 
     for(; j + vectorSize <= rows; j+=vectorSize)
     {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
       Index i = 0;
 
       if(PanelMode) ri += vectorSize*offset;
@@ -694,16 +693,16 @@
         PacketBlock<Packet2d,2> block;
         if(StorageOrder == RowMajor)
         {
-          block.packet[0] = lhs.template loadPacket<Packet2d>(j + 0, i);
-          block.packet[1] = lhs.template loadPacket<Packet2d>(j + 1, i);
+          block.packet[0] = lhs2.template loadPacket<Packet2d>(0, i);
+          block.packet[1] = lhs2.template loadPacket<Packet2d>(1, i);
 
           ptranspose(block);
         } else {
-          block.packet[0] = lhs.template loadPacket<Packet2d>(j, i + 0);
-          block.packet[1] = lhs.template loadPacket<Packet2d>(j, i + 1);
+          block.packet[0] = lhs2.template loadPacket<Packet2d>(0, i + 0);
+          block.packet[1] = lhs2.template loadPacket<Packet2d>(0, i + 1);
         }
 
-        storeBlock<double, Packet2d, Index, 2>(blockA + ri, block);
+        storeBlock<double, Packet2d, 2>(blockA + ri, block);
 
         ri += 2*vectorSize;
       }
@@ -711,10 +710,10 @@
       {
         if(StorageOrder == RowMajor)
         {
-          blockA[ri+0] = lhs(j+0, i);
-          blockA[ri+1] = lhs(j+1, i);
+          blockA[ri+0] = lhs2(0, i);
+          blockA[ri+1] = lhs2(1, i);
         } else {
-          Packet2d lhsV = lhs.template loadPacket<Packet2d>(j, i);
+          Packet2d lhsV = lhs2.template loadPacket<Packet2d>(0, i);
           pstore<double>(blockA + ri, lhsV);
         }
 
@@ -742,8 +741,8 @@
 };
 
 // General template for rhs packing, float64 specialization.
-template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, false>
+template<typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false>
 {
   EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
   {
@@ -752,6 +751,7 @@
 
     for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
     {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
       Index i = 0;
 
       if(PanelMode) ri += offset*(2*vectorSize);
@@ -762,10 +762,10 @@
         if(StorageOrder == ColMajor)
         {
           PacketBlock<Packet2d,2> block1, block2;
-          block1.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 0);
-          block1.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 1);
-          block2.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 2);
-          block2.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 3);
+          block1.packet[0] = rhs2.template loadPacket<Packet2d>(i, 0);
+          block1.packet[1] = rhs2.template loadPacket<Packet2d>(i, 1);
+          block2.packet[0] = rhs2.template loadPacket<Packet2d>(i, 2);
+          block2.packet[1] = rhs2.template loadPacket<Packet2d>(i, 3);
 
           ptranspose(block1);
           ptranspose(block2);
@@ -775,12 +775,12 @@
           pstore<double>(blockB + ri + 4, block1.packet[1]);
           pstore<double>(blockB + ri + 6, block2.packet[1]);
         } else {
-          block.packet[0] = rhs.template loadPacket<Packet2d>(i + 0, j + 0); //[a1 a2]
-          block.packet[1] = rhs.template loadPacket<Packet2d>(i + 0, j + 2); //[a3 a4]
-          block.packet[2] = rhs.template loadPacket<Packet2d>(i + 1, j + 0); //[b1 b2]
-          block.packet[3] = rhs.template loadPacket<Packet2d>(i + 1, j + 2); //[b3 b4]
+          block.packet[0] = rhs2.template loadPacket<Packet2d>(i + 0, 0); //[a1 a2]
+          block.packet[1] = rhs2.template loadPacket<Packet2d>(i + 0, 2); //[a3 a4]
+          block.packet[2] = rhs2.template loadPacket<Packet2d>(i + 1, 0); //[b1 b2]
+          block.packet[3] = rhs2.template loadPacket<Packet2d>(i + 1, 2); //[b3 b4]
 
-          storeBlock<double, Packet2d, Index, 4>(blockB + ri, block);
+          storeBlock<double, Packet2d, 4>(blockB + ri, block);
         }
 
         ri += 4*vectorSize;
@@ -789,20 +789,20 @@
       {
         if(StorageOrder == ColMajor)
         {
-          blockB[ri+0] = rhs(i, j+0);
-          blockB[ri+1] = rhs(i, j+1);
+          blockB[ri+0] = rhs2(i, 0);
+          blockB[ri+1] = rhs2(i, 1);
 
           ri += vectorSize;
 
-          blockB[ri+0] = rhs(i, j+2);
-          blockB[ri+1] = rhs(i, j+3);
+          blockB[ri+0] = rhs2(i, 2);
+          blockB[ri+1] = rhs2(i, 3);
         } else {
-          Packet2d rhsV = rhs.template loadPacket<Packet2d>(i, j);
+          Packet2d rhsV = rhs2.template loadPacket<Packet2d>(i, 0);
           pstore<double>(blockB + ri, rhsV);
 
           ri += vectorSize;
 
-          rhsV = rhs.template loadPacket<Packet2d>(i, j + 2);
+          rhsV = rhs2.template loadPacket<Packet2d>(i, 2);
           pstore<double>(blockB + ri, rhsV);
         }
         ri += vectorSize;
@@ -815,9 +815,10 @@
 
     for(; j < cols; j++)
     {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
       for(Index i = 0; i < depth; i++)
       {
-        blockB[ri] = rhs(i, j);
+        blockB[ri] = rhs2(i, 0);
         ri += 1;
       }
 
@@ -827,8 +828,8 @@
 };
 
 // General template for lhs complex packing, float64 specialization.
-template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
+template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
 {
   EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
@@ -840,6 +841,7 @@
 
     for(; j + vectorSize <= rows; j+=vectorSize)
     {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
       Index i = 0;
 
       rii = rir + vectorDelta;
@@ -851,29 +853,29 @@
 
         if(StorageOrder == ColMajor)
         {
-          cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0); //[a1 a1i]
-          cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1); //[b1 b1i]
+          cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
+          cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
 
-          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0); //[a2 a2i]
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
+          cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
+          cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
 
-          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
-          blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+          blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
+          blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
 
-          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
-          blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
+          blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
+          blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
         } else {
-          cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i); //[a1 a1i]
-          cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i); //[a2 a2i]
+          cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
+          cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
 
-          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1); //[b1 b1i]
-          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i
+          cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
+          cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
 
-          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
-          blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+          blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
+          blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
 
-          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
-          blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
+          blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+          blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
         }
 
         if(Conjugate)
@@ -882,8 +884,8 @@
           blocki.packet[1] = -blocki.packet[1];
         }
 
-        storeBlock<double, Packet, Index, 2>(blockAt + rir, blockr);
-        storeBlock<double, Packet, Index, 2>(blockAt + rii, blocki);
+        storeBlock<double, Packet, 2>(blockAt + rir, blockr);
+        storeBlock<double, Packet, 2>(blockAt + rii, blocki);
 
         rir += 2*vectorSize;
         rii += 2*vectorSize;
@@ -893,11 +895,11 @@
         PacketBlock<Packet,1> blockr, blocki;
         PacketBlock<PacketC,2> cblock;
 
-        cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
-        cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
+        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
+        cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i);
 
-        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
-        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
+        blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
 
         if(Conjugate)
         {
@@ -940,8 +942,8 @@
 };
 
 // General template for rhs complex packing, float64 specialization.
-template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
+template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
 {
   EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
   {
@@ -953,6 +955,7 @@
 
     for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
     {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
       Index i = 0;
 
       rii = rir + vectorDelta;
@@ -962,13 +965,13 @@
         PacketBlock<PacketC,4> cblock;
         PacketBlock<Packet,2> blockr, blocki;
 
-        bload<DataMapper, PacketC, Index, 2, ColMajor, false, 4>(cblock, rhs, i, j);
+        bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
 
-        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
-        blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
+        blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);
 
-        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
-        blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
+        blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+        blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
 
         if(Conjugate)
         {
@@ -976,8 +979,8 @@
           blocki.packet[1] = -blocki.packet[1];
         }
 
-        storeBlock<double, Packet, Index, 2>(blockBt + rir, blockr);
-        storeBlock<double, Packet, Index, 2>(blockBt + rii, blocki);
+        storeBlock<double, Packet, 2>(blockBt + rir, blockr);
+        storeBlock<double, Packet, 2>(blockBt + rii, blocki);
 
         rir += 2*vectorSize;
         rii += 2*vectorSize;
@@ -990,16 +993,17 @@
 
     for(; j < cols; j++)
     {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
       rii = rir + ((PanelMode) ? stride : depth);
 
       for(Index i = 0; i < depth; i++)
       {
-        blockBt[rir] = rhs(i, j).real();
+        blockBt[rir] = rhs2(i, 0).real();
 
         if(Conjugate)
-          blockBt[rii] = -rhs(i, j).imag();
+          blockBt[rii] = -rhs2(i, 0).imag();
         else
-          blockBt[rii] =  rhs(i, j).imag();
+          blockBt[rii] =  rhs2(i, 0).imag();
 
         rir += 1;
         rii += 1;
@@ -1123,7 +1127,7 @@
 // Load a PacketBlock, the N parameters make tuning gemm easier so we can add more accumulators as needed.
 //
 // full = operate (load) on the entire PacketBlock or only half
-template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
+template<typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
 EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col)
 {
   if (StorageOrder == RowMajor) {
@@ -1147,7 +1151,7 @@
   }
 }
 
-template<typename DataMapper, typename Packet, typename Index, int N>
+template<typename DataMapper, typename Packet, int N>
 EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row)
 {
   for (int M = 0; M < N; M++) {
@@ -1165,7 +1169,7 @@
 const static Packet4i mask4[4] = { {  0,  0,  0,  0 }, { -1,  0,  0,  0 }, { -1, -1,  0,  0 }, { -1, -1, -1,  0 } };
 #endif
 
-template<typename Packet, typename Index>
+template<typename Packet>
 EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows)
 {
 #if USE_P10_AND_PVIPR2_0
@@ -1180,7 +1184,7 @@
 }
 
 template<>
-EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d,Index>(const Index remaining_rows)
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows)
 {
 #if USE_P10_AND_PVIPR2_0
   Packet2d mask2 = Packet2d(vec_gendm(remaining_rows));
@@ -1406,7 +1410,7 @@
     MICRO_PREFETCHN1(ptr_imag, N); \
   }
 
-template<typename Scalar, typename Packet, typename Index, const Index accRows, const Index remaining_rows>
+template<typename Scalar, typename Packet, const Index accRows, const Index remaining_rows>
 EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
   const Scalar* &lhs_ptr,
   const Scalar* &rhs_ptr0,
@@ -1419,7 +1423,7 @@
   lhs_ptr += remaining_rows;
 }
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index remaining_rows>
+template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index remaining_rows>
 EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -1454,14 +1458,14 @@
   }
   for(; k < depth; k++)
   {
-    MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows, remaining_rows>(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
+    MICRO_EXTRA_ROW<Scalar, Packet, accRows, remaining_rows>(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
   }
 
-  bload<DataMapper, Packet, Index, 0, ColMajor, false, accRows>(acc, res, row, 0);
+  bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row, 0);
   if ((accRows == 1) || (rows >= accCols))
   {
     bscale<Packet,accRows,true>(acc, accZero0, pAlpha, pMask);
-    bstore<DataMapper, Packet, Index, accRows>(acc, res, row);
+    bstore<DataMapper, Packet, accRows>(acc, res, row);
   } else {
     bscale<Packet,accRows,false>(acc, accZero0, pAlpha, pMask);
     for(Index j = 0; j < accRows; j++) {
@@ -1490,9 +1494,9 @@
   }
 
 #define MICRO_EXTRA_ROWS(N) \
-  gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, Index, accRows, accCols, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
+  gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, accRows, accCols, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
 EIGEN_ALWAYS_INLINE void gemm_extra_row(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -1563,14 +1567,14 @@
 
 #define MICRO_STORE_ONE(iter) \
   if (unroll_factor > iter) { \
-    bload<DataMapper, Packet, Index, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
+    bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
     bscale<Packet,accRows,!(MICRO_NORMAL(iter))>(acc, accZero##iter, pAlpha, pMask); \
-    bstore<DataMapper, Packet, Index, accRows>(acc, res, row + iter*accCols); \
+    bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
   }
 
 #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
 
-template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index accCols2>
+template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
 EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -1609,10 +1613,10 @@
 }
 
 #define MICRO_UNROLL_ITER2(N, M) \
-  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, Index, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
+  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
   if (M) return;
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
 EIGEN_ALWAYS_INLINE void gemm_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -1681,14 +1685,14 @@
 
   if(remaining_rows > 0)
   {
-    gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+    gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
   }
 }
 
 #define MICRO_EXTRA_COLS(N) \
-  gemm_cols<Scalar, Packet, DataMapper, Index, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
+  gemm_cols<Scalar, Packet, DataMapper, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
+template<typename Scalar, typename Packet, typename DataMapper, const Index accCols>
 EIGEN_STRONG_INLINE void gemm_extra_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -1711,7 +1715,7 @@
 /****************
  * GEMM kernels *
  * **************/
-template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
 EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
       const Index remaining_rows = rows % accCols;
@@ -1725,12 +1729,12 @@
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        gemm_cols<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
+        gemm_cols<Scalar, Packet, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
       }
 
       if (col != cols)
       {
-        gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
+        gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
       }
 }
 
@@ -1828,7 +1832,7 @@
   MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \
   MICRO_COMPLEX_ADD_PEEL(1, 0)
 
-template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
+template<typename Scalar, typename Packet, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
 EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
   const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
   const Scalar* &rhs_ptr_real0, const Scalar* &rhs_ptr_real1, const Scalar* &rhs_ptr_real2,
@@ -1840,7 +1844,7 @@
   MICRO_COMPLEX_ADD_COLS(1)
 }
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
 EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -1888,18 +1892,18 @@
   }
   for(; k < depth; k++)
   {
-    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1, rhs_ptr_imag2, accReal0, accImag0);
+    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1, rhs_ptr_imag2, accReal0, accImag0);
   }
 
-  const bool full = (remaining_rows > accColsC);
-  bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, accRows, full>(tRes, res, row, 0);
+  constexpr bool full = (remaining_rows > accColsC);
+  bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row, 0);
   if ((accRows == 1) || (rows >= accCols))
   {
     bscalec<Packet,accRows,true>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
     bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
-    bstore<DataMapper, Packetc, Index, accRows>(acc0, res, row + 0);
+    bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
     if (full) {
-      bstore<DataMapper, Packetc, Index, accRows>(acc1, res, row + accColsC);
+      bstore<DataMapper, Packetc, accRows>(acc1, res, row + accColsC);
     }
   } else {
     bscalec<Packet,accRows,false>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
@@ -1911,7 +1915,7 @@
         res(row + 0, j) = pfirst<Packetc>(acc0.packet[j]);
       }
     } else {
-      bstore<DataMapper, Packetc, Index, accRows>(acc0, res, row + 0);
+      bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
       if (full) {
         for(Index j = 0; j < accRows; j++) {
           res(row + accColsC, j) = pfirst<Packetc>(acc1.packet[j]);
@@ -1922,9 +1926,9 @@
 }
 
 #define MICRO_COMPLEX_EXTRA_ROWS(N) \
-  gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask);
+  gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -1998,19 +2002,19 @@
 
 #define MICRO_COMPLEX_STORE_ONE(iter) \
   if (unroll_factor > iter) { \
-    const bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \
-    bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, accRows, full>(tRes, res, row + iter*accCols, 0); \
+    constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \
+    bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row + iter*accCols, 0); \
     bscalec<Packet,accRows,!(MICRO_NORMAL(iter))>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); \
     bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1); \
-    bstore<DataMapper, Packetc, Index, accRows>(acc0, res, row + iter*accCols + 0); \
+    bstore<DataMapper, Packetc, accRows>(acc0, res, row + iter*accCols + 0); \
     if (full) { \
-      bstore<DataMapper, Packetc, Index, accRows>(acc1, res, row + iter*accCols + accColsC); \
+      bstore<DataMapper, Packetc, accRows>(acc1, res, row + iter*accCols + accColsC); \
     } \
   }
 
 #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
 
-template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -2057,10 +2061,10 @@
 }
 
 #define MICRO_COMPLEX_UNROLL_ITER2(N, M) \
-  gemm_complex_unrolled_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
+  gemm_complex_unrolled_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
   if (M) return;
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemm_complex_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -2115,14 +2119,14 @@
 
   if(remaining_rows > 0)
   {
-    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
   }
 }
 
 #define MICRO_COMPLEX_EXTRA_COLS(N) \
-  gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, Index, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -2143,7 +2147,7 @@
   MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols-col, true)
 }
 
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
       const Index remaining_rows = rows % accCols;
@@ -2161,12 +2165,12 @@
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+        gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
       }
 
       if (col != cols)
       {
-        gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+        gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
       }
 }
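
A minimal standalone sketch (hypothetical names, not Eigen's actual helpers) of the column-blocking pattern used by `gemm`/`gemm_complex` above: full accRows-wide panels go through the main loop, and any leftover columns fall through to the "extra cols" path, exactly as the `col != cols` check does in the diff.

#include <cstddef>
#include <vector>

using Index = std::ptrdiff_t;  // mirrors Eigen's default Index

template <Index accRows>
void process_panels(Index cols, std::vector<Index>& panel_widths) {
  Index col = 0;
  for (; col + accRows <= cols; col += accRows) {
    panel_widths.push_back(accRows);     // full-width panel kernel
  }
  if (col != cols) {
    panel_widths.push_back(cols - col);  // remainder handled by the extra-cols path
  }
}

int main() {
  std::vector<Index> widths;
  process_panels<4>(10, widths);         // -> 4, 4, 2
  return (widths.size() == 3 && widths.back() == 2) ? 0 : 1;
}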
 
@@ -2189,7 +2193,7 @@
 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-    dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
+    dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
     pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2203,7 +2207,7 @@
 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-    dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
+    dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
     pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2218,7 +2222,7 @@
 void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
+  dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2232,7 +2236,7 @@
 void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
+  dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 #endif
@@ -2247,7 +2251,7 @@
 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
+  dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2261,7 +2265,7 @@
 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
+  dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2275,7 +2279,7 @@
 void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
+  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2289,7 +2293,7 @@
 void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
+  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2304,7 +2308,7 @@
 void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
+  dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2318,7 +2322,7 @@
 void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
+  dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 #endif
@@ -2333,7 +2337,7 @@
 void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
+  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2347,7 +2351,7 @@
 void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
+  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2361,7 +2365,7 @@
 void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
+  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2375,7 +2379,7 @@
 void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
+  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
@@ -2389,7 +2393,7 @@
 void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
   ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
+  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2403,7 +2407,7 @@
 void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
   ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
-  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
+  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
@@ -2431,16 +2435,16 @@
 
     #if defined(EIGEN_ALTIVEC_MMA_ONLY)
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      gemm_function = &Eigen::internal::gemmMMA<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
     #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+        gemm_function = &Eigen::internal::gemmMMA<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+        gemm_function = &Eigen::internal::gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      gemm_function = &Eigen::internal::gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
@@ -2470,16 +2474,16 @@
 
     #if defined(EIGEN_ALTIVEC_MMA_ONLY)
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
     #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+        gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+      gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
@@ -2508,16 +2512,16 @@
           Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
     #if defined(EIGEN_ALTIVEC_MMA_ONLY)
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+      gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
     #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+        gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+        gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+      gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
@@ -2546,16 +2550,16 @@
           Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
     #if defined(EIGEN_ALTIVEC_MMA_ONLY)
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
     #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+        gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+      gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
@@ -2583,16 +2587,16 @@
 
     #if defined(EIGEN_ALTIVEC_MMA_ONLY)
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      gemm_function = &Eigen::internal::gemmMMA<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
     #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+        gemm_function = &Eigen::internal::gemmMMA<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+        gemm_function = &Eigen::internal::gemm<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      gemm_function = &Eigen::internal::gemm<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
@@ -2621,16 +2625,16 @@
           Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #if defined(EIGEN_ALTIVEC_MMA_ONLY)
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
     #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+        gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+      gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
@@ -2659,16 +2663,16 @@
           Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #if defined(EIGEN_ALTIVEC_MMA_ONLY)
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
     #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+        gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+      gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
@@ -2697,16 +2701,16 @@
           Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
     #if defined(EIGEN_ALTIVEC_MMA_ONLY)
       //generate with MMA only
-      gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+      gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
     #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
-        gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+        gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
       }
       else{
-        gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+        gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
       }
     #else
-      gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+      gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
     #endif
     gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
   }
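
The recurring change in this file is dropping the `typename Index` template parameter in favor of Eigen's global `Index` typedef, which removes one argument from every instantiation (e.g. `gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>`). A minimal sketch of the same refactor under hypothetical names, not Eigen's API:

#include <cstddef>
#include <vector>

using Index = std::ptrdiff_t;  // stands in for Eigen's project-wide Index

// before: template<typename Scalar, typename Index, Index BlockRows> ...
// after : the index type is fixed, only the real compile-time knobs remain
template <typename Scalar, Index BlockRows>
Scalar block_sum(const std::vector<Scalar>& data, Index start) {
  Scalar acc = Scalar(0);
  for (Index i = 0; i < BlockRows; ++i) {
    acc += data[start + i];
  }
  return acc;
}

int main() {
  std::vector<float> v(16, 1.0f);
  // previously: block_sum<float, std::ptrdiff_t, 4>(v, 0);
  float s = block_sum<float, 4>(v, 0);  // one fewer template argument
  return s == 4.0f ? 0 : 1;
}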
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
index e68c595..70b95da 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -11,7 +11,7 @@
 
 namespace internal {
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
 EIGEN_ALWAYS_INLINE void gemm_extra_row(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -26,7 +26,7 @@
   const Packet& pAlpha,
   const Packet& pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename Scalar, typename Packet, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_STRONG_INLINE void gemm_extra_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -43,10 +43,10 @@
   const Packet& pAlpha,
   const Packet& pMask);
 
-template<typename Packet, typename Index>
+template<typename Packet>
 EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -62,7 +62,7 @@
   const Packet& pAlphaImag,
   const Packet& pMask);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -83,10 +83,10 @@
 template<typename Packet>
 EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs);
 
-template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N, bool full = true>
+template<typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full = true>
 EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col);
 
-template<typename DataMapper, typename Packet, typename Index, int N>
+template<typename DataMapper, typename Packet, int N>
 EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row);
 
 template<typename Packet, int N, bool mask>
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 1cb82ee..84ba115 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -39,30 +39,30 @@
   __builtin_mma_xxsetaccz(acc);
 }
 
-template<typename DataMapper, typename Index, typename Packet, const Index accCols, const Index accCols2>
+template<typename DataMapper, typename Packet, const Index accCols, const Index accCols2>
 EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc)
 {
   PacketBlock<Packet, 4> result;
   __builtin_mma_disassemble_acc(&result.packet, acc);
 
   PacketBlock<Packet, 4> tRes;
-  bload<DataMapper, Packet, Index, 0, ColMajor, false, 4>(tRes, data, i, 0);
+  bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
 
   bscale<Packet, 4, (accCols != accCols2)>(tRes, result, alpha, pMask);
 
-  bstore<DataMapper, Packet, Index, 4>(tRes, data, i);
+  bstore<DataMapper, Packet, 4>(tRes, data, i);
 }
 
-template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
+template<typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
 EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal, __vector_quad* accImag)
 {
-  const bool full = (accCols2 > accColsC);
+  constexpr bool full = (accCols2 > accColsC);
   PacketBlock<Packet, 4> resultReal, resultImag;
   __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
   __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
 
   PacketBlock<Packetc, 8> tRes;
-  bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, 4, full>(tRes, data, i, 0);
+  bload<DataMapper, Packetc, accColsC, ColMajor, true, 4, full>(tRes, data, i, 0);
 
   PacketBlock<Packet, 4> taccReal, taccImag;
   bscalec<Packet, 4, (accCols != accCols2)>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag, pMask);
@@ -70,9 +70,9 @@
   PacketBlock<Packetc, 4> acc1, acc2;
   bcouple<Packet, Packetc, 4, full>(taccReal, taccImag, tRes, acc1, acc2);
 
-  bstore<DataMapper, Packetc, Index, 4>(acc1, data, i);
+  bstore<DataMapper, Packetc, 4>(acc1, data, i);
   if (full) {
-    bstore<DataMapper, Packetc, Index, 4>(acc2, data, i + accColsC);
+    bstore<DataMapper, Packetc, 4>(acc2, data, i + accColsC);
   }
 }
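
A short sketch (hypothetical names, C++17) of why the accumulator-store helpers now declare `constexpr bool full = ...`: the flag is derived entirely from template parameters, so making it constexpr guarantees compile-time evaluation, lets it feed further template arguments, and lets the untaken branch be discarded.

#include <cstddef>

template <bool Full>
std::size_t stored_count(std::size_t n) {
  // Hypothetical stand-in for bstore-style logic: store one or two packet
  // blocks depending on the compile-time Full flag.
  if constexpr (Full) {
    return 2 * n;
  } else {
    return n;
  }
}

template <int accCols2, int accColsC>
std::size_t store_helper(std::size_t n) {
  constexpr bool full = (accCols2 > accColsC);  // resolved at compile time
  return stored_count<full>(n);                 // usable as a template argument
}

int main() {
  return (store_helper<4, 2>(3) == 6 && store_helper<2, 2>(3) == 3) ? 0 : 1;
}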
 
@@ -138,7 +138,7 @@
     reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
     reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
 #else
-  __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs));
+  rhsV = *reinterpret_cast<__vector_pair *>(const_cast<double *>(rhs));
 #endif
 }
 
@@ -147,6 +147,10 @@
   ploadRhsMMA(lhs, lhsV);
 }
 
+#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+#define VECTOR_PAIR_LOADS_LHS
+#endif
+
 // PEEL_MMA loop factor.
 #define PEEL_MMA 7
 
@@ -159,12 +163,13 @@
 
 #define MICRO_MMA_WORK_ONE(iter, type, peel) \
   if (unroll_factor > iter) { \
-    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV##iter); \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV[peel], lhsV##iter); \
   }
 
+#ifdef VECTOR_PAIR_LOADS_LHS
 #define MICRO_MMA_WORK_TWO(iter, type, peel) \
   if (unroll_factor > iter) { \
-    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV2##iter.packet[peel & 1]); \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV[peel], lhsV2##iter.packet[peel & 1]); \
   }
 
 #define MICRO_MMA_LOAD1_TWO(lhs_ptr, iter) \
@@ -185,50 +190,70 @@
   }
 
 #define MICRO_MMA_LOAD_TWO(iter) MICRO_MMA_LOAD1_TWO(lhs_ptr, iter)
+#endif
 
 #define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
   if (PEEL_MMA > peel) { \
     Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
-    ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV##peel); \
+    ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV[peel]); \
     MICRO_MMA_UNROLL(funcl) \
     MICRO_MMA_WORK(funcw, type, peel) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
   }
 
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
+  type rhsV[8]; \
+  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \
+  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \
+  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \
+  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,6) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,7)
+#else
 #define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
   if (PEEL_MMA > peel2) { \
     PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
     __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
-    ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV##peel1); \
-    ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV##peel2); \
+    if (sizeof(type) == 16) { \
+      ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr + (accRows * peel1)), prhsV##peel1); \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV[peel1]), &prhsV##peel1); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
+      ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV[peel1]); \
+      ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV[peel2]); \
+    } \
     MICRO_MMA_UNROLL(funcl2) \
     MICRO_MMA_WORK(funcw2, type, peel1) \
     MICRO_MMA_WORK(funcw2, type, peel2) \
   } else { \
+    EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
     MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
   }
 
 #define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
-  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \
+  type rhsV[8]; \
+  __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \
   MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
   MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \
   MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,4,5) \
   MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,6,7)
+#endif
 
 #define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
-  type rhsV0; \
+  type rhsV[1]; \
   MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0)
 
 #define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
   MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
   rhs_ptr += (accRows * size);
 
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
+#else
 #define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
- MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
+  MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
   rhs_ptr += (accRows * size);
 
 #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA)
+#endif
 
 #define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
 
@@ -247,12 +272,12 @@
 
 #define MICRO_MMA_STORE_ONE(iter) \
   if (unroll_factor > iter) { \
-    storeAccumulator<DataMapper, Index, Packet, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \
+    storeAccumulator<DataMapper, Packet, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \
   }
 
 #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
 
-template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index accCols2>
+template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
 EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -288,10 +313,10 @@
 }
 
 #define MICRO_MMA_UNROLL_ITER2(N, M) \
-  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \
+  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \
   if (M) return;
 
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
 EIGEN_ALWAYS_INLINE void gemmMMA_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -360,11 +385,11 @@
 
   if(remaining_rows > 0)
   {
-    gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+    gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
   }
 }
 
-template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
 void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
       const Index remaining_rows = rows % accCols;
@@ -380,12 +405,12 @@
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
+        gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
       }
 
       if (col != cols)
       {
-        gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
+        gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
       }
 }
 
@@ -403,12 +428,13 @@
 
 #define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
   if (unroll_factor > iter) { \
-    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV[peel], rhsVi[peel]); \
   }
 
+#ifdef VECTOR_PAIR_LOADS_LHS
 #define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel) \
   if (unroll_factor > iter) { \
-    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV##peel, rhsVi##peel); \
+    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV[peel], rhsVi[peel]); \
   }
 
 #define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter) \
@@ -428,54 +454,70 @@
   MICRO_MMA_LOAD1_TWO(lhs_ptr_real, iter)
 
 #define MICRO_COMPLEX_MMA_LOAD_TWO(iter) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter)
+#endif
 
 #define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
   if (PEEL_COMPLEX_MMA > peel) { \
     Packet lhsV0, lhsV1, lhsV2, lhsV3; \
     Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
-    ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \
+    ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV[peel]); \
     if(!RhsIsReal) { \
-      ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
-    } else { \
-      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+      ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi[peel]); \
     } \
     MICRO_COMPLEX_MMA_UNROLL(funcl) \
     MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
-    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
   }
 
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
+  type rhsV[4], rhsVi[4]; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3)
+#else
 #define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
   if (PEEL_COMPLEX_MMA > peel2) { \
     PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23; \
     PacketBlock<Packet,2> lhsVi20, lhsVi21, lhsVi22, lhsVi23; \
     __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
     __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
-    ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV##peel1); \
-    ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV##peel2); \
-    if(!RhsIsReal) { \
-      ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi##peel1); \
-      ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi##peel2); \
+    if (sizeof(type) == 16) { \
+      ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real + (accRows * peel1)), prhsV##peel1); \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV[peel1]), &prhsV##peel1); \
+      if(!RhsIsReal) { \
+        ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag + (accRows * peel1)), prhsVi##peel1); \
+        __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi[peel1]), &prhsVi##peel1); \
+      } else { \
+        EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
+      } \
     } else { \
-      EIGEN_UNUSED_VARIABLE(rhsVi##peel1); \
-      EIGEN_UNUSED_VARIABLE(rhsVi##peel2); \
+      EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
+      EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
+      ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV[peel1]); \
+      ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV[peel2]); \
+      if(!RhsIsReal) { \
+        ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi[peel1]); \
+        ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi[peel2]); \
+      } \
     } \
     MICRO_COMPLEX_MMA_UNROLL(funcl2) \
     MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
     MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2) \
   } else { \
+    EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
+    EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
     MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
   }
 
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
-  type rhsV0, rhsV1, rhsV2, rhsV3; \
-  type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \
+  type rhsV[4], rhsVi[4]; \
+  __vector_pair prhsV0, prhsV2; \
+  __vector_pair prhsVi0, prhsVi2; \
   MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
   MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3)
+#endif
 
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
-  type rhsV0, rhsVi0; \
+  type rhsV[1], rhsVi[1]; \
   MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0)
 
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
@@ -483,12 +525,16 @@
   rhs_ptr_real += (accRows * size); \
   if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
 
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
+#else
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
- MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
+  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
   rhs_ptr_real += (accRows * size); \
   if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
 
 #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
+#endif
 
 #define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
 
@@ -509,12 +555,12 @@
 
 #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
   if (unroll_factor > iter) { \
-    storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
+    storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
   }
 
 #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
 
-template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
   const DataMapper& res,
   const Scalar* lhs_base,
@@ -564,10 +610,10 @@
 }
 
 #define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \
-  gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
+  gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
   if (M) return;
 
-template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -622,11 +668,11 @@
 
   if(remaining_rows > 0)
   {
-    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
   }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
       const Index remaining_rows = rows % accCols;
@@ -646,12 +692,12 @@
       Index col = 0;
       for(; col + accRows <= cols; col += accRows)
       {
-        gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+        gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
       }
 
       if (col != cols)
       {
-        gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+        gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
       }
 }
 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
index 6ab4d0b..940a817 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
@@ -375,7 +375,7 @@
 }
 #endif
 
-template<typename Index, typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
+template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
 EIGEN_STRONG_INLINE void gemv_col(
     Index rows, Index cols,
     const LhsMapper& alhs,
@@ -927,7 +927,7 @@
     }
 }
 
-template<typename Index, typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData, Index ResPacketSize, Index iter2>
+template<typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData, Index ResPacketSize, Index iter2>
 EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res)
 {
     PResPacket c2 = pcplxflipconj(c0);
@@ -953,7 +953,7 @@
 }
 
 /** \internal load lhs packet */
-template<typename Scalar, typename LhsScalar, typename LhsMapper, typename LhsPacket, typename Index>
+template<typename Scalar, typename LhsScalar, typename LhsMapper, typename LhsPacket>
 EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j)
 {
     if (sizeof(Scalar) == sizeof(LhsScalar)) {
@@ -1337,17 +1337,17 @@
             result0.packet[0] = tmp0;
 
             if (ConjugateLhs) {
-                result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0])));
-                result0.packet[2] = convertReal(pconj2(convertComplex(result0.packet[2])));
+                result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+                result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
             } else if (ConjugateRhs) {
-                result0.packet[1] = convertReal(pconj2(convertComplex(result0.packet[1])));
-                result0.packet[3] = convertReal(pconj2(convertComplex(result0.packet[3])));
+                result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
+                result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
             } else {
-                result0.packet[1] = convertReal(pconjinv(convertComplex(result0.packet[1])));
-                result0.packet[3] = convertReal(pconjinv(convertComplex(result0.packet[3])));
-           }
-           result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
-           result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
+                result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
+                result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
+            }
+            result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+            result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
         } else {
             result0.packet[0][1] = result0.packet[1][1];
             result0.packet[2][1] = result0.packet[3][1];
@@ -1361,19 +1361,19 @@
     __builtin_mma_disassemble_acc(&result0.packet, c0);
     if (GEMV_IS_COMPLEX_COMPLEX) {
         if (ConjugateLhs) {
-            result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0])));
-            result0.packet[1] = convertReal(pcplxflip2(convertComplex(result0.packet[1])));
+            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+            result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
         } else {
             if (ConjugateRhs) {
-                result0.packet[1] = convertReal(pcplxconjflip(convertComplex(result0.packet[1])));
+                result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
             } else {
-                result0.packet[1] = convertReal(pcplxflipconj(convertComplex(result0.packet[1])));
+                result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
             }
         }
         result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
     } else if (sizeof(LhsPacket) == sizeof(std::complex<float>)) {
         if (ConjugateLhs) {
-            result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0])));
+            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
         }
     } else {
         result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]);
@@ -1394,7 +1394,7 @@
 #define GEMV_GETN_COMPLEX(N) (((N) * ResPacketSize) >> 1)
 
 #define GEMV_LOADPACKET_COL_COMPLEX(iter) \
-  loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket, Index>(lhs, i + ((iter) * ResPacketSize), j)
+  loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + ((iter) * ResPacketSize), j)
 
 #define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \
   convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter))
@@ -1444,7 +1444,7 @@
   }
 
 #define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \
-  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \
+  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1));
 
 #define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
   if (GEMV_GETN_COMPLEX(N) > iter1) { \
@@ -1498,7 +1498,7 @@
 #endif
 
 #define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \
-  disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter, result0##iter); \
+  disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter, result0##iter);
 
 #define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \
   if (GEMV_GETN_COMPLEX(N) > iter) { \
@@ -1520,13 +1520,13 @@
     c0##iter2 = PResPacket(result0##iter2.packet[0]); \
     if (GEMV_IS_COMPLEX_FLOAT) { \
       c0##iter3 = PResPacket(result0##iter3.packet[0]); \
-      pstoreu_pmadd_complex<Index, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>(c0##iter2, c0##iter3, alpha_data, res + i); \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>(c0##iter2, c0##iter3, alpha_data, res + i); \
     } else { \
       c0##iter3 = PResPacket(result0##iter2.packet[2]); \
-      pstoreu_pmadd_complex<Index, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
       c0##iter2 = PResPacket(result0##iter3.packet[0]); \
       c0##iter3 = PResPacket(result0##iter3.packet[2]); \
-      pstoreu_pmadd_complex<Index, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
     } \
   }
 
@@ -1607,7 +1607,7 @@
 #endif
 #endif
 
-template<typename Index, typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
+template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
 EIGEN_STRONG_INLINE void gemv_complex_col(
     Index rows, Index cols,
     const LhsMapper& alhs,
@@ -1725,10 +1725,6 @@
 template<typename ResScalar, typename ResPacket>
 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0, __vector_quad* acc1)
 {
-    union {
-      ScalarBlock<ResScalar, 2> cs;
-      double                    cd;
-    } cc0;
     PacketBlock<ResPacket, 4> result0, result1;
     __builtin_mma_disassemble_acc(&result0.packet, acc0);
     __builtin_mma_disassemble_acc(&result1.packet, acc1);
@@ -1737,20 +1733,17 @@
     result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]);
     result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3);
     result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3]));
-    cc0.cd = pfirst(reinterpret_cast<Packet2d>(result0.packet[0]));
-    return cc0.cs;
+    return *reinterpret_cast<ScalarBlock<ResScalar, 2> *>(&result0.packet[0]);
 }
 
 template<>
 EIGEN_ALWAYS_INLINE ScalarBlock<double, 2> predux_real<double, Packet2d>(__vector_quad* acc0, __vector_quad* acc1)
 {
-    ScalarBlock<double, 2> cc0;
     PacketBlock<Packet2d, 4> result0, result1;
     __builtin_mma_disassemble_acc(&result0.packet, acc0);
     __builtin_mma_disassemble_acc(&result1.packet, acc1);
-    cc0.scalar[0] = result0.packet[0][0] + result0.packet[1][1];
-    cc0.scalar[1] = result1.packet[0][0] + result1.packet[1][1];
-    return cc0;
+    result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1]));
+    return *reinterpret_cast<ScalarBlock<double, 2> *>(&result0.packet[0]);
 }
 
 /** \internal add complex results together */
@@ -1766,17 +1759,17 @@
         result0.packet[3] = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(result0.packet[3]), reinterpret_cast<Packet2d>(result1.packet[3])));
         result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]);
         if (ConjugateLhs) {
-            result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0])));
-            result0.packet[1] = convertReal(pcplxflip2(convertComplex(result0.packet[1])));
+            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+            result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
         } else if (ConjugateRhs) {
-            result0.packet[1] = convertReal(pcplxconjflip(convertComplex(result0.packet[1])));
+            result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
         } else {
-            result0.packet[1] = convertReal(pcplxflipconj(convertComplex(result0.packet[1])));
+            result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
         }
         result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
     } else {
         if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex<float>))) {
-            result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0])));
+            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
         }
     }
     cc0.scalar[0].real(result0.packet[0][0]);
@@ -1807,12 +1800,10 @@
 template<typename ResScalar, typename ResPacket>
 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0)
 {
-    ScalarBlock<ResScalar, 2> cc0;
     PacketBlock<ResPacket, 4> result0;
     __builtin_mma_disassemble_acc(&result0.packet, acc0);
-    cc0.scalar[0] = result0.packet[0][0] + result0.packet[1][1];
-    cc0.scalar[1] = result0.packet[2][0] + result0.packet[3][1];
-    return cc0;
+    result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3]));
+    return *reinterpret_cast<ScalarBlock<ResScalar, 2> *>(&result0.packet[0]);
 }
 
 template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
@@ -1823,25 +1814,25 @@
     __builtin_mma_disassemble_acc(&result0.packet, acc0);
     if (GEMV_IS_COMPLEX_COMPLEX) {
         if (ConjugateLhs) {
-            result0.packet[1] = convertReal(pconjinv(convertComplex(result0.packet[1])));
-            result0.packet[3] = convertReal(pconjinv(convertComplex(result0.packet[3])));
+            result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
+            result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
         } else if (ConjugateRhs) {
-            result0.packet[0] = convertReal(pconj2(convertComplex(result0.packet[0])));
-            result0.packet[2] = convertReal(pconj2(convertComplex(result0.packet[2])));
+            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+            result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
         } else {
-            result0.packet[1] = convertReal(pconj2(convertComplex(result0.packet[1])));
-            result0.packet[3] = convertReal(pconj2(convertComplex(result0.packet[3])));
+            result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
+            result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
         }
-        cc0.scalar[0].real(result0.packet[0][0] + result0.packet[1][1]);
-        cc0.scalar[0].imag(result0.packet[0][1] + result0.packet[1][0]);
-        cc0.scalar[1].real(result0.packet[2][0] + result0.packet[3][1]);
-        cc0.scalar[1].imag(result0.packet[2][1] + result0.packet[3][0]);
+        result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2));
+        result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2));
     } else {
-        cc0.scalar[0].real(result0.packet[0][0]);
-        cc0.scalar[0].imag(result0.packet[1][1]);
-        cc0.scalar[1].real(result0.packet[2][0]);
-        cc0.scalar[1].imag(result0.packet[3][1]);
+        result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1);
+        result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1);
     }
+    cc0.scalar[0].real(result0.packet[0][0]);
+    cc0.scalar[0].imag(result0.packet[0][1]);
+    cc0.scalar[1].real(result0.packet[2][0]);
+    cc0.scalar[1].imag(result0.packet[2][1]);
     return cc0;
 }
 #endif
@@ -1946,18 +1937,18 @@
     GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \
     Index j = 0; \
     for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
-      RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j, 0); \
+      RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j); \
       GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \
     } \
     GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \
     for (; j < cols; ++j) { \
-      RhsScalar a0 = rhs2(j, 0); \
+      RhsScalar a0 = rhs2(j); \
       GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \
     } \
     GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \
   }
 
-template<typename Index, typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
+template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
 EIGEN_STRONG_INLINE void gemv_row(
     Index rows, Index cols,
     const LhsMapper& alhs,
@@ -1974,7 +1965,7 @@
     // The following copy tells the compiler that lhs's attributes are not modified outside this function
     // This helps GCC to generate proper code.
     LhsMapper lhs(alhs);
-    RhsMapper rhs2(rhs);
+    typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
 
     eigen_internal_assert(rhs.stride() == 1);
     conj_helper<LhsScalar, RhsScalar, false, false> cj;
@@ -2015,14 +2006,14 @@
         Index j = 0;
         for (; j + LhsPacketSize <= cols; j += LhsPacketSize)
         {
-            RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j, 0);
+            RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j);
 
             d0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, d0);
         }
         ResScalar dd0 = predux(d0);
         for (; j < cols; ++j)
         {
-            dd0 += cj.pmul(lhs(i, j), rhs2(j, 0));
+            dd0 += cj.pmul(lhs(i, j), rhs2(j));
         }
         res[i * resIncr] += alpha * dd0;
     }
@@ -2040,7 +2031,7 @@
         const RhsMapper& rhs, \
         ResScalar* res, Index resIncr, \
         ResScalar alpha) { \
-        gemv_col<Index, Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
+        gemv_col<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
     } \
 };
 
@@ -2056,7 +2047,7 @@
         const RhsMapper& rhs, \
         ResScalar* res, Index resIncr, \
         ResScalar alpha) { \
-        gemv_row<Index, Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
+        gemv_row<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
     } \
 };
 
@@ -2076,7 +2067,7 @@
 }
 
 #define GEMV_LOADPACKET_ROW_COMPLEX(iter) \
-  loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket, Index>(lhs, i + (iter), j)
+  loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + (iter), j)
 
 #define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \
   convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter))
@@ -2084,14 +2075,14 @@
 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \
   j = 0; \
   for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
-    const RhsScalar& b1 = rhs2(j, 0); \
+    const RhsScalar& b1 = rhs2(j); \
     RhsScalar* b = const_cast<RhsScalar *>(&b1); \
     GEMV_UNROLL_ROW(which, N) \
   }
 
 #define GEMV_PROCESS_END_ROW_COMPLEX(N) \
   for (; j < cols; ++j) { \
-    RhsScalar b0 = rhs2(j, 0); \
+    RhsScalar b0 = rhs2(j); \
     GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \
   } \
   GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1))
@@ -2225,7 +2216,7 @@
   GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \
   j = 0; \
   for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
-    RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j, 0); \
+    RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j); \
     GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \
   }
 
@@ -2276,7 +2267,7 @@
   GEMV_PROCESS_ROW_COMPLEX_ONE(N)
 #endif
 
-template<typename Index, typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
+template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
 EIGEN_STRONG_INLINE void gemv_complex_row(
     Index rows, Index cols,
     const LhsMapper& alhs,
@@ -2298,7 +2289,7 @@
     // The following copy tells the compiler that lhs's attributes are not modified outside this function
     // This helps GCC to generate proper code.
     LhsMapper lhs(alhs);
-    RhsMapper rhs2(rhs);
+    typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
 
     eigen_internal_assert(rhs.stride() == 1);
     conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
@@ -2349,7 +2340,7 @@
         GEMV_PROCESS_ROW_COMPLEX_PREDUX(0)
         for (; j < cols; ++j)
         {
-            dd0 += cj.pmul(lhs(i, j), rhs2(j, 0));
+            dd0 += cj.pmul(lhs(i, j), rhs2(j));
         }
         res[i * resIncr] += alpha * dd0;
     }
@@ -2367,7 +2358,7 @@
         const RhsMapper& rhs, \
         ResScalar* res, Index resIncr, \
         ResScalar alpha) { \
-        gemv_complex_col<Index, Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
+        gemv_complex_col<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
     } \
 };
 
@@ -2383,7 +2374,7 @@
         const RhsMapper& rhs, \
         ResScalar* res, Index resIncr, \
         ResScalar alpha) { \
-        gemv_complex_row<Index, Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
+        gemv_complex_row<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
     } \
 };
 
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
old mode 100755
new mode 100644
index 945f36b..91b3e20
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -174,9 +174,9 @@
 #else
     HasSqrt = 0,
     HasRsqrt = 0,
+#endif
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-#endif
     HasRound = 1,
     HasFloor = 1,
     HasCeil = 1,
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
old mode 100755
new mode 100644
index 35490a6..e896040
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -285,6 +285,10 @@
 
 template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
 
+template<typename Packet> EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
+template<> EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ss(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_sd(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
@@ -370,6 +374,10 @@
 template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmadd_pd(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmsub_ps(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmsub_pd(a,b,c); }
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
+template<> EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ss(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_sd(a,b,c); }
 #endif
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -746,6 +754,15 @@
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
 
+// Load lower part of packet zero extending.
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
+template<> EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))); }
+template<> EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); }
+
+// Load scalar
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
+template<> EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); }
 
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
 {
@@ -787,6 +804,14 @@
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
 
+template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
+template<> EIGEN_STRONG_INLINE void pstorel(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstorel(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from); }
+
+template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
+template<> EIGEN_STRONG_INLINE void pstores(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from); }
+template<> EIGEN_STRONG_INLINE void pstores(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from); }
+
 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
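
[Editor's note, not part of the patch] A minimal sketch of the new scalar/low-lane helpers added above (ploads, padds, pstores), assuming an SSE2-enabled x86 build of this Eigen revision where Packet4f is __m128. These live in Eigen::internal and are not a stable public API:

```cpp
// Sketch only: ploads loads a single scalar into lane 0 (upper lanes zeroed),
// padds adds lane 0 only, and pstores writes lane 0 back to memory.
#include <Eigen/Core>
#include <iostream>

int main() {
  using namespace Eigen::internal;
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out = 0.f;

  Packet4f a = ploads<Packet4f>(in);       // {1, 0, 0, 0}
  Packet4f b = ploads<Packet4f>(in + 1);   // {2, 0, 0, 0}
  pstores(&out, padds(a, b));              // out == 3.f

  std::cout << out << "\n";
  return 0;
}
```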
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index c21d1ac..a6346ea 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -71,6 +71,14 @@
   return _mm_cvtps_pd(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4f>(const Packet4f& a) {
+  return _mm_castps_pd(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet2d>(const Packet2d& a) {
+  return _mm_castpd_ps(a);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
   return _mm_castps_si128(a);
 }
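
[Editor's note, not part of the patch] The two new preinterpret specializations above are pure bit-casts between float and double packets. A minimal sketch, under the same SSE2 assumptions as the previous note:

```cpp
// Sketch only: round-tripping 128 bits through preinterpret changes the type,
// not the bit pattern.
#include <Eigen/Core>

int main() {
  using namespace Eigen::internal;
  Packet2d d = pset1<Packet2d>(1.0);                 // two doubles equal to 1.0
  Packet4f f = preinterpret<Packet4f, Packet2d>(d);  // same bits viewed as four floats
  Packet2d back = preinterpret<Packet2d, Packet4f>(f);
  return pfirst(back) == 1.0 ? 0 : 1;                // still exactly 1.0
}
```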
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 98c0b77..9c106b3 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -151,7 +151,7 @@
 template <>
 EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b)
 {
-  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+  return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu);
 }
 
 template <>
@@ -211,13 +211,13 @@
 template <int N>
 EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
 {
-  return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N)));
+  return svreinterpret_s32_u32(svlsr_n_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), N));
 }
 
 template <int N>
 EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
 {
-  return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N));
+  return svlsl_n_s32_z(svptrue_b32(), a, N);
 }
 
 template <>
@@ -525,7 +525,7 @@
 template <>
 EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
 {
-  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 5262428..38bddac 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -287,7 +287,6 @@
     };
     typedef std::conditional_t<Transpose,RhsScalar_,LhsScalar_> LhsScalar;
     typedef std::conditional_t<Transpose,LhsScalar_,RhsScalar_> RhsScalar;
-    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
     enum {
       SizeA = ActualRows * MaxDepth,
       SizeB = ActualCols * MaxDepth
@@ -336,7 +335,6 @@
     };
     typedef std::conditional_t<Transpose,RhsScalar_,LhsScalar_> LhsScalar;
     typedef std::conditional_t<Transpose,LhsScalar_,RhsScalar_> RhsScalar;
-    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
     Index m_sizeA;
     Index m_sizeB;
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index def6a28..b148d9c 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -18,24 +18,27 @@
 namespace internal {
 
 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
-struct trsm_kernels {
+struct trsmKernelL {
   // Generic Implementation of triangular solve for triangular matrix on left and multiple rhs.
   // Handles non-packed matrices.
-  static void trsmKernelL(
-    Index size, Index otherSize,
-    const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherIncr, Index otherStride);
-
-  // Generic Implementation of triangular solve for triangular matrix on right and multiple lhs.
-  // Handles non-packed matrices.
-  static void trsmKernelR(
+  static void kernel(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
     Scalar* _other, Index otherIncr, Index otherStride);
 };
 
 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
-EIGEN_STRONG_INLINE void trsm_kernels<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::trsmKernelL(
+struct trsmKernelR {
+  // Generic Implementation of triangular solve for triangular matrix on right and multiple lhs.
+  // Handles non-packed matrices.
+  static void kernel(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride);
+};
+
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
+EIGEN_STRONG_INLINE void trsmKernelL<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::kernel(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
     Scalar* _other, Index otherIncr, Index otherStride)
@@ -86,7 +89,7 @@
 
 
 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
-EIGEN_STRONG_INLINE void trsm_kernels<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::trsmKernelR(
+EIGEN_STRONG_INLINE void trsmKernelR<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::kernel(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
     Scalar* _other, Index otherIncr, Index otherStride)
@@ -168,7 +171,7 @@
     std::ptrdiff_t l1, l2, l3;
     manage_caching_sizes(GetAction, &l1, &l2, &l3);
 
-#if defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS)
+#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_L_KERNELS && EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS
     EIGEN_IF_CONSTEXPR( (OtherInnerStride == 1 &&
                        (std::is_same<Scalar,float>::value ||
                         std::is_same<Scalar,double>::value)) ) {
@@ -177,7 +180,7 @@
       // TODO: Investigate better heuristics for cutoffs.
       double L2Cap = 0.5; // 50% of L2 size
       if (size < avx512_trsm_cutoff<Scalar>(l2, cols, L2Cap)) {
-        trsm_kernels<Scalar, Index, Mode, Conjugate, TriStorageOrder, 1>::trsmKernelL(
+        trsmKernelL<Scalar, Index, Mode, Conjugate, TriStorageOrder, 1>::kernel(
           size, cols, _tri, triStride, _other, 1, otherStride);
         return;
       }
@@ -243,14 +246,14 @@
           // tr solve
           {
             Index i  = IsLower ? k2+k1 : k2-k1;
-#if defined(EIGEN_USE_AVX512_TRSM_KERNELS)
+#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_L_KERNELS
             EIGEN_IF_CONSTEXPR( (OtherInnerStride == 1 &&
                                  (std::is_same<Scalar,float>::value ||
                                   std::is_same<Scalar,double>::value)) ) {
               i  = IsLower ? k2 + k1: k2 - k1 - actualPanelWidth;
             }
 #endif
-            trsm_kernels<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::trsmKernelL(
+            trsmKernelL<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::kernel(
               actualPanelWidth, actual_cols,
               _tri + i + (i)*triStride, triStride,
               _other + i*OtherInnerStride + j2*otherStride, otherIncr, otherStride);
@@ -315,7 +318,7 @@
   {
     Index rows = otherSize;
 
-#if defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS)
+#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_R_KERNELS && EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS
     EIGEN_IF_CONSTEXPR( (OtherInnerStride == 1 &&
                  (std::is_same<Scalar,float>::value ||
                   std::is_same<Scalar,double>::value)) ) {
@@ -324,8 +327,8 @@
       manage_caching_sizes(GetAction, &l1, &l2, &l3);
       double L2Cap = 0.5; // 50% of L2 size
       if (size < avx512_trsm_cutoff<Scalar>(l2, rows, L2Cap)) {
-        trsm_kernels<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::
-          trsmKernelR(size, rows, _tri, triStride, _other, 1, otherStride);
+        trsmKernelR<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::
+          kernel(size, rows, _tri, triStride, _other, 1, otherStride);
         return;
       }
     }
@@ -420,8 +423,8 @@
 
             {
               // unblocked triangular solve
-              trsm_kernels<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::
-                trsmKernelR(actualPanelWidth, actual_mc,
+              trsmKernelR<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride>::
+                kernel(actualPanelWidth, actual_mc,
                             _tri + absolute_j2 + absolute_j2*triStride, triStride,
                             _other + i2*OtherInnerStride + absolute_j2*otherStride, otherIncr, otherStride);
             }
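
[Editor's note, not part of the patch] The trsmKernelL/trsmKernelR structs above (split out of the former trsm_kernels) sit behind Eigen's dense triangular solves; the AVX512 specializations are only selected when the corresponding EIGEN_USE_AVX512_TRSM_* macros are active. A minimal sketch of the public entry point that reaches them:

```cpp
// Sketch only: a lower-triangular solve with multiple right-hand sides, which
// goes through triangular_solve_matrix and hence the kernel structs above.
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd L = Eigen::MatrixXd::Random(64, 64).cwiseAbs();
  L.diagonal().array() += 64.0;                       // keep the system well conditioned
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 8);

  Eigen::MatrixXd X = B;
  L.triangularView<Eigen::Lower>().solveInPlace(X);   // solves L * X = B in place

  std::cout << (L.triangularView<Eigen::Lower>() * X - B).norm() << "\n";
  return 0;
}
```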
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index 57ade28..b8fbb5b 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -78,7 +78,7 @@
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
 
-        if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        if((!(Mode & UnitDiag)) && !is_identically_zero(rhs[i]))
           rhs[i] /= cjLhs(i,i);
       }
     }
@@ -115,7 +115,7 @@
       for(Index k=0; k<actualPanelWidth; ++k)
       {
         Index i = IsLower ? pi+k : pi-k-1;
-        if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        if(!is_identically_zero(rhs[i]))
         {
           if(!(Mode & UnitDiag))
             rhs[i] /= cjLhs.coeff(i,i);
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
old mode 100755
new mode 100644
index f45665e..e2eef19
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -229,6 +229,7 @@
   }
 
   EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC const Index incr() const { return 1; }
   EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
 
   EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
@@ -402,6 +403,10 @@
     storePacketBlock_helper<SubPacket, Scalar, n, n-1> spb;
     spb.store(this, i,j,block);
   }
+
+  EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC const Index incr() const { return m_incr.value(); }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
 protected:
   Scalar* EIGEN_RESTRICT m_data;
   const Index m_stride;
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 16ed585..4457f4f 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -430,7 +430,9 @@
   /** Use a QR decomposition without pivoting as the first step. */
   HouseholderQRPreconditioner = 0x80,
   /** Use a QR decomposition with full pivoting as the first step. */
-  FullPivHouseholderQRPreconditioner = 0xC0
+  FullPivHouseholderQRPreconditioner = 0xC0,
+  /** Used to disable the QR Preconditioner in BDCSVD. */
+  DisableQRDecomposition = NoQRPreconditioner
 };
 
 #ifdef Success
@@ -549,7 +551,7 @@
 /** \internal
  * Constants for comparison functors
  */
-enum ComparisonName {
+enum ComparisonName : unsigned int {
   cmp_EQ = 0,
   cmp_LT = 1,
   cmp_LE = 2,
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
old mode 100755
new mode 100644
index 8d5c2c3..32152ac
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -441,6 +441,17 @@
 } // end namespace numext
 
 namespace internal {
+
+template<typename Scalar>
+struct is_identically_zero_impl {
+  static inline bool run(const Scalar& s) {
+    return numext::is_exactly_zero(s);
+  }
+};
+
+template<typename Scalar> EIGEN_STRONG_INLINE
+bool is_identically_zero(const Scalar& s) { return is_identically_zero_impl<Scalar>::run(s); }
+
 /// \internal Returns true if its argument is of integer or enum type.
 /// FIXME this has the same purpose as `is_valid_index_type` in XprHelper.h
 template<typename A>
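
[Editor's note, not part of the patch] is_identically_zero forwards to numext::is_exactly_zero by default, but the is_identically_zero_impl indirection leaves room for custom scalar types to override the check (it is what the triangular-solver change above now calls). A minimal sketch of the default behaviour:

```cpp
// Sketch only: the internal helper's default behaviour for plain doubles.
#include <Eigen/Core>
#include <iostream>

int main() {
  using Eigen::internal::is_identically_zero;
  std::cout << std::boolalpha
            << is_identically_zero(0.0) << " "       // true: bit-exact zero
            << is_identically_zero(1e-300) << "\n";  // false: tiny but nonzero
  return 0;
}
```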
diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index bf3f456..855b752 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -71,7 +71,7 @@
   Scalar& tau,
   RealScalar& beta) const
 {
-  using std::sqrt;
+  using numext::sqrt;
   using numext::conj;
   
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart)
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 5415f85..f9e0c62 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -76,9 +76,11 @@
  * \tparam MatrixType_ the type of the matrix of which we are computing the SVD decomposition
  *
  * \tparam Options_ this optional parameter allows one to specify options for computing unitaries \a U and \a V.
- *                  Possible values are #ComputeThinU, #ComputeThinV, #ComputeFullU, #ComputeFullV.
- *                  It is not possible to request both the thin and full version of \a U or \a V.
- *                  By default, unitaries are not computed.
+ *                  Possible values are #ComputeThinU, #ComputeThinV, #ComputeFullU, #ComputeFullV, and
+ *                  #DisableQRDecomposition. It is not possible to request both the thin and full version of \a U or
+ *                  \a V. By default, unitaries are not computed. BDCSVD uses R-Bidiagonalization to improve
+ *                  performance on tall or wide matrices. For backwards compatibility, the option
+ *                  #DisableQRDecomposition can be used to disable this optimization.
  *
  * This class first reduces the input matrix to bi-diagonal form using class UpperBidiagonalization,
  * and then performs a divide-and-conquer diagonalization. Small blocks are diagonalized using class JacobiSVD.
@@ -110,6 +112,8 @@
   typedef typename Base::Index Index;
   enum {
     Options = Options_,
+    QRDecomposition = Options & internal::QRPreconditionerBits,
+    ComputationOptions = Options & internal::ComputationOptionsBits,
     RowsAtCompileTime = Base::RowsAtCompileTime,
     ColsAtCompileTime = Base::ColsAtCompileTime,
     DiagSizeAtCompileTime = Base::DiagSizeAtCompileTime,
@@ -251,8 +255,12 @@
   ArrayXr m_workspace;
   ArrayXi m_workspaceI;
   int m_algoswap;
-  bool m_isTranspose, m_compU, m_compV;
-  JacobiSVD<MatrixType, Options> smallSvd;
+  bool m_isTranspose, m_compU, m_compV, m_useQrDecomp;
+  JacobiSVD<MatrixType, ComputationOptions> smallSvd;
+  HouseholderQR<MatrixX> qrDecomp;
+  internal::UpperBidiagonalization<MatrixX> bid;
+  MatrixX copyWorkspace;
+  MatrixX reducedTriangle;
 
   using Base::m_computationOptions;
   using Base::m_computeThinU;
@@ -276,7 +284,7 @@
     return;
 
   if (cols < m_algoswap)
-    internal::allocate_small_svd<MatrixType, Options>::run(smallSvd, rows, cols, computationOptions);
+    internal::allocate_small_svd<MatrixType, ComputationOptions>::run(smallSvd, rows, cols, computationOptions);
 
   m_computed = MatrixXr::Zero(m_diagSize + 1, m_diagSize );
   m_compU = computeV();
@@ -285,6 +293,22 @@
   if (m_isTranspose)
     std::swap(m_compU, m_compV);
 
+  // kMinAspectRatio is the crossover point that determines if we perform R-Bidiagonalization
+  // or bidiagonalize the input matrix directly.
+  // It is based on LAPACK's dgesdd routine, which uses 11.0/6.0;
+  // we use a larger scalar to prevent a regression for relatively square matrices.
+  constexpr Index kMinAspectRatio = 4;
+  constexpr bool disableQrDecomp = static_cast<int>(QRDecomposition) == static_cast<int>(DisableQRDecomposition);
+  m_useQrDecomp = !disableQrDecomp && ((rows / kMinAspectRatio > cols) || (cols / kMinAspectRatio > rows));
+  if (m_useQrDecomp) {
+    qrDecomp = HouseholderQR<MatrixX>((std::max)(rows, cols), (std::min)(rows, cols));
+    reducedTriangle = MatrixX(m_diagSize, m_diagSize);
+  }
+
+  copyWorkspace = MatrixX(m_isTranspose ? cols : rows, m_isTranspose ? rows : cols);
+  bid = internal::UpperBidiagonalization<MatrixX>(m_useQrDecomp ? m_diagSize : copyWorkspace.rows(),
+                                                  m_useQrDecomp ? m_diagSize : copyWorkspace.cols());
+
   if (m_compU) m_naiveU = MatrixXr::Zero(m_diagSize + 1, m_diagSize + 1 );
   else         m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 );
 
@@ -330,13 +354,22 @@
   }
 
   if(numext::is_exactly_zero(scale)) scale = Literal(1);
-  MatrixX copy;
-  if (m_isTranspose) copy = matrix.adjoint()/scale;
-  else               copy = matrix/scale;
 
-  //**** step 1 - Bidiagonalization
-  // FIXME this line involves temporaries
-  internal::UpperBidiagonalization<MatrixX> bid(copy);
+  if (m_isTranspose) copyWorkspace = matrix.adjoint() / scale;
+  else copyWorkspace = matrix / scale;
+
+  //**** step 1 - Bidiagonalization.
+  // If the problem is sufficiently rectangular, we perform R-Bidiagonalization: compute A = Q(R/0)
+  // and then bidiagonalize R. Otherwise, if the problem is relatively square, we
+  // bidiagonalize the input matrix directly.
+  if (m_useQrDecomp) {
+    qrDecomp.compute(copyWorkspace);
+    reducedTriangle = qrDecomp.matrixQR().topRows(m_diagSize);
+    reducedTriangle.template triangularView<StrictlyLower>().setZero();
+    bid.compute(reducedTriangle);
+  } else {
+    bid.compute(copyWorkspace);
+  }
 
   //**** step 2 - Divide & Conquer
   m_naiveU.setZero();
@@ -368,13 +401,15 @@
     }
   }
 
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-//   std::cout << "m_naiveU\n" << m_naiveU << "\n\n";
-//   std::cout << "m_naiveV\n" << m_naiveV << "\n\n";
-#endif
+  //**** step 4 - Finalize unitaries U and V
   if(m_isTranspose) copyUV(bid.householderV(), bid.householderU(), m_naiveV, m_naiveU);
   else              copyUV(bid.householderU(), bid.householderV(), m_naiveU, m_naiveV);
 
+  if (m_useQrDecomp) {
+    if (m_isTranspose && computeV()) m_matrixV.applyOnTheLeft(qrDecomp.householderQ());
+    else if (!m_isTranspose && computeU()) m_matrixU.applyOnTheLeft(qrDecomp.householderQ());
+  }
+
   m_isInitialized = true;
   return *this;
 }  // end compute
@@ -386,17 +421,21 @@
   // Note exchange of U and V: m_matrixU is set from m_naiveV and vice versa
   if (computeU())
   {
-    Index Ucols = m_computeThinU ? m_diagSize : householderU.cols();
-    m_matrixU = MatrixX::Identity(householderU.cols(), Ucols);
+    Index Ucols = m_computeThinU ? m_diagSize : rows();
+    m_matrixU = MatrixX::Identity(rows(), Ucols);
     m_matrixU.topLeftCorner(m_diagSize, m_diagSize) = naiveV.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
-    householderU.applyThisOnTheLeft(m_matrixU); // FIXME this line involves a temporary buffer
+    // FIXME the following conditionals involve temporary buffers
+    if (m_useQrDecomp) m_matrixU.topLeftCorner(householderU.cols(), m_diagSize).applyOnTheLeft(householderU);
+    else m_matrixU.applyOnTheLeft(householderU);
   }
   if (computeV())
   {
-    Index Vcols = m_computeThinV ? m_diagSize : householderV.cols();
-    m_matrixV = MatrixX::Identity(householderV.cols(), Vcols);
+    Index Vcols = m_computeThinV ? m_diagSize : cols();
+    m_matrixV = MatrixX::Identity(cols(), Vcols);
     m_matrixV.topLeftCorner(m_diagSize, m_diagSize) = naiveU.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
-    householderV.applyThisOnTheLeft(m_matrixV); // FIXME this line involves a temporary buffer
+    // FIXME the following conditionals involve temporary buffers
+    if (m_useQrDecomp) m_matrixV.topLeftCorner(householderV.cols(), m_diagSize).applyOnTheLeft(householderV);
+    else m_matrixV.applyOnTheLeft(householderV);
   }
 }
 
@@ -1044,8 +1083,8 @@
     // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
     // (deflation is supposed to avoid this from happening)
     // - this does no seem to be necessary anymore -
-//     if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
-//     if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
+    // if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
+    // if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
   }
 }
 
@@ -1097,7 +1136,14 @@
             std::cout << "  " << "j=" << j << "\n";
           }
 #endif
-          Index j = i<k ? i : perm(l-1);
+          // Avoid index out of bounds.
+          // Will end up setting zhat(k) = 0.
+          if (i >= k && l == 0) {
+            m_info = NumericalIssue;
+            prod = 0;
+            break;
+          }
+          Index j = i<k ? i : l > 0 ? perm(l-1) : i;
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
           if(!(dk!=Literal(0) || diag(i)!=Literal(0)))
           {
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index 7aac931..e6c9097 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -53,7 +53,7 @@
     * The default constructor is useful in cases in which the user intends to
     * perform decompositions via Bidiagonalization::compute(const MatrixType&).
     */
-    UpperBidiagonalization() : m_householder(), m_bidiagonal(), m_isInitialized(false) {}
+    UpperBidiagonalization() : m_householder(), m_bidiagonal(0, 0), m_isInitialized(false) {}
 
     explicit UpperBidiagonalization(const MatrixType& matrix)
       : m_householder(matrix.rows(), matrix.cols()),
@@ -62,7 +62,13 @@
     {
       compute(matrix);
     }
-    
+
+    UpperBidiagonalization(Index rows, Index cols)
+      : m_householder(rows, cols),
+        m_bidiagonal(cols, cols),
+        m_isInitialized(false)
+    {}
+
     UpperBidiagonalization& compute(const MatrixType& matrix);
     UpperBidiagonalization& computeUnblocked(const MatrixType& matrix);
     
diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
old mode 100755
new mode 100644
diff --git a/doc/TutorialMatrixClass.dox b/doc/TutorialMatrixClass.dox
index e489f6d..e4e4f98 100644
--- a/doc/TutorialMatrixClass.dox
+++ b/doc/TutorialMatrixClass.dox
@@ -111,9 +111,9 @@
 
 If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized by passing an arbitrary number of coefficients:
 \code
-Vector2i a(1, 2);                      // A column vector containing the elements {1, 2}
-Matrix<int, 5, 1> b {1, 2, 3, 4, 5};   // A row-vector containing the elements {1, 2, 3, 4, 5}
-Matrix<int, 1, 5> c = {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5}
+Vector2i a(1, 2);                      // A column-vector containing the elements {1, 2}
+Matrix<int, 5, 1> b {1, 2, 3, 4, 5};   // A column-vector containing the elements {1, 2, 3, 4, 5}
+Matrix<int, 1, 5> c = {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5}
 \endcode
 
 In the general case of matrices and vectors with either fixed or runtime sizes,
diff --git a/test/accelerate_support.cpp b/test/accelerate_support.cpp
index e04895b..ac4be61 100644
--- a/test/accelerate_support.cpp
+++ b/test/accelerate_support.cpp
@@ -152,21 +152,21 @@
 {
   typedef SparseMatrix<Scalar> MatrixType; 
 
-  test_accelerate_ldlt<Scalar, AccelerateLDLT<MatrixType, Lower | Symmetric> >();
-  test_accelerate_ldlt<Scalar, AccelerateLDLTUnpivoted<MatrixType, Lower | Symmetric> >();
-  test_accelerate_ldlt<Scalar, AccelerateLDLTSBK<MatrixType, Lower | Symmetric> >();
-  test_accelerate_ldlt<Scalar, AccelerateLDLTTPP<MatrixType, Lower | Symmetric> >();
+  test_accelerate_ldlt<Scalar, AccelerateLDLT<MatrixType, Lower> >();
+  test_accelerate_ldlt<Scalar, AccelerateLDLTUnpivoted<MatrixType, Lower> >();
+  test_accelerate_ldlt<Scalar, AccelerateLDLTSBK<MatrixType, Lower> >();
+  test_accelerate_ldlt<Scalar, AccelerateLDLTTPP<MatrixType, Lower> >();
 
-  test_accelerate_ldlt<Scalar, AccelerateLDLT<MatrixType, Upper | Symmetric> >();
-  test_accelerate_ldlt<Scalar, AccelerateLDLTUnpivoted<MatrixType, Upper | Symmetric> >();
-  test_accelerate_ldlt<Scalar, AccelerateLDLTSBK<MatrixType, Upper | Symmetric> >();
-  test_accelerate_ldlt<Scalar, AccelerateLDLTTPP<MatrixType, Upper | Symmetric> >();
+  test_accelerate_ldlt<Scalar, AccelerateLDLT<MatrixType, Upper> >();
+  test_accelerate_ldlt<Scalar, AccelerateLDLTUnpivoted<MatrixType, Upper> >();
+  test_accelerate_ldlt<Scalar, AccelerateLDLTSBK<MatrixType, Upper> >();
+  test_accelerate_ldlt<Scalar, AccelerateLDLTTPP<MatrixType, Upper> >();
 
-  test_accelerate_llt<Scalar, AccelerateLLT<MatrixType, Lower | Symmetric> >();
+  test_accelerate_llt<Scalar, AccelerateLLT<MatrixType, Lower> >();
 
-  test_accelerate_llt<Scalar, AccelerateLLT<MatrixType, Upper | Symmetric> >();
+  test_accelerate_llt<Scalar, AccelerateLLT<MatrixType, Upper> >();
 
-  test_accelerate_qr<Scalar, AccelerateQR<MatrixType, 0> >();
+  test_accelerate_qr<Scalar, AccelerateQR<MatrixType> >();
 }
 
 EIGEN_DECLARE_TEST(accelerate_support)
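
[Editor's note, not part of the patch] The test changes above reflect that the Accelerate wrappers no longer require OR-ing Symmetric into the UpLo template argument. A minimal sketch, assuming an Apple platform with the Accelerate framework available (solver names as used in the test above):

```cpp
// Sketch only: AccelerateLLT with just Lower, storing only the lower triangle.
#include <Eigen/Sparse>
#include <Eigen/AccelerateSupport>
#include <iostream>

int main() {
  using SpMat = Eigen::SparseMatrix<double>;
  SpMat A(3, 3);
  A.insert(0, 0) = 4.0; A.insert(1, 1) = 5.0; A.insert(2, 2) = 6.0;
  A.insert(1, 0) = 1.0; A.insert(2, 1) = 1.0;  // lower triangle only
  A.makeCompressed();

  Eigen::VectorXd b = Eigen::VectorXd::Ones(3);
  Eigen::AccelerateLLT<SpMat, Eigen::Lower> llt;   // previously: Lower | Symmetric
  llt.compute(A);
  Eigen::VectorXd x = llt.solve(b);

  std::cout << (A.selfadjointView<Eigen::Lower>() * x - b).norm() << "\n";
  return 0;
}
```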
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index da8f958..210a4d92 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -62,6 +62,17 @@
   }
 };
 
+template<typename MatrixType, typename Scalar = typename MatrixType::Scalar>
+MatrixType RandomMatrix(int rows, int cols, Scalar min, Scalar max) {
+  MatrixType M = MatrixType(rows, cols);
+  for (int i=0; i<rows; ++i) {
+    for (int j=0; j<cols; ++j) {
+      M(i, j) = Eigen::internal::random<Scalar>(min, max);
+    }
+  }
+  return M;
+}
+
 template<typename MatrixType> void adjoint(const MatrixType& m)
 {
   /* this test covers the following files:
@@ -77,17 +88,21 @@
   Index rows = m.rows();
   Index cols = m.cols();
 
-  MatrixType m1 = MatrixType::Random(rows, cols),
-             m2 = MatrixType::Random(rows, cols),
+  // Avoid integer overflow by limiting input values.
+  RealScalar rmin = static_cast<RealScalar>(NumTraits<Scalar>::IsInteger ? NumTraits<Scalar>::IsSigned ? -100 : 0 : -1);
+  RealScalar rmax = static_cast<RealScalar>(NumTraits<Scalar>::IsInteger ? 100 : 1);
+
+  MatrixType m1 = RandomMatrix<MatrixType>(rows, cols, rmin, rmax),
+             m2 = RandomMatrix<MatrixType>(rows, cols, rmin, rmax),
              m3(rows, cols),
-             square = SquareMatrixType::Random(rows, rows);
-  VectorType v1 = VectorType::Random(rows),
-             v2 = VectorType::Random(rows),
-             v3 = VectorType::Random(rows),
+             square = RandomMatrix<SquareMatrixType>(rows, rows, rmin, rmax);
+  VectorType v1 = RandomMatrix<VectorType>(rows, 1, rmin, rmax),
+             v2 = RandomMatrix<VectorType>(rows, 1, rmin, rmax),
+             v3 = RandomMatrix<VectorType>(rows, 1, rmin, rmax),
              vzero = VectorType::Zero(rows);
 
-  Scalar s1 = internal::random<Scalar>(),
-         s2 = internal::random<Scalar>();
+  Scalar s1 = internal::random<Scalar>(rmin, rmax),
+         s2 = internal::random<Scalar>(rmin, rmax);
 
   // check basic compatibility of adjoint, transpose, conjugate
   VERIFY_IS_APPROX(m1.transpose().conjugate().adjoint(),    m1);
@@ -138,7 +153,8 @@
 
   // check mixed dot product
   typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, 1> RealVectorType;
-  RealVectorType rv1 = RealVectorType::Random(rows);
+  RealVectorType rv1 = RandomMatrix<RealVectorType>(rows, 1, rmin, rmax);
+  
   VERIFY_IS_APPROX(v1.dot(rv1.template cast<Scalar>()), v1.dot(rv1));
   VERIFY_IS_APPROX(rv1.template cast<Scalar>().dot(v1), rv1.dot(v1));
 
diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp
index 291210c..539494b 100644
--- a/test/bdcsvd.cpp
+++ b/test/bdcsvd.cpp
@@ -46,10 +46,17 @@
   VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).solve(m), m);
   VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m);
   VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m);
-  
+  VERIFY_IS_APPROX(m.template bdcSvd<DisableQRDecomposition>(ComputeFullU|ComputeFullV).solve(m), m);
+  VERIFY_IS_APPROX(m.template bdcSvd<DisableQRDecomposition>(ComputeFullU|ComputeFullV).transpose().solve(m), m);
+  VERIFY_IS_APPROX(m.template bdcSvd<DisableQRDecomposition>(ComputeFullU|ComputeFullV).adjoint().solve(m), m);
+
   VERIFY_IS_APPROX(m.template bdcSvd<ComputeFullU | ComputeFullV>().solve(m), m);
   VERIFY_IS_APPROX(m.template bdcSvd<ComputeFullU | ComputeFullV>().transpose().solve(m), m);
   VERIFY_IS_APPROX(m.template bdcSvd<ComputeFullU | ComputeFullV>().adjoint().solve(m), m);
+
+  VERIFY_IS_APPROX(m.template bdcSvd<ComputeFullU | ComputeFullV | DisableQRDecomposition>().solve(m), m);
+  VERIFY_IS_APPROX(m.template bdcSvd<ComputeFullU | ComputeFullV | DisableQRDecomposition>().transpose().solve(m), m);
+  VERIFY_IS_APPROX(m.template bdcSvd<ComputeFullU | ComputeFullV | DisableQRDecomposition>().adjoint().solve(m), m);
 }
 
 // compare the Singular values returned with Jacobi and Bdc
@@ -88,7 +95,7 @@
 
 template <typename MatrixType>
 void bdcsvd_all_options(const MatrixType& input = MatrixType()) {
-  MatrixType m = input;
+  MatrixType m(input.rows(), input.cols());
   svd_fill_random(m);
   svd_option_checks<MatrixType, 0>(m);
 }
diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
index 76c8886..15492a7 100644
--- a/test/diagonalmatrices.cpp
+++ b/test/diagonalmatrices.cpp
@@ -72,6 +72,9 @@
   VERIFY_IS_APPROX( (((v1+v2).asDiagonal() * (m1+m2))(i,j))  , (v1+v2)(i) * (m1+m2)(i,j) );
   VERIFY_IS_APPROX( ((m1 * (rv1+rv2).asDiagonal())(i,j))  , (rv1+rv2)(j) * m1(i,j) );
   VERIFY_IS_APPROX( (((m1+m2) * (rv1+rv2).asDiagonal())(i,j))  , (rv1+rv2)(j) * (m1+m2)(i,j) );
+  VERIFY_IS_APPROX( (ldm1 * ldm1).diagonal()(i), ldm1.diagonal()(i) * ldm1.diagonal()(i) );
+  VERIFY_IS_APPROX( (ldm1 * ldm1 * m1)(i, j), ldm1.diagonal()(i) * ldm1.diagonal()(i) * m1(i, j) );
+  VERIFY_IS_APPROX( ((v1.asDiagonal() * v1.asDiagonal()).diagonal()(i)), v1(i) * v1(i) );
   internal::set_is_malloc_allowed(true);
   
   if(rows>1)
@@ -99,6 +102,7 @@
   res.noalias() = ldm1 * m1;
   res.noalias() = m1 * rdm1;
   res.noalias() = ldm1 * m1 * rdm1;
+  res.noalias() = LeftDiagonalMatrix::Identity(rows) * m1 * RightDiagonalMatrix::Zero(cols);
   internal::set_is_malloc_allowed(true);  
   
   // scalar multiple
@@ -127,6 +131,13 @@
   VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() + v2.asDiagonal(), sq_m1 + sq_m2);
   VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() - v2.asDiagonal(), sq_m1 - sq_m2);
   VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() - 2*v2.asDiagonal() + v1.asDiagonal(), sq_m1 - 2*sq_m2 + sq_m1);
+
+  // Zero and Identity
+  LeftDiagonalMatrix zero = LeftDiagonalMatrix::Zero(rows);
+  LeftDiagonalMatrix identity = LeftDiagonalMatrix::Identity(rows);
+  VERIFY_IS_APPROX(identity.diagonal().sum(), Scalar(rows));
+  VERIFY_IS_APPROX(zero.diagonal().sum(), Scalar(0));
+  VERIFY_IS_APPROX((zero + 2 * LeftDiagonalMatrix::Identity(rows)).diagonal().sum(), Scalar(2 * rows));
 }
 
 template<typename MatrixType> void as_scalar_product(const MatrixType& m)
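
A minimal sketch of the DiagonalMatrix initializers and diagonal-times-diagonal products exercised above, assuming this Eigen revision (the Zero/Identity statics and the diagonal product preserving a diagonal representation are taken straight from the test):

```cpp
#include <Eigen/Core>
#include <iostream>

int main() {
  using Diag = Eigen::DiagonalMatrix<double, Eigen::Dynamic>;

  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(4, 1.0, 4.0);
  Diag d(v);                     // diagonal matrix built from its diagonal vector
  Diag id = Diag::Identity(4);   // new initializer, as in the test above
  Diag z  = Diag::Zero(4);       // new initializer, as in the test above

  // A diagonal * diagonal product stays diagonal; its diagonal is the
  // element-wise product of the operands' diagonals.
  std::cout << (d * d).diagonal().transpose() << "\n";   // 1 4 9 16
  std::cout << (z + 2.0 * id).diagonal().sum() << "\n";  // 8
  return 0;
}
```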
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index 76194f7..daf24a7 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -50,7 +50,7 @@
 
 template <typename MatrixType>
 void jacobisvd_all_options(const MatrixType& input = MatrixType()) {
-  MatrixType m = input;
+  MatrixType m(input.rows(), input.cols());
   svd_fill_random(m);
   svd_option_checks<MatrixType, 0>(m);
   svd_option_checks<MatrixType, ColPivHouseholderQRPreconditioner>(m);
diff --git a/test/main.h b/test/main.h
index 8eead7c..884b6c3 100644
--- a/test/main.h
+++ b/test/main.h
@@ -87,7 +87,7 @@
 // protected by parenthesis against macro expansion, the min()/max() macros
 // are defined here and any not-parenthesized min/max call will cause a
 // compiler error.
-#if !defined(__HIPCC__) && !defined(EIGEN_USE_SYCL)
+#if !defined(__HIPCC__) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_POCKETFFT_DEFAULT)
   //
   // HIP header files include the following files
   //  <thread>
diff --git a/test/visitor.cpp b/test/visitor.cpp
index 7ff7bf1..3b118f9 100644
--- a/test/visitor.cpp
+++ b/test/visitor.cpp
@@ -175,15 +175,15 @@
 
 template<typename T, bool Vectorizable>
 struct TrackedVisitor {
-  void init(T v, int i, int j) { return this->operator()(v,i,j); }
-  void operator()(T v, int i, int j) {
+  void init(T v, Index i, Index j) { return this->operator()(v,i,j); }
+  void operator()(T v, Index i, Index j) {
     EIGEN_UNUSED_VARIABLE(v)
     visited.push_back({i, j});
     vectorized = false;
   }
   
   template<typename Packet>
-  void packet(Packet p, int i, int j) {
+  void packet(Packet p, Index i, Index j) {
     EIGEN_UNUSED_VARIABLE(p)  
     visited.push_back({i, j});
     vectorized = true;
@@ -210,9 +210,9 @@
     Eigen::Matrix4f X = Eigen::Matrix4f::Random();
     TrackedVisitor<double, false> visitor;
     X.visit(visitor);
-    int count = 0;
-    for (int j=0; j<X.cols(); ++j) {
-      for (int i=0; i<X.rows(); ++i) {
+    Index count = 0;
+    for (Index j=0; j<X.cols(); ++j) {
+      for (Index i=0; i<X.rows(); ++i) {
         VERIFY_IS_EQUAL(visitor.visited[count].first, i);
         VERIFY_IS_EQUAL(visitor.visited[count].second, j);
         ++count;
@@ -226,9 +226,9 @@
     Matrix4fRowMajor X = Matrix4fRowMajor::Random();
     TrackedVisitor<double, false> visitor;
     X.visit(visitor);
-    int count = 0;
-    for (int i=0; i<X.rows(); ++i) {
-      for (int j=0; j<X.cols(); ++j) {
+    Index count = 0;
+    for (Index i=0; i<X.rows(); ++i) {
+      for (Index j=0; j<X.cols(); ++j) {
         VERIFY_IS_EQUAL(visitor.visited[count].first, i);
         VERIFY_IS_EQUAL(visitor.visited[count].second, j);
         ++count;
@@ -241,9 +241,9 @@
     Eigen::MatrixXf X = Eigen::MatrixXf::Random(4, 4);
     TrackedVisitor<double, false> visitor;
     X.visit(visitor);
-    int count = 0;
-    for (int j=0; j<X.cols(); ++j) {
-      for (int i=0; i<X.rows(); ++i) {
+    Index count = 0;
+    for (Index j=0; j<X.cols(); ++j) {
+      for (Index i=0; i<X.rows(); ++i) {
         VERIFY_IS_EQUAL(visitor.visited[count].first, i);
         VERIFY_IS_EQUAL(visitor.visited[count].second, j);
         ++count;
@@ -257,9 +257,9 @@
     MatrixXfRowMajor X = MatrixXfRowMajor::Random(4, 4);
     TrackedVisitor<double, false> visitor;
     X.visit(visitor);
-    int count = 0;
-    for (int i=0; i<X.rows(); ++i) {
-      for (int j=0; j<X.cols(); ++j) {
+    Index count = 0;
+    for (Index i=0; i<X.rows(); ++i) {
+      for (Index j=0; j<X.cols(); ++j) {
         VERIFY_IS_EQUAL(visitor.visited[count].first, i);
         VERIFY_IS_EQUAL(visitor.visited[count].second, j);
         ++count;
@@ -274,11 +274,11 @@
     Eigen::MatrixXf X = Eigen::MatrixXf::Random(4 * PacketSize, 4 * PacketSize);
     TrackedVisitor<double, true> visitor;
     X.visit(visitor);
-    int previ = -1;
-    int prevj = 0;
+    Index previ = -1;
+    Index prevj = 0;
     for (const auto& p : visitor.visited) {
-      int i = p.first;
-      int j = p.second;
+      Index i = p.first;
+      Index j = p.second;
       VERIFY(
         (j == prevj && i == previ + 1)             // Advance single element
         || (j == prevj && i == previ + PacketSize) // Advance packet
@@ -299,11 +299,11 @@
     MatrixXfRowMajor X = MatrixXfRowMajor::Random(4 * PacketSize, 4 * PacketSize);
     TrackedVisitor<double, true> visitor;
     X.visit(visitor);
-    int previ = 0;
-    int prevj = -1;
+    Index previ = 0;
+    Index prevj = -1;
     for (const auto& p : visitor.visited) {
-      int i = p.first;
-      int j = p.second;
+      Index i = p.first;
+      Index j = p.second;
       VERIFY(
         (i == previ && j == prevj + 1)             // Advance single element
         || (i == previ && j == prevj + PacketSize) // Advance packet
@@ -316,7 +316,6 @@
       VERIFY(visitor.vectorized);
     }
   }
-  
 }
 
 EIGEN_DECLARE_TEST(visitor)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 4c069e9..e4b5e2e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -120,9 +120,7 @@
 ## TensorLayout
 
 The tensor library supports 2 layouts: `ColMajor` (the default) and
-`RowMajor`.  Only the default column major layout is currently fully
-supported, and it is therefore not recommended to attempt to use the row major
-layout at the moment.
+`RowMajor`. 
 
 The layout of a tensor is optionally specified as part of its type. If not
 specified explicitly column major is assumed.
@@ -888,6 +886,23 @@
 Returns a tensor of the same type and dimensions as the original tensor
 containing the absolute values of the original tensor.
 
+### <Operation> arg()
+
+Returns a tensor with the same dimensions as the original tensor
+containing the complex argument (phase angle) of the values of the
+original tensor.
+
+### <Operation> real()
+
+Returns a tensor with the same dimensions as the original tensor
+containing the real part of the complex values of the original tensor.
+
+### <Operation> imag()
+
+Returns a tensor with the same dimensions as the original tensor
+containing the imaginary part of the complex values of the original
+tensor.
+
 ### <Operation> pow(Scalar exponent)
 
 Returns a tensor of the same type and dimensions as the original tensor
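
To illustrate the arg() entry documented above (this mirrors the new cxx11_tensor_of_complex test further down; real() and imag() follow the same element-wise pattern):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <complex>
#include <iostream>

int main() {
  Eigen::Tensor<std::complex<float>, 1> t(3);
  t.setValues({std::complex<float>(1.f, 0.f),
               std::complex<float>(0.f, 1.f),
               std::complex<float>(-1.f, 0.f)});

  // arg() applies std::arg element-wise and yields a real-valued tensor.
  Eigen::Tensor<float, 1> phase = t.arg();  // approximately {0, pi/2, pi}
  std::cout << phase(0) << " " << phase(1) << " " << phase(2) << "\n";
  return 0;
}
```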
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 8eaf96a..4a6edb1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -312,6 +312,12 @@
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived>
+    arg() const {
+      return unaryExpr(internal::scalar_arg_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
     clip(Scalar min, Scalar max) const {
       return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
@@ -515,34 +521,34 @@
     // Comparisons and tests.
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
-    operator<(const OtherDerived& other) const {
+    operator<(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
       return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
-    operator<=(const OtherDerived& other) const {
+    operator<=(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
       return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
-    operator>(const OtherDerived& other) const {
+    operator>(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
       return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
-    operator>=(const OtherDerived& other) const {
+    operator>=(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
       return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
-    operator==(const OtherDerived& other) const {
+    operator==(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
       return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
-    operator!=(const OtherDerived& other) const {
+    operator!=(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
       return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
     }
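
Restricting the argument to TensorBase<OtherDerived, ReadOnlyAccessors> above removes the C++20 reversed-operator ambiguity while leaving ordinary tensor-vs-tensor comparisons untouched; a small sketch of the still-supported usage:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 1> a(4), b(4);
  a.setValues({1.f, 2.f, 3.f, 4.f});
  b.setConstant(2.5f);

  // Element-wise comparison of two tensor expressions yields a boolean tensor.
  Eigen::Tensor<bool, 1> lt = (a < b);  // {true, true, false, false}
  std::cout << lt(0) << lt(1) << lt(2) << lt(3) << "\n";
  return 0;
}
```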
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 227d4f3..92cbaf6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -416,6 +416,7 @@
   typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
   typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
   typedef Self LinearMapper;
+  typedef Self SubMapper;
 
   enum {
     // We can use direct offsets iff the parent mapper supports then and we can compute the strides.
@@ -485,6 +486,13 @@
     return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return SubMapper(m_base_mapper, i, j);
+    }
+    return SubMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+  }
+
   template <typename PacketT, int AlignmentType>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
     EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -531,6 +539,7 @@
   typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
   typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
   typedef SubMapper VectorMapper;
+  typedef SubMapper LinearMapper;
 
   EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
                                const nocontract_t& nocontract_strides,
@@ -544,6 +553,10 @@
     return SubMapper(*this, i, j);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(*this, i, j);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
     return VectorMapper(*this, i, j);
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
old mode 100755
new mode 100644
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 8a91779..3cbc1ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -394,8 +394,8 @@
         const size_t numX = dimensions()[m_indices[0]];
         const size_t numP = dimensions().TotalSize() / numX;
         const auto input_dim = std::array<size_t, 2>{numX, numP};
-        auto global_range = cl::sycl::range<2>{};
-        auto local_range = cl::sycl::range<2>{};
+        auto global_range = cl::sycl::range<2>{1, 1};
+        auto local_range = cl::sycl::range<2>{1, 1};
         const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
 
         m_device.parallel_for_setup(input_dim, global_range, local_range);
@@ -425,8 +425,8 @@
         const size_t numP = dimensions().TotalSize() / (numX * numY);
         auto input_dim = std::array<size_t, 3>{numX, numY, numP};
 
-        auto global_range = cl::sycl::range<3>{};
-        auto local_range = cl::sycl::range<3>{};
+        auto global_range = cl::sycl::range<3>{1, 1, 1};
+        auto local_range = cl::sycl::range<3>{1, 1, 1};
 
         m_device.parallel_for_setup(input_dim, global_range, local_range);
 
@@ -469,8 +469,8 @@
 
         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
 
-        auto global_range = cl::sycl::range<3>{};
-        auto local_range = cl::sycl::range<3>{};
+        auto global_range = cl::sycl::range<3>{1, 1, 1};
+        auto local_range = cl::sycl::range<3>{1, 1, 1};
 
         m_device.parallel_for_setup(input_dim, global_range, local_range);
         auto local_memory_range = (local_range + kernel_size - 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index b2ae608..84ebe38 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -136,6 +136,15 @@
                          this->exception_caught_ = this->sycl_async_handler(l);
                        },
                        num_threads) {}
+  
+  explicit QueueInterface(
+      const cl::sycl::queue& q, unsigned num_threads = std::thread::hardware_concurrency())
+      : m_queue(q),
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
+#endif
+        m_thread_pool(num_threads),
+        m_device_info(m_queue) {}
 
 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
   EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index c9d6938..b929e84 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -29,10 +29,19 @@
   * The default implementation is based on kissfft. It is a small, free, and
   * reasonably efficient default.
   *
-  * There are currently two implementation backend:
+  * There are currently four implementation backends:
   *
+  * - kissfft (https://github.com/mborgerding/kissfft) : simple and not so fast, BSD-3-Clause.
+  *   It is a mixed-radix Fast Fourier Transform based on the principle "Keep It Simple, Stupid."
+  *   Note that kissfft fails to handle "atypically-sized" inputs (i.e., sizes with large prime factors); a workaround is to use fftw or pocketfft.
   * - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
-  * - MKL (http://en.wikipedia.org/wiki/Math_Kernel_Library) : fastest, commercial -- may be incompatible with Eigen in GPL form.
+  * - MKL (https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html) : fastest, free -- may be incompatible with Eigen in GPL form.
+  * - pocketfft (https://gitlab.mpcdf.mpg.de/mtr/pocketfft) : faster than kissfft, BSD 3-clause.
+  *   It is a heavily modified implementation of FFTPack, with the following advantages:
+  *   1. strictly C++11 compliant
+  *   2. more accurate twiddle factor computation
+  *   3. very fast plan generation
+  *   4. worst case complexity for transform sizes with large prime factors is N*log(N), because Bluestein's algorithm is used for these cases
   *
   * \section FFTDesign Design
   *
@@ -79,15 +88,21 @@
      template <typename T> struct default_fft_impl : public internal::fftw_impl<T> {};
    }
 #elif defined EIGEN_MKL_DEFAULT
-// TODO 
-// intel Math Kernel Library: fastest, commercial -- may be incompatible with Eigen in GPL form
+// intel Math Kernel Library: fastest, free -- may be incompatible with Eigen in GPL form
 #  include "src/FFT/ei_imklfft_impl.h"
    namespace Eigen {
-     template <typename T> struct default_fft_impl : public internal::imklfft_impl {};
+     template <typename T> struct default_fft_impl : public internal::imklfft::imklfft_impl<T> {};
    }
-#else
+#elif defined EIGEN_POCKETFFT_DEFAULT
+// internal::pocketfft_impl: a heavily modified implementation of FFTPack, with many advantages.
+# include <pocketfft_hdronly.h>
+# include "src/FFT/ei_pocketfft_impl.h"
+  namespace Eigen {
+     template <typename T>
+      struct default_fft_impl : public internal::pocketfft_impl<T> {};
+  }
+#else 
 // internal::kissfft_impl:  small, free, reasonably efficient default, derived from kissfft
-//
 # include "src/FFT/ei_kissfft_impl.h"
   namespace Eigen {
      template <typename T> 
@@ -195,13 +210,13 @@
         m_impl.fwd(dst,src,static_cast<int>(nfft));
     }
 
-    /*
+#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
     inline 
     void fwd2(Complex * dst, const Complex * src, int n0,int n1)
     {
       m_impl.fwd2(dst,src,n0,n1);
     }
-    */
+#endif
 
     template <typename Input_>
     inline
@@ -354,8 +369,7 @@
     }
 
 
-    /*
-    // TODO: multi-dimensional FFTs
+#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
     inline 
     void inv2(Complex * dst, const Complex * src, int n0,int n1)
     {
@@ -363,7 +377,8 @@
       if ( HasFlag( Unscaled ) == false)
           scale(dst,1./(n0*n1),n0*n1);
     }
-  */
+#endif
+
 
     inline
     impl_type & impl() {return m_impl;}
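
A minimal sketch of backend selection and of the 2-D transforms the guards above now expose (EIGEN_POCKETFFT_DEFAULT plus pocketfft_hdronly.h on the include path are assumed here; the same pattern with EIGEN_FFTW_DEFAULT or EIGEN_MKL_DEFAULT selects the other backends, as the new test drivers further below do):

```cpp
#define EIGEN_POCKETFFT_DEFAULT  // assumed: pocketfft_hdronly.h is on the include path
#include <unsupported/Eigen/FFT>
#include <complex>
#include <vector>

int main() {
  Eigen::FFT<float> fft;

  // 1-D real-to-complex forward and scaled inverse.
  std::vector<float> time(64, 1.0f), back;
  std::vector<std::complex<float>> freq;
  fft.fwd(freq, time);
  fft.inv(back, freq);  // back ~= time

  // 2-D complex transforms, compiled only for the fftw/pocketfft/MKL backends.
  std::vector<std::complex<float>> src(8 * 8, {1.0f, 0.0f}), dst(8 * 8);
  fft.fwd2(dst.data(), src.data(), 8, 8);
  fft.inv2(src.data(), dst.data(), 8, 8);
  return 0;
}
```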
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
old mode 100755
new mode 100644
index 47b0b34..97222d1
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -715,6 +715,23 @@
   };
 };
 
+namespace internal {
+template<typename DerivativeType>
+struct is_identically_zero_impl<AutoDiffScalar<DerivativeType>> {
+  static inline bool run(const AutoDiffScalar<DerivativeType>& s)
+  {
+    const DerivativeType& derivatives = s.derivatives();
+    for(int i=0; i<derivatives.size(); ++i)
+    {
+      if(!numext::is_exactly_zero(derivatives[i]))
+      {
+        return false;
+      }
+    }
+    return numext::is_exactly_zero(s.value());
+  }
+};
+}
 }
 
 namespace std {
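
A brief sketch of the distinction the specialization above encodes (the constructors are standard AutoDiffScalar API; the point is that a zero value with non-zero derivatives must not be treated as identically zero, which is what tripped up LDLT in issue 2481):

```cpp
#include <Eigen/Dense>
#include <unsupported/Eigen/AutoDiff>
#include <iostream>

int main() {
  typedef Eigen::AutoDiffScalar<Eigen::VectorXd> AD;

  AD a(0.0, 2, 0);                      // value 0, but d/dx0 == 1
  AD b(0.0, Eigen::VectorXd::Zero(2));  // value 0 and all derivatives 0

  // Only b qualifies as identically zero under the specialization above;
  // a merely has a zero value and still carries derivative information.
  std::cout << a.derivatives().isZero() << " " << b.derivatives().isZero() << "\n";  // 0 1
  return 0;
}
```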
diff --git a/unsupported/Eigen/src/FFT/ei_imklfft_impl.h b/unsupported/Eigen/src/FFT/ei_imklfft_impl.h
new file mode 100644
index 0000000..186a66c
--- /dev/null
+++ b/unsupported/Eigen/src/FFT/ei_imklfft_impl.h
@@ -0,0 +1,288 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <mkl_dfti.h>
+
+#include "./InternalHeaderCheck.h"
+
+#include <complex>
+
+namespace Eigen {
+namespace internal {
+namespace imklfft {
+
+#define RUN_OR_ASSERT(EXPR, ERROR_MSG)                   \
+  {                                                      \
+    MKL_LONG status = (EXPR);                            \
+    eigen_assert(status == DFTI_NO_ERROR && (ERROR_MSG)); \
+  };
+
+inline MKL_Complex16* complex_cast(const std::complex<double>* p) {
+  return const_cast<MKL_Complex16*>(reinterpret_cast<const MKL_Complex16*>(p));
+}
+
+inline MKL_Complex8* complex_cast(const std::complex<float>* p) {
+  return const_cast<MKL_Complex8*>(reinterpret_cast<const MKL_Complex8*>(p));
+}
+
+/*
+ * Parameters:
+ * precision: enum, Precision of the transform: DFTI_SINGLE or DFTI_DOUBLE.
+ * forward_domain: enum, Forward domain of the transform: DFTI_COMPLEX or DFTI_REAL.
+ * dimension: MKL_LONG, Dimension of the transform.
+ * sizes: MKL_LONG if dimension = 1: length of the transform for a one-dimensional transform.
+ *        Array of type MKL_LONG otherwise: lengths of each dimension for a multi-dimensional transform.
+ */
+inline void configure_descriptor(DFTI_DESCRIPTOR_HANDLE* handl,
+                                 enum DFTI_CONFIG_VALUE precision,
+                                 enum DFTI_CONFIG_VALUE forward_domain,
+                                 MKL_LONG dimension, MKL_LONG* sizes) {
+  eigen_assert((dimension == 1 || dimension == 2) &&
+               "Transformation dimension must be less than 3.");
+
+  if (dimension == 1) {
+    RUN_OR_ASSERT(DftiCreateDescriptor(handl, precision, forward_domain,
+                                       dimension, *sizes),
+                  "DftiCreateDescriptor failed.")
+    if (forward_domain == DFTI_REAL) {
+      // Set CCE storage
+      RUN_OR_ASSERT(DftiSetValue(*handl, DFTI_CONJUGATE_EVEN_STORAGE,
+                                 DFTI_COMPLEX_COMPLEX),
+                    "DftiSetValue failed.")
+    }
+  } else {
+    RUN_OR_ASSERT(
+        DftiCreateDescriptor(handl, precision, DFTI_COMPLEX, dimension, sizes),
+        "DftiCreateDescriptor failed.")
+  }
+
+  RUN_OR_ASSERT(DftiSetValue(*handl, DFTI_PLACEMENT, DFTI_NOT_INPLACE),
+                "DftiSetValue failed.")
+  RUN_OR_ASSERT(DftiCommitDescriptor(*handl), "DftiCommitDescriptor failed.")
+}
+
+template <typename T>
+struct plan {};
+
+template <>
+struct plan<float> {
+  typedef float scalar_type;
+  typedef MKL_Complex8 complex_type;
+
+  DFTI_DESCRIPTOR_HANDLE m_plan;
+
+  plan() : m_plan(0) {}
+  ~plan() {
+    if (m_plan) DftiFreeDescriptor(&m_plan);
+  };
+
+  enum DFTI_CONFIG_VALUE precision = DFTI_SINGLE;
+
+  inline void forward(complex_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(&m_plan, precision, DFTI_COMPLEX, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan, src, dst),
+                  "DftiComputeForward failed.")
+  }
+
+  inline void inverse(complex_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(&m_plan, precision, DFTI_COMPLEX, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan, src, dst),
+                  "DftiComputeBackward failed.")
+  }
+
+  inline void forward(complex_type* dst, scalar_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(&m_plan, precision, DFTI_REAL, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan, src, dst),
+                  "DftiComputeForward failed.")
+  }
+
+  inline void inverse(scalar_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(&m_plan, precision, DFTI_REAL, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan, src, dst),
+                  "DftiComputeBackward failed.")
+  }
+
+  inline void forward2(complex_type* dst, complex_type* src, int n0, int n1) {
+    if (m_plan == 0) {
+      MKL_LONG sizes[2] = {n0, n1};
+      configure_descriptor(&m_plan, precision, DFTI_COMPLEX, 2, sizes);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan, src, dst),
+                  "DftiComputeForward failed.")
+  }
+
+  inline void inverse2(complex_type* dst, complex_type* src, int n0, int n1) {
+    if (m_plan == 0) {
+      MKL_LONG sizes[2] = {n0, n1};
+      configure_descriptor(&m_plan, precision, DFTI_COMPLEX, 2, sizes);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan, src, dst),
+                  "DftiComputeBackward failed.")
+  }
+};
+
+template <>
+struct plan<double> {
+  typedef double scalar_type;
+  typedef MKL_Complex16 complex_type;
+
+  DFTI_DESCRIPTOR_HANDLE m_plan;
+
+  plan() : m_plan(0) {}
+  ~plan() {
+    if (m_plan) DftiFreeDescriptor(&m_plan);
+  };
+
+  enum DFTI_CONFIG_VALUE precision = DFTI_DOUBLE;
+
+  inline void forward(complex_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(&m_plan, precision, DFTI_COMPLEX, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan, src, dst),
+                  "DftiComputeForward failed.")
+  }
+
+  inline void inverse(complex_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(&m_plan, precision, DFTI_COMPLEX, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan, src, dst),
+                  "DftiComputeBackward failed.")
+  }
+
+  inline void forward(complex_type* dst, scalar_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(&m_plan, precision, DFTI_REAL, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan, src, dst),
+                  "DftiComputeForward failed.")
+  }
+
+  inline void inverse(scalar_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(&m_plan, precision, DFTI_REAL, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan, src, dst),
+                  "DftiComputeBackward failed.")
+  }
+
+  inline void forward2(complex_type* dst, complex_type* src, int n0, int n1) {
+    if (m_plan == 0) {
+      MKL_LONG sizes[2] = {n0, n1};
+      configure_descriptor(&m_plan, precision, DFTI_COMPLEX, 2, sizes);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan, src, dst),
+                  "DftiComputeForward failed.")
+  }
+
+  inline void inverse2(complex_type* dst, complex_type* src, int n0, int n1) {
+    if (m_plan == 0) {
+      MKL_LONG sizes[2] = {n0, n1};
+      configure_descriptor(&m_plan, precision, DFTI_COMPLEX, 2, sizes);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan, src, dst),
+                  "DftiComputeBackward failed.")
+  }
+};
+
+template <typename Scalar_>
+struct imklfft_impl {
+  typedef Scalar_ Scalar;
+  typedef std::complex<Scalar> Complex;
+
+  inline void clear() { m_plans.clear(); }
+
+  // complex-to-complex forward FFT
+  inline void fwd(Complex* dst, const Complex* src, int nfft) {
+    MKL_LONG size = nfft;
+    get_plan(nfft, dst, src)
+        .forward(complex_cast(dst), complex_cast(src), size);
+  }
+
+  // real-to-complex forward FFT
+  inline void fwd(Complex* dst, const Scalar* src, int nfft) {
+    MKL_LONG size = nfft;
+    get_plan(nfft, dst, src)
+        .forward(complex_cast(dst), const_cast<Scalar*>(src), nfft);
+  }
+
+  // 2-d complex-to-complex
+  inline void fwd2(Complex* dst, const Complex* src, int n0, int n1) {
+    get_plan(n0, n1, dst, src)
+        .forward2(complex_cast(dst), complex_cast(src), n0, n1);
+  }
+
+  // inverse complex-to-complex
+  inline void inv(Complex* dst, const Complex* src, int nfft) {
+    MKL_LONG size = nfft;
+    get_plan(nfft, dst, src)
+        .inverse(complex_cast(dst), complex_cast(src), nfft);
+  }
+
+  // half-complex to scalar
+  inline void inv(Scalar* dst, const Complex* src, int nfft) {
+    MKL_LONG size = nfft;
+    get_plan(nfft, dst, src)
+        .inverse(const_cast<Scalar*>(dst), complex_cast(src), nfft);
+  }
+
+  // 2-d complex-to-complex
+  inline void inv2(Complex* dst, const Complex* src, int n0, int n1) {
+    get_plan(n0, n1, dst, src)
+        .inverse2(complex_cast(dst), complex_cast(src), n0, n1);
+  }
+
+ private:
+  std::map<int64_t, plan<Scalar>> m_plans;
+
+  inline plan<Scalar>& get_plan(int nfft, void* dst,
+                                const void* src) {
+    int inplace = dst == src ? 1 : 0;
+    int aligned = ((reinterpret_cast<size_t>(src) & 15) |
+                   (reinterpret_cast<size_t>(dst) & 15)) == 0
+                      ? 1
+                      : 0;
+    int64_t key = ((nfft << 2) | (inplace << 1) | aligned)
+                  << 1;
+
+    // Create element if key does not exist.
+    return m_plans[key];
+  }
+
+  inline plan<Scalar>& get_plan(int n0, int n1, void* dst,
+                                const void* src) {
+    int inplace = (dst == src) ? 1 : 0;
+    int aligned = ((reinterpret_cast<size_t>(src) & 15) |
+                   (reinterpret_cast<size_t>(dst) & 15)) == 0
+                      ? 1
+                      : 0;
+    int64_t key = (((((int64_t)n0) << 31) | (n1 << 2) |
+                    (inplace << 1) | aligned)
+                   << 1) +
+                  1;
+
+    // Create element if key does not exist.
+    return m_plans[key];
+  }
+};
+
+#undef RUN_OR_ASSERT
+
+}  // namespace imklfft
+}  // namespace internal
+}  // namespace Eigen
diff --git a/unsupported/Eigen/src/FFT/ei_pocketfft_impl.h b/unsupported/Eigen/src/FFT/ei_pocketfft_impl.h
new file mode 100644
index 0000000..f2da890
--- /dev/null
+++ b/unsupported/Eigen/src/FFT/ei_pocketfft_impl.h
@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. 
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+using namespace pocketfft;
+using namespace pocketfft::detail;
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename _Scalar>
+struct pocketfft_impl
+{
+  typedef _Scalar Scalar;
+  typedef std::complex<Scalar> Complex;
+
+  inline void clear() {}
+
+  inline void fwd(Complex* dst, const Scalar* src, int nfft){
+    const shape_t  shape_{ static_cast<size_t>(nfft) };
+    const shape_t  axes_{ 0 };
+    const stride_t stride_in{ sizeof(Scalar) };
+    const stride_t stride_out{ sizeof(Complex) };
+    r2c(shape_, stride_in, stride_out, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void fwd(Complex* dst, const Complex* src, int nfft){
+    const shape_t  shape_{ static_cast<size_t>(nfft) };
+    const shape_t  axes_{ 0 };
+    const stride_t stride_{ sizeof(Complex) };
+    c2c(shape_, stride_, stride_, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void inv(Scalar* dst,  const Complex* src, int nfft){
+    const shape_t  shape_{ static_cast<size_t>(nfft) };
+    const shape_t  axes_{ 0 };
+    const stride_t stride_in{ sizeof(Complex) };
+    const stride_t stride_out{ sizeof(Scalar) };
+    c2r(shape_, stride_in, stride_out, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
+  }  
+
+  inline void inv(Complex* dst, const Complex* src, int nfft){
+    const shape_t  shape_{ static_cast<size_t>(nfft) };
+    const shape_t  axes_{ 0 };
+    const stride_t stride_{ sizeof(Complex) };
+    c2c(shape_, stride_, stride_, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void fwd2(Complex* dst, const Complex* src, int nfft0, int nfft1){
+    const shape_t  shape_{ static_cast<size_t>(nfft0), static_cast<size_t>(nfft1) };
+    const shape_t  axes_{ 0, 1 };
+    const stride_t stride_{ static_cast<ptrdiff_t>(sizeof(Complex)*nfft1), static_cast<ptrdiff_t>(sizeof(Complex)) };
+    c2c(shape_, stride_, stride_, axes_, FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void inv2(Complex* dst, const Complex* src, int nfft0, int nfft1){
+    const shape_t  shape_{ static_cast<size_t>(nfft0), static_cast<size_t>(nfft1) };
+    const shape_t  axes_{ 0, 1 };
+    const stride_t stride_{ static_cast<ptrdiff_t>(sizeof(Complex)*nfft1), static_cast<ptrdiff_t>(sizeof(Complex)) };
+    c2c(shape_, stride_, stride_, axes_, BACKWARD, src, dst, static_cast<Scalar>(1));
+  }
+};
+
+} // namespace internal
+} // namespace Eigen
diff --git a/unsupported/Eigen/src/IterativeSolvers/BiCGSTABL.h b/unsupported/Eigen/src/IterativeSolvers/BiCGSTABL.h
old mode 100755
new mode 100644
diff --git a/unsupported/Eigen/src/IterativeSolvers/IDRS.h b/unsupported/Eigen/src/IterativeSolvers/IDRS.h
old mode 100755
new mode 100644
diff --git a/unsupported/Eigen/src/IterativeSolvers/IDRSTABL.h b/unsupported/Eigen/src/IterativeSolvers/IDRSTABL.h
old mode 100755
new mode 100644
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
index 7dd3c3e..909b08e 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -4,6 +4,9 @@
 namespace Eigen {
 namespace internal {
 
+// Bessel functions only available for some compilers.
+#if EIGEN_HAS_AVX512_MATH
+
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0)
 
@@ -40,6 +43,8 @@
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1)
 
+#endif
+
 }  // namespace internal
 }  // namespace Eigen
 
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index b7661e7..21d8c5e 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -77,6 +77,17 @@
   ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
 endif()
 
+find_path(POCKETFFT  pocketfft_hdronly.h)
+if(POCKETFFT)
+  if(EIGEN_TEST_CXX11)
+    ei_add_property(EIGEN_TESTED_BACKENDS "pocketfft, ")
+    include_directories( ${POCKETFFT} )
+    ei_add_test(pocketfft "-pthread" "${CMAKE_THREAD_LIBS_INIT}" "-DEIGEN_POCKETFFT_DEFAULT" )  
+  endif()  
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "pocketfft, ")
+endif()
+
 option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
 if(EIGEN_TEST_OPENGL)
   find_package(OpenGL)
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 45c87f5..f85461c 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -1,2 +1,2 @@
-#define test_FFTW test_FFT
-#include "FFTW.cpp"
+#define EIGEN_FFT_DEFAULT 1
+#include "fft_test_shared.h"
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index cfe559e..d69867c 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -1,262 +1,2 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Mark Borgerding mark a borgerding net
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <unsupported/Eigen/FFT>
-
-template <typename T> 
-std::complex<T> RandomCpx() { return std::complex<T>( (T)(rand()/(T)RAND_MAX - .5), (T)(rand()/(T)RAND_MAX - .5) ); }
-
-using namespace std;
-using namespace Eigen;
-
-
-template < typename T>
-complex<long double>  promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); }
-
-complex<long double>  promote(float x) { return complex<long double>((long double)x); }
-complex<long double>  promote(double x) { return complex<long double>((long double)x); }
-complex<long double>  promote(long double x) { return complex<long double>((long double)x); }
-    
-
-    template <typename VT1,typename VT2>
-    long double fft_rmse( const VT1 & fftbuf,const VT2 & timebuf)
-    {
-        long double totalpower=0;
-        long double difpower=0;
-        long double pi = acos((long double)-1 );
-        for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) {
-            complex<long double> acc = 0;
-            long double phinc = (long double)(-2.)*k0* pi / timebuf.size();
-            for (size_t k1=0;k1<(size_t)timebuf.size();++k1) {
-                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
-            }
-            totalpower += numext::abs2(acc);
-            complex<long double> x = promote(fftbuf[k0]); 
-            complex<long double> dif = acc - x;
-            difpower += numext::abs2(dif);
-            //cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(numext::abs2(dif)) << endl;
-        }
-        cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
-        return sqrt(difpower/totalpower);
-    }
-
-    template <typename VT1,typename VT2>
-    long double dif_rmse( const VT1 buf1,const VT2 buf2)
-    {
-        long double totalpower=0;
-        long double difpower=0;
-        size_t n = (min)( buf1.size(),buf2.size() );
-        for (size_t k=0;k<n;++k) {
-            totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2);
-            difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
-        }
-        return sqrt(difpower/totalpower);
-    }
-
-enum { StdVectorContainer, EigenVectorContainer };
-
-template<int Container, typename Scalar> struct VectorType;
-
-template<typename Scalar> struct VectorType<StdVectorContainer,Scalar>
-{
-  typedef vector<Scalar> type;
-};
-
-template<typename Scalar> struct VectorType<EigenVectorContainer,Scalar>
-{
-  typedef Matrix<Scalar,Dynamic,1> type;
-};
-
-template <int Container, typename T>
-void test_scalar_generic(int nfft)
-{
-    typedef typename FFT<T>::Complex Complex;
-    typedef typename FFT<T>::Scalar Scalar;
-    typedef typename VectorType<Container,Scalar>::type ScalarVector;
-    typedef typename VectorType<Container,Complex>::type ComplexVector;
-
-    FFT<T> fft;
-    ScalarVector tbuf(nfft);
-    ComplexVector freqBuf;
-    for (int k=0;k<nfft;++k)
-        tbuf[k]= (T)( rand()/(double)RAND_MAX - .5);
-
-    // make sure it DOESN'T give the right full spectrum answer
-    // if we've asked for half-spectrum
-    fft.SetFlag(fft.HalfSpectrum );
-    fft.fwd( freqBuf,tbuf);
-    VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) );
-    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
-
-    fft.ClearFlag(fft.HalfSpectrum );
-    fft.fwd( freqBuf,tbuf);
-    VERIFY( (size_t)freqBuf.size() == (size_t)nfft);
-    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
-
-    if (nfft&1)
-        return; // odd FFTs get the wrong size inverse FFT
-
-    ScalarVector tbuf2;
-    fft.inv( tbuf2 , freqBuf);
-    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
-
-
-    // verify that the Unscaled flag takes effect
-    ScalarVector tbuf3;
-    fft.SetFlag(fft.Unscaled);
-
-    fft.inv( tbuf3 , freqBuf);
-
-    for (int k=0;k<nfft;++k)
-        tbuf3[k] *= T(1./nfft);
-
-
-    //for (size_t i=0;i<(size_t) tbuf.size();++i)
-    //    cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " -  in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) <<  endl;
-
-    VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>()  );// gross check
-
-    // verify that ClearFlag works
-    fft.ClearFlag(fft.Unscaled);
-    fft.inv( tbuf2 , freqBuf);
-    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
-}
-
-template <typename T>
-void test_scalar(int nfft)
-{
-  test_scalar_generic<StdVectorContainer,T>(nfft);
-  //test_scalar_generic<EigenVectorContainer,T>(nfft);
-}
-
-
-template <int Container, typename T>
-void test_complex_generic(int nfft)
-{
-    typedef typename FFT<T>::Complex Complex;
-    typedef typename VectorType<Container,Complex>::type ComplexVector;
-
-    FFT<T> fft;
-
-    ComplexVector inbuf(nfft);
-    ComplexVector outbuf;
-    ComplexVector buf3;
-    for (int k=0;k<nfft;++k)
-        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
-    fft.fwd( outbuf , inbuf);
-
-    VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>()  );// gross check
-    fft.inv( buf3 , outbuf);
-
-    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
-
-    // verify that the Unscaled flag takes effect
-    ComplexVector buf4;
-    fft.SetFlag(fft.Unscaled);
-    fft.inv( buf4 , outbuf);
-    for (int k=0;k<nfft;++k)
-        buf4[k] *= T(1./nfft);
-    VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>()  );// gross check
-
-    // verify that ClearFlag works
-    fft.ClearFlag(fft.Unscaled);
-    fft.inv( buf3 , outbuf);
-    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
-}
-
-template <typename T>
-void test_complex(int nfft)
-{
-  test_complex_generic<StdVectorContainer,T>(nfft);
-  test_complex_generic<EigenVectorContainer,T>(nfft);
-}
-/*
-template <typename T,int nrows,int ncols>
-void test_complex2d()
-{
-    typedef typename Eigen::FFT<T>::Complex Complex;
-    FFT<T> fft;
-    Eigen::Matrix<Complex,nrows,ncols> src,src2,dst,dst2;
-
-    src = Eigen::Matrix<Complex,nrows,ncols>::Random();
-    //src =  Eigen::Matrix<Complex,nrows,ncols>::Identity();
-
-    for (int k=0;k<ncols;k++) {
-        Eigen::Matrix<Complex,nrows,1> tmpOut;
-        fft.fwd( tmpOut,src.col(k) );
-        dst2.col(k) = tmpOut;
-    }
-
-    for (int k=0;k<nrows;k++) {
-        Eigen::Matrix<Complex,1,ncols> tmpOut;
-        fft.fwd( tmpOut,  dst2.row(k) );
-        dst2.row(k) = tmpOut;
-    }
-
-    fft.fwd2(dst.data(),src.data(),ncols,nrows);
-    fft.inv2(src2.data(),dst.data(),ncols,nrows);
-    VERIFY( (src-src2).norm() < test_precision<T>() );
-    VERIFY( (dst-dst2).norm() < test_precision<T>() );
-}
-*/
-
-
-void test_return_by_value(int len)
-{
-    VectorXf in;
-    VectorXf in1;
-    in.setRandom( len );
-    VectorXcf out1,out2;
-    FFT<float> fft;
-
-    fft.SetFlag(fft.HalfSpectrum );
-
-    fft.fwd(out1,in);
-    out2 = fft.fwd(in);
-    VERIFY( (out1-out2).norm() < test_precision<float>() );
-    in1 = fft.inv(out1);
-    VERIFY( (in1-in).norm() < test_precision<float>() );
-}
-
-EIGEN_DECLARE_TEST(FFTW)
-{
-  CALL_SUBTEST( test_return_by_value(32) );
-  //CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
-  //CALL_SUBTEST( ( test_complex2d<long double,4,8> () ) );
-  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); 
-  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); 
-  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); 
-  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); 
-  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); 
-  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); 
-  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); 
-
-  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); 
-  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); 
-  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); 
-  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); 
-  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); 
-  
-  #ifdef EIGEN_HAS_FFTWL
-  CALL_SUBTEST( test_complex<long double>(32) );
-  CALL_SUBTEST( test_complex<long double>(256) );
-  CALL_SUBTEST( test_complex<long double>(3*8) );
-  CALL_SUBTEST( test_complex<long double>(5*32) );
-  CALL_SUBTEST( test_complex<long double>(2*3*4) );
-  CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
-  CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
-  
-  CALL_SUBTEST( test_scalar<long double>(32) );
-  CALL_SUBTEST( test_scalar<long double>(45) );
-  CALL_SUBTEST( test_scalar<long double>(50) );
-  CALL_SUBTEST( test_scalar<long double>(256) );
-  CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
-  #endif
-}
+#define EIGEN_FFTW_DEFAULT 1 
+#include "fft_test_shared.h"
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
index 99e1807..b2f5994 100644
--- a/unsupported/test/cxx11_tensor_of_complex.cpp
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -47,6 +47,20 @@
   }
 }
 
+static void test_arg()
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<double>, 1> data2(3);
+  data1.setRandom();
+  data2.setRandom();
+
+  Tensor<float, 1> arg1 = data1.arg();
+  Tensor<double, 1> arg2 = data2.arg();
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_APPROX(arg1(i), std::arg(data1(i)));
+    VERIFY_IS_APPROX(arg2(i), std::arg(data2(i)));
+  }
+}
 
 static void test_conjugate()
 {
@@ -98,6 +112,7 @@
 {
   CALL_SUBTEST(test_additions());
   CALL_SUBTEST(test_abs());
+  CALL_SUBTEST(test_arg());
   CALL_SUBTEST(test_conjugate());
   CALL_SUBTEST(test_contractions());
 }
diff --git a/unsupported/test/fft_test_shared.h b/unsupported/test/fft_test_shared.h
new file mode 100644
index 0000000..0e040ad
--- /dev/null
+++ b/unsupported/test/fft_test_shared.h
@@ -0,0 +1,277 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/FFT>
+
+template <typename T>
+inline std::complex<T> RandomCpx() {
+  return std::complex<T>((T)(rand() / (T)RAND_MAX - .5), (T)(rand() / (T)RAND_MAX - .5));
+}
+
+using namespace std;
+using namespace Eigen;
+
+template <typename T>
+inline complex<long double> promote(complex<T> x) {
+  return complex<long double>((long double)x.real(), (long double)x.imag());
+}
+
+inline complex<long double> promote(float x) { return complex<long double>((long double)x); }
+inline complex<long double> promote(double x) { return complex<long double>((long double)x); }
+inline complex<long double> promote(long double x) { return complex<long double>((long double)x); }
+
+template <typename VT1, typename VT2>
+long double fft_rmse(const VT1& fftbuf, const VT2& timebuf) {
+  long double totalpower = 0;
+  long double difpower = 0;
+  long double pi = acos((long double)-1);
+  for (size_t k0 = 0; k0 < (size_t)fftbuf.size(); ++k0) {
+    complex<long double> acc = 0;
+    long double phinc = (long double)(-2.) * k0 * pi / timebuf.size();
+    for (size_t k1 = 0; k1 < (size_t)timebuf.size(); ++k1) {
+      acc += promote(timebuf[k1]) * exp(complex<long double>(0, k1 * phinc));
+    }
+    totalpower += numext::abs2(acc);
+    complex<long double> x = promote(fftbuf[k0]);
+    complex<long double> dif = acc - x;
+    difpower += numext::abs2(dif);
+    // cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(numext::abs2(dif)) << endl;
+  }
+  // cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
+  return sqrt(difpower / totalpower);
+}
+
+template <typename VT1, typename VT2>
+long double dif_rmse(const VT1 buf1, const VT2 buf2) {
+  long double totalpower = 0;
+  long double difpower = 0;
+  size_t n = (min)(buf1.size(), buf2.size());
+  for (size_t k = 0; k < n; ++k) {
+    totalpower += (long double)((numext::abs2(buf1[k]) + numext::abs2(buf2[k])) / 2);
+    difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
+  }
+  return sqrt(difpower / totalpower);
+}
+
+enum { StdVectorContainer, EigenVectorContainer };
+
+template <int Container, typename Scalar>
+struct VectorType;
+
+template <typename Scalar>
+struct VectorType<StdVectorContainer, Scalar> {
+  typedef vector<Scalar> type;
+};
+
+template <typename Scalar>
+struct VectorType<EigenVectorContainer, Scalar> {
+  typedef Matrix<Scalar, Dynamic, 1> type;
+};
+
+template <int Container, typename T>
+void test_scalar_generic(int nfft) {
+  typedef typename FFT<T>::Complex Complex;
+  typedef typename FFT<T>::Scalar Scalar;
+  typedef typename VectorType<Container, Scalar>::type ScalarVector;
+  typedef typename VectorType<Container, Complex>::type ComplexVector;
+
+  FFT<T> fft;
+  ScalarVector tbuf(nfft);
+  ComplexVector freqBuf;
+  for (int k = 0; k < nfft; ++k) tbuf[k] = (T)(rand() / (double)RAND_MAX - .5);
+
+  // make sure it DOESN'T give the right full spectrum answer
+  // if we've asked for half-spectrum
+  fft.SetFlag(fft.HalfSpectrum);
+  fft.fwd(freqBuf, tbuf);
+  VERIFY((size_t)freqBuf.size() == (size_t)((nfft >> 1) + 1));
+  VERIFY(T(fft_rmse(freqBuf, tbuf)) < test_precision<T>());  // gross check
+
+  fft.ClearFlag(fft.HalfSpectrum);
+  fft.fwd(freqBuf, tbuf);
+  VERIFY((size_t)freqBuf.size() == (size_t)nfft);
+  VERIFY(T(fft_rmse(freqBuf, tbuf)) < test_precision<T>());  // gross check
+
+  if (nfft & 1) return;  // odd FFTs get the wrong size inverse FFT
+
+  ScalarVector tbuf2;
+  fft.inv(tbuf2, freqBuf);
+  VERIFY(T(dif_rmse(tbuf, tbuf2)) < test_precision<T>());  // gross check
+
+  // verify that the Unscaled flag takes effect
+  ScalarVector tbuf3;
+  fft.SetFlag(fft.Unscaled);
+
+  fft.inv(tbuf3, freqBuf);
+
+  for (int k = 0; k < nfft; ++k) tbuf3[k] *= T(1. / nfft);
+
+  // for (size_t i=0;i<(size_t) tbuf.size();++i)
+  //     cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " -  in=" << tbuf[i] << " => " << (tbuf3[i] -
+  //     tbuf[i] ) <<  endl;
+
+  VERIFY(T(dif_rmse(tbuf, tbuf3)) < test_precision<T>());  // gross check
+
+  // verify that ClearFlag works
+  fft.ClearFlag(fft.Unscaled);
+  fft.inv(tbuf2, freqBuf);
+  VERIFY(T(dif_rmse(tbuf, tbuf2)) < test_precision<T>());  // gross check
+}
+
+template <typename T>
+void test_scalar(int nfft) {
+  test_scalar_generic<StdVectorContainer, T>(nfft);
+  // test_scalar_generic<EigenVectorContainer,T>(nfft);
+}
+
+template <int Container, typename T>
+void test_complex_generic(int nfft) {
+  typedef typename FFT<T>::Complex Complex;
+  typedef typename VectorType<Container, Complex>::type ComplexVector;
+
+  FFT<T> fft;
+
+  ComplexVector inbuf(nfft);
+  ComplexVector outbuf;
+  ComplexVector buf3;
+  for (int k = 0; k < nfft; ++k)
+    inbuf[k] = Complex((T)(rand() / (double)RAND_MAX - .5), (T)(rand() / (double)RAND_MAX - .5));
+  fft.fwd(outbuf, inbuf);
+
+  VERIFY(T(fft_rmse(outbuf, inbuf)) < test_precision<T>());  // gross check
+  fft.inv(buf3, outbuf);
+
+  VERIFY(T(dif_rmse(inbuf, buf3)) < test_precision<T>());  // gross check
+
+  // verify that the Unscaled flag takes effect
+  ComplexVector buf4;
+  fft.SetFlag(fft.Unscaled);
+  fft.inv(buf4, outbuf);
+  for (int k = 0; k < nfft; ++k) buf4[k] *= T(1. / nfft);
+  VERIFY(T(dif_rmse(inbuf, buf4)) < test_precision<T>());  // gross check
+
+  // verify that ClearFlag works
+  fft.ClearFlag(fft.Unscaled);
+  fft.inv(buf3, outbuf);
+  VERIFY(T(dif_rmse(inbuf, buf3)) < test_precision<T>());  // gross check
+}
+
+template <typename T>
+void test_complex(int nfft) {
+  test_complex_generic<StdVectorContainer, T>(nfft);
+  test_complex_generic<EigenVectorContainer, T>(nfft);
+}
+
+template <typename T, int nrows, int ncols>
+void test_complex2d() {
+  typedef typename Eigen::FFT<T>::Complex Complex;
+  FFT<T> fft;
+  Eigen::Matrix<Complex, nrows, ncols> src, src2, dst, dst2;
+
+  src = Eigen::Matrix<Complex, nrows, ncols>::Random();
+  // src =  Eigen::Matrix<Complex,nrows,ncols>::Identity();
+
+  for (int k = 0; k < ncols; k++) {
+    Eigen::Matrix<Complex, nrows, 1> tmpOut;
+    fft.fwd(tmpOut, src.col(k));
+    dst2.col(k) = tmpOut;
+  }
+
+  for (int k = 0; k < nrows; k++) {
+    Eigen::Matrix<Complex, 1, ncols> tmpOut;
+    fft.fwd(tmpOut, dst2.row(k));
+    dst2.row(k) = tmpOut;
+  }
+
+  fft.fwd2(dst.data(), src.data(), ncols, nrows);
+  fft.inv2(src2.data(), dst.data(), ncols, nrows);
+  VERIFY((src - src2).norm() < test_precision<T>());
+  VERIFY((dst - dst2).norm() < test_precision<T>());
+}
+
+inline void test_return_by_value(int len) {
+  VectorXf in;
+  VectorXf in1;
+  in.setRandom(len);
+  VectorXcf out1, out2;
+  FFT<float> fft;
+
+  fft.SetFlag(fft.HalfSpectrum);
+
+  fft.fwd(out1, in);
+  out2 = fft.fwd(in);
+  VERIFY((out1 - out2).norm() < test_precision<float>());
+  in1 = fft.inv(out1);
+  VERIFY((in1 - in).norm() < test_precision<float>());
+}
+
+EIGEN_DECLARE_TEST(FFTW) {
+  CALL_SUBTEST(test_return_by_value(32));
+  CALL_SUBTEST(test_complex<float>(32));
+  CALL_SUBTEST(test_complex<double>(32));
+  CALL_SUBTEST(test_complex<float>(256));
+  CALL_SUBTEST(test_complex<double>(256));
+  CALL_SUBTEST(test_complex<float>(3 * 8));
+  CALL_SUBTEST(test_complex<double>(3 * 8));
+  CALL_SUBTEST(test_complex<float>(5 * 32));
+  CALL_SUBTEST(test_complex<double>(5 * 32));
+  CALL_SUBTEST(test_complex<float>(2 * 3 * 4));
+  CALL_SUBTEST(test_complex<double>(2 * 3 * 4));
+  CALL_SUBTEST(test_complex<float>(2 * 3 * 4 * 5));
+  CALL_SUBTEST(test_complex<double>(2 * 3 * 4 * 5));
+  CALL_SUBTEST(test_complex<float>(2 * 3 * 4 * 5 * 7));
+  CALL_SUBTEST(test_complex<double>(2 * 3 * 4 * 5 * 7));
+
+  CALL_SUBTEST(test_scalar<float>(32));
+  CALL_SUBTEST(test_scalar<double>(32));
+  CALL_SUBTEST(test_scalar<float>(45));
+  CALL_SUBTEST(test_scalar<double>(45));
+  CALL_SUBTEST(test_scalar<float>(50));
+  CALL_SUBTEST(test_scalar<double>(50));
+  CALL_SUBTEST(test_scalar<float>(256));
+  CALL_SUBTEST(test_scalar<double>(256));
+  CALL_SUBTEST(test_scalar<float>(2 * 3 * 4 * 5 * 7));
+  CALL_SUBTEST(test_scalar<double>(2 * 3 * 4 * 5 * 7));
+
+#if defined EIGEN_HAS_FFTWL || defined EIGEN_POCKETFFT_DEFAULT
+  CALL_SUBTEST(test_complex<long double>(32));
+  CALL_SUBTEST(test_complex<long double>(256));
+  CALL_SUBTEST(test_complex<long double>(3 * 8));
+  CALL_SUBTEST(test_complex<long double>(5 * 32));
+  CALL_SUBTEST(test_complex<long double>(2 * 3 * 4));
+  CALL_SUBTEST(test_complex<long double>(2 * 3 * 4 * 5));
+  CALL_SUBTEST(test_complex<long double>(2 * 3 * 4 * 5 * 7));
+
+  CALL_SUBTEST(test_scalar<long double>(32));
+  CALL_SUBTEST(test_scalar<long double>(45));
+  CALL_SUBTEST(test_scalar<long double>(50));
+  CALL_SUBTEST(test_scalar<long double>(256));
+  CALL_SUBTEST(test_scalar<long double>(2 * 3 * 4 * 5 * 7));
+
+  CALL_SUBTEST((test_complex2d<long double, 2 * 3 * 4, 2 * 3 * 4>()));
+  CALL_SUBTEST((test_complex2d<long double, 3 * 4 * 5, 3 * 4 * 5>()));
+  CALL_SUBTEST((test_complex2d<long double, 24, 60>()));
+  CALL_SUBTEST((test_complex2d<long double, 60, 24>()));
+// Fails to build since Eigen limits the stack allocation size; 256x256 is too big here.
+// CALL_SUBTEST( ( test_complex2d<long double, 256, 256> () ) );
+#endif
+#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
+  CALL_SUBTEST((test_complex2d<float, 24, 24>()));
+  CALL_SUBTEST((test_complex2d<float, 60, 60>()));
+  CALL_SUBTEST((test_complex2d<float, 24, 60>()));
+  CALL_SUBTEST((test_complex2d<float, 60, 24>()));
+#endif
+#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_MKL_DEFAULT
+  CALL_SUBTEST((test_complex2d<double, 24, 24>()));
+  CALL_SUBTEST((test_complex2d<double, 60, 60>()));
+  CALL_SUBTEST((test_complex2d<double, 24, 60>()));
+  CALL_SUBTEST((test_complex2d<double, 60, 24>()));
+#endif
+}
diff --git a/unsupported/test/mklfft.cpp b/unsupported/test/mklfft.cpp
new file mode 100644
index 0000000..631dd20
--- /dev/null
+++ b/unsupported/test/mklfft.cpp
@@ -0,0 +1,2 @@
+#define EIGEN_MKL_DEFAULT 1
+#include "fft_test_shared.h"
diff --git a/unsupported/test/pocketfft.cpp b/unsupported/test/pocketfft.cpp
new file mode 100644
index 0000000..5e8a0b6
--- /dev/null
+++ b/unsupported/test/pocketfft.cpp
@@ -0,0 +1,2 @@
+#define EIGEN_POCKETFFT_DEFAULT 1
+#include "fft_test_shared.h"