Update Eigen to commit: e8515f78ac098329ab9f8cab21c87caede090a3f

CHANGELOG
=========
e8515f78a - Fix sparse triangular view iterator
6d829e766 - Fix extra semicolon in XprHelper
ba47341a1 - [SYCL-2020] Enabling half precision support for SYCL.
92a77a596 - Fix call to static functions from device by adding EIGEN_DEVICE_FUNC attribute to run methods
8f858a4ea - Export ThreadPool symbols from legacy header.
4e598ad25 - New panel modes for GEMM MMA (real & complex).
2c64a655f - Stage will not be ok if pardiso returned error

PiperOrigin-RevId: 571952024
Change-Id: Ic4ba7448c04e842e42dfb20a19f7cc146a00f66f
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index fc11174..faa3853 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,870 +1,870 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-//
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
-#define EIGEN_PACKET_MATH_FP16_AVX512_H
-
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
+#define EIGEN_PACKET_MATH_FP16_AVX512_H
+
 // IWYU pragma: private
-#include "../../InternalHeaderCheck.h"
-
-namespace Eigen {
-
-namespace internal {
-
-typedef __m512h Packet32h;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
-
-template <>
-struct is_arithmetic<Packet8h> {
-  enum { value = true };
-};
-
-template <>
-struct packet_traits<half> : default_packet_traits {
-  typedef Packet32h type;
-  typedef Packet16h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 32,
-
-    HasCmp = 1,
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasNegate = 1,
-    HasAbs = 1,
-    HasAbs2 = 0,
-    HasMin = 1,
-    HasMax = 1,
-    HasConj = 1,
-    HasSetLinear = 0,
-    HasLog = 1,
-    HasLog1p = 1,
-    HasExp = 1,
-    HasExpm1 = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    // These ones should be implemented in future
-    HasBessel = 0,
-    HasNdtri = 0,
-    HasSin = EIGEN_FAST_MATH,
-    HasCos = EIGEN_FAST_MATH,
-    HasTanh = EIGEN_FAST_MATH,
-    HasErf = 0,  // EIGEN_FAST_MATH,
-    HasBlend = 0,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
-  };
-};
-
-template <>
-struct unpacket_traits<Packet32h> {
-  typedef Eigen::half type;
-  typedef Packet16h half;
-  enum {
-    size = 32,
-    alignment = Aligned64,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-template <>
-struct unpacket_traits<Packet16h> {
-  typedef Eigen::half type;
-  typedef Packet8h half;
-  enum {
-    size = 16,
-    alignment = Aligned32,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-template <>
-struct unpacket_traits<Packet8h> {
-  typedef Eigen::half type;
-  typedef Packet8h half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
-// Memory functions
-
-// pset1
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
-  return _mm512_set1_ph(static_cast<_Float16>(from));
-}
-
-// pset1frombits
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
-  return _mm512_castsi512_ph(_mm512_set1_epi16(from));
-}
-
-// pfirst
-
-template <>
-EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  return half_impl::raw_uint16_to_half(
-      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
-#else
-  Eigen::half dest[32];
-  _mm512_storeu_ph(dest, from);
-  return dest[0];
-#endif
-}
-
-// pload
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
-}
-
-// ploadu
-
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
-}
-
-// pstore
-
-template <>
-EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
-}
-
-// pstoreu
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
-}
-
-// ploaddup
-template <>
-EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
-  __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
-  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
-                                                5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
-                               a);
-}
-
-// ploadquad
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
-  __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
-  return _mm512_permutexvar_ph(
-      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
-      a);
-}
-
-// pabs
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
-  return _mm512_abs_ph(a);
-}
-
-// psignbit
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
-  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
-}
-
-// pmin
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_min_ph(a, b);
-}
-
-// pmax
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_max_ph(a, b);
-}
-
-// plset
-template <>
-EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
-  return _mm512_add_ph(_mm512_set1_ph(a),
-                       _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
-                                     19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
-                                     7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
-}
-
-// por
-
-template <>
-EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pxor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pand
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pandnot
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
-  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
-}
-
-// pselect
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
-  return _mm512_mask_blend_ph(mask32, a, b);
-}
-
-// pcmp_eq
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_le
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt_or_nan
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
-  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
-}
-
-// padd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_add_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// psub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_sub_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pmul
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_mul_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pdiv
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
-  return _mm512_div_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pround
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
-  // Work-around for default std::round rounding mode.
-
-  // Mask for the sign bit
-  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
-  // The largest half-preicision float less than 0.5
-  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
-
-  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-
-// print
-
-template <>
-EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
-  return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
-}
-
-// pceil
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
-  return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
-}
-
-// pfloor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
-  return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
-}
-
-// predux
-template <>
-EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
-  return (half)_mm512_reduce_add_ph(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
-  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
-  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
-}
-
-// predux_half_dowto4
-template <>
-EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
-  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
-
-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#else
-  Eigen::half data[32];
-  _mm512_storeu_ph(data, a);
-
-  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
-  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
-
-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#endif
-}
-
-// predux_max
-
-// predux_min
-
-// predux_mul
-
-#ifdef EIGEN_VECTORIZE_FMA
-
-// pmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
-  return _mm512_fmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
-  return _mm512_fmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
-  return _mm512_fnmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
-  return _mm512_fnmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-#endif
-
-// pnegate
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
-  return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
-}
-
-// pconj
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
-  return a;
-}
-
-// psqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
-  return _mm512_sqrt_ph(a);
-}
-
-// prsqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
-  return _mm512_rsqrt_ph(a);
-}
-
-// preciprocal
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
-  return _mm512_rcp_ph(a);
-}
-
-// ptranspose
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
-  __m512i t[32];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 16; i++) {
-    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
-    t[2 * i + 1] =
-        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
-  }
-
-  __m512i p[32];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 8; i++) {
-    p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
-    p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
-    p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
-    p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
-  }
-
-  __m512i q[32];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 4; i++) {
-    q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
-    q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
-    q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
-    q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
-    q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
-    q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
-    q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
-    q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
-  }
-
-  __m512i f[32];
-
-#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \
-  do {                                                                                              \
-    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \
-    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
-    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
-    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
-    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
-    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
-    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
-    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
-  } while (false);
-
-  PACKET32H_TRANSPOSE_HELPER(0, 0);
-  PACKET32H_TRANSPOSE_HELPER(1, 1);
-  PACKET32H_TRANSPOSE_HELPER(2, 2);
-  PACKET32H_TRANSPOSE_HELPER(3, 3);
-
-  PACKET32H_TRANSPOSE_HELPER(1, 0);
-  PACKET32H_TRANSPOSE_HELPER(2, 0);
-  PACKET32H_TRANSPOSE_HELPER(3, 0);
-  PACKET32H_TRANSPOSE_HELPER(2, 1);
-  PACKET32H_TRANSPOSE_HELPER(3, 1);
-  PACKET32H_TRANSPOSE_HELPER(3, 2);
-
-  PACKET32H_TRANSPOSE_HELPER(0, 1);
-  PACKET32H_TRANSPOSE_HELPER(0, 2);
-  PACKET32H_TRANSPOSE_HELPER(0, 3);
-  PACKET32H_TRANSPOSE_HELPER(1, 2);
-  PACKET32H_TRANSPOSE_HELPER(1, 3);
-  PACKET32H_TRANSPOSE_HELPER(2, 3);
-
-#undef PACKET32H_TRANSPOSE_HELPER
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 32; i++) {
-    a.packet[i] = _mm512_castsi512_ph(f[i]);
-  }
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
-  __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
-  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
-  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
-  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
-  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
-
-  p0 = _mm512_unpacklo_epi32(t0, t2);
-  p1 = _mm512_unpackhi_epi32(t0, t2);
-  p2 = _mm512_unpacklo_epi32(t1, t3);
-  p3 = _mm512_unpackhi_epi32(t1, t3);
-
-  a0 = p0;
-  a1 = p1;
-  a2 = p2;
-  a3 = p3;
-
-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
-
-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
-
-  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
-
-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
-
-  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
-
-  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
-  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
-
-  a.packet[0] = _mm512_castsi512_ph(a0);
-  a.packet[1] = _mm512_castsi512_ph(a1);
-  a.packet[2] = _mm512_castsi512_ph(a2);
-  a.packet[3] = _mm512_castsi512_ph(a3);
-}
-
-// preverse
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
-  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
-                               a);
-}
-
-// pscatter
-
-template <>
-EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
-  EIGEN_ALIGN64 half aux[32];
-  pstore(aux, from);
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < 32; i++) {
-    to[stride * i] = aux[i];
-  }
-}
-
-// pgather
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
-  return _mm512_castsi512_ph(_mm512_set_epi16(
-      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
-      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
-      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
-      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
-      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
-      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
-      from[1 * stride].x, from[0 * stride].x));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
-
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
-  __m512d result = _mm512_undefined_pd();
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
-  return _mm512_castpd_ph(result);
-}
-
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
-  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
-  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
-}
-
-// psin
-template <>
-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = psin(low);
-  Packet16h highOut = psin(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pcos
-template <>
-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pcos(low);
-  Packet16h highOut = pcos(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog
-template <>
-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog(low);
-  Packet16h highOut = plog(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog2
-template <>
-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog2(low);
-  Packet16h highOut = plog2(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog1p
-template <>
-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog1p(low);
-  Packet16h highOut = plog1p(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pexp(low);
-  Packet16h highOut = pexp(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pexpm1
-template <>
-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pexpm1(low);
-  Packet16h highOut = pexpm1(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// ptanh
-template <>
-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = ptanh(low);
-  Packet16h highOut = ptanh(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pfrexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h exp1 = _mm256_undefined_si256();
-  Packet16h exp2 = _mm256_undefined_si256();
-
-  Packet16h lowOut = pfrexp(low, exp1);
-  Packet16h highOut = pfrexp(high, exp2);
-
-  exponent = combine2Packet16h(exp1, exp2);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pldexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h exp1;
-  Packet16h exp2;
-  extract2Packet16h(exponent, exp1, exp2);
-
-  Packet16h lowOut = pldexp(low, exp1);
-  Packet16h highOut = pldexp(high, exp2);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H
+#include "../../InternalHeaderCheck.h"

+

+namespace Eigen {

+

+namespace internal {

+

+typedef __m512h Packet32h;

+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;

+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;

+

+template <>

+struct is_arithmetic<Packet8h> {

+  enum { value = true };

+};

+

+template <>

+struct packet_traits<half> : default_packet_traits {

+  typedef Packet32h type;

+  typedef Packet16h half;

+  enum {

+    Vectorizable = 1,

+    AlignedOnScalar = 1,

+    size = 32,

+

+    HasCmp = 1,

+    HasAdd = 1,

+    HasSub = 1,

+    HasMul = 1,

+    HasDiv = 1,

+    HasNegate = 1,

+    HasAbs = 1,

+    HasAbs2 = 0,

+    HasMin = 1,

+    HasMax = 1,

+    HasConj = 1,

+    HasSetLinear = 0,

+    HasLog = 1,

+    HasLog1p = 1,

+    HasExp = 1,

+    HasExpm1 = 1,

+    HasSqrt = 1,

+    HasRsqrt = 1,

+    // These ones should be implemented in future

+    HasBessel = 0,

+    HasNdtri = 0,

+    HasSin = EIGEN_FAST_MATH,

+    HasCos = EIGEN_FAST_MATH,

+    HasTanh = EIGEN_FAST_MATH,

+    HasErf = 0,  // EIGEN_FAST_MATH,

+    HasBlend = 0,

+    HasRound = 1,

+    HasFloor = 1,

+    HasCeil = 1,

+    HasRint = 1

+  };

+};

+

+template <>

+struct unpacket_traits<Packet32h> {

+  typedef Eigen::half type;

+  typedef Packet16h half;

+  enum {

+    size = 32,

+    alignment = Aligned64,

+    vectorizable = true,

+    masked_load_available = false,

+    masked_store_available = false

+  };

+};

+

+template <>

+struct unpacket_traits<Packet16h> {

+  typedef Eigen::half type;

+  typedef Packet8h half;

+  enum {

+    size = 16,

+    alignment = Aligned32,

+    vectorizable = true,

+    masked_load_available = false,

+    masked_store_available = false

+  };

+};

+

+template <>

+struct unpacket_traits<Packet8h> {

+  typedef Eigen::half type;

+  typedef Packet8h half;

+  enum {

+    size = 8,

+    alignment = Aligned16,

+    vectorizable = true,

+    masked_load_available = false,

+    masked_store_available = false

+  };

+};

+

+// Memory functions

+

+// pset1

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {

+  return _mm512_set1_ph(static_cast<_Float16>(from));

+}

+

+// pset1frombits

+template <>

+EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {

+  return _mm512_castsi512_ph(_mm512_set1_epi16(from));

+}

+

+// pfirst

+

+template <>

+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {

+#ifdef EIGEN_VECTORIZE_AVX512DQ

+  return half_impl::raw_uint16_to_half(

+      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));

+#else

+  Eigen::half dest[32];

+  _mm512_storeu_ph(dest, from);

+  return dest[0];

+#endif

+}

+

+// pload

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {

+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);

+}

+

+// ploadu

+

+template <>

+EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {

+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);

+}

+

+// pstore

+

+template <>

+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {

+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);

+}

+

+// pstoreu

+

+template <>

+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {

+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);

+}

+

+// ploaddup

+template <>

+EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {

+  __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));

+  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,

+                                                5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),

+                               a);

+}

+

+// ploadquad

+template <>

+EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {

+  __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));

+  return _mm512_permutexvar_ph(

+      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),

+      a);

+}

+

+// pabs

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {

+  return _mm512_abs_ph(a);

+}

+

+// psignbit

+

+template <>

+EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {

+  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));

+}

+

+// pmin

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {

+  return _mm512_min_ph(a, b);

+}

+

+// pmax

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {

+  return _mm512_max_ph(a, b);

+}

+

+// plset

+template <>

+EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {

+  return _mm512_add_ph(_mm512_set1_ph(a),

+                       _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,

+                                     19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,

+                                     7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));

+}

+

+// por

+

+template <>

+EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {

+  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));

+}

+

+// pxor

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {

+  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));

+}

+

+// pand

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {

+  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));

+}

+

+// pandnot

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {

+  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));

+}

+

+// pselect

+

+template <>

+EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {

+  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);

+  return _mm512_mask_blend_ph(mask32, a, b);

+}

+

+// pcmp_eq

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {

+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);

+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));

+}

+

+// pcmp_le

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {

+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);

+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));

+}

+

+// pcmp_lt

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {

+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);

+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));

+}

+

+// pcmp_lt_or_nan

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {

+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);

+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));

+}

+

+// padd

+

+template <>

+EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {

+  return _mm512_add_ph(a, b);

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {

+  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {

+  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));

+}

+

+// psub

+

+template <>

+EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {

+  return _mm512_sub_ph(a, b);

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {

+  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {

+  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));

+}

+

+// pmul

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {

+  return _mm512_mul_ph(a, b);

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {

+  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {

+  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));

+}

+

+// pdiv

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {

+  return _mm512_div_ph(a, b);

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {

+  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {

+  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));

+}

+

+// pround

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {

+  // Work-around for default std::round rounding mode.

+

+  // Mask for the sign bit

+  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));

+  // The largest half-preicision float less than 0.5

+  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));

+

+  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);

+}

+

+// print

+

+template <>

+EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {

+  return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);

+}

+

+// pceil

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {

+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);

+}

+

+// pfloor

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {

+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);

+}

+

+// predux

+template <>

+EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {

+  return (half)_mm512_reduce_add_ph(a);

+}

+

+template <>

+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {

+  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));

+}

+

+template <>

+EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {

+  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));

+}

+

+// predux_half_dowto4

+template <>

+EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {

+#ifdef EIGEN_VECTORIZE_AVX512DQ

+  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));

+  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));

+

+  return Packet16h(padd<Packet16h>(lowHalf, highHalf));

+#else

+  Eigen::half data[32];

+  _mm512_storeu_ph(data, a);

+

+  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));

+  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));

+

+  return Packet16h(padd<Packet16h>(lowHalf, highHalf));

+#endif

+}

+

+// predux_max

+

+// predux_min

+

+// predux_mul

+

+#ifdef EIGEN_VECTORIZE_FMA

+

+// pmadd

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {

+  return _mm512_fmadd_ph(a, b, c);

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {

+  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {

+  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));

+}

+

+// pmsub

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {

+  return _mm512_fmsub_ph(a, b, c);

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {

+  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {

+  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));

+}

+

+// pnmadd

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {

+  return _mm512_fnmadd_ph(a, b, c);

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {

+  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {

+  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));

+}

+

+// pnmsub

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {

+  return _mm512_fnmsub_ph(a, b, c);

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {

+  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {

+  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));

+}

+

+#endif

+

+// pnegate

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {

+  return _mm512_sub_ph(_mm512_set1_ph(0.0), a);

+}

+

+// pconj

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {

+  return a;

+}

+

+// psqrt

+

+template <>

+EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {

+  return _mm512_sqrt_ph(a);

+}

+

+// prsqrt

+

+template <>

+EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {

+  return _mm512_rsqrt_ph(a);

+}

+

+// preciprocal

+

+template <>

+EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {

+  return _mm512_rcp_ph(a);

+}

+

+// ptranspose

+

+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {

+  __m512i t[32];

+

+  EIGEN_UNROLL_LOOP

+  for (int i = 0; i < 16; i++) {

+    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));

+    t[2 * i + 1] =

+        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));

+  }

+

+  __m512i p[32];

+

+  EIGEN_UNROLL_LOOP

+  for (int i = 0; i < 8; i++) {

+    p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);

+    p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);

+    p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);

+    p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);

+  }

+

+  __m512i q[32];

+

+  EIGEN_UNROLL_LOOP

+  for (int i = 0; i < 4; i++) {

+    q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);

+    q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);

+    q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);

+    q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);

+    q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);

+    q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);

+    q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);

+    q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);

+  }

+

+  __m512i f[32];

+

+#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \

+  do {                                                                                              \

+    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \

+    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \

+    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \

+    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \

+    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \

+    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \

+    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \

+    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \

+  } while (false);

+

+  PACKET32H_TRANSPOSE_HELPER(0, 0);

+  PACKET32H_TRANSPOSE_HELPER(1, 1);

+  PACKET32H_TRANSPOSE_HELPER(2, 2);

+  PACKET32H_TRANSPOSE_HELPER(3, 3);

+

+  PACKET32H_TRANSPOSE_HELPER(1, 0);

+  PACKET32H_TRANSPOSE_HELPER(2, 0);

+  PACKET32H_TRANSPOSE_HELPER(3, 0);

+  PACKET32H_TRANSPOSE_HELPER(2, 1);

+  PACKET32H_TRANSPOSE_HELPER(3, 1);

+  PACKET32H_TRANSPOSE_HELPER(3, 2);

+

+  PACKET32H_TRANSPOSE_HELPER(0, 1);

+  PACKET32H_TRANSPOSE_HELPER(0, 2);

+  PACKET32H_TRANSPOSE_HELPER(0, 3);

+  PACKET32H_TRANSPOSE_HELPER(1, 2);

+  PACKET32H_TRANSPOSE_HELPER(1, 3);

+  PACKET32H_TRANSPOSE_HELPER(2, 3);

+

+#undef PACKET32H_TRANSPOSE_HELPER

+

+  EIGEN_UNROLL_LOOP

+  for (int i = 0; i < 32; i++) {

+    a.packet[i] = _mm512_castsi512_ph(f[i]);

+  }

+}

+

+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {

+  __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;

+  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));

+  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));

+  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));

+  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));

+

+  p0 = _mm512_unpacklo_epi32(t0, t2);

+  p1 = _mm512_unpackhi_epi32(t0, t2);

+  p2 = _mm512_unpacklo_epi32(t1, t3);

+  p3 = _mm512_unpackhi_epi32(t1, t3);

+

+  a0 = p0;

+  a1 = p1;

+  a2 = p2;

+  a3 = p3;

+

+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);

+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);

+

+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);

+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);

+

+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);

+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);

+

+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);

+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);

+

+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);

+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);

+

+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);

+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);

+

+  a.packet[0] = _mm512_castsi512_ph(a0);

+  a.packet[1] = _mm512_castsi512_ph(a1);

+  a.packet[2] = _mm512_castsi512_ph(a2);

+  a.packet[3] = _mm512_castsi512_ph(a3);

+}

+

+// preverse

+

+template <>

+EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {

+  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,

+                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),

+                               a);

+}

+

+// pscatter

+

+template <>

+EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {

+  EIGEN_ALIGN64 half aux[32];

+  pstore(aux, from);

+

+  EIGEN_UNROLL_LOOP

+  for (int i = 0; i < 32; i++) {

+    to[stride * i] = aux[i];

+  }

+}

+

+// pgather

+

+template <>

+EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {

+  return _mm512_castsi512_ph(_mm512_set_epi16(

+      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,

+      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,

+      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,

+      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,

+      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,

+      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,

+      from[1 * stride].x, from[0 * stride].x));

+}

+

+template <>

+EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);

+template <>

+EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);

+

+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {

+  __m512d result = _mm512_undefined_pd();

+  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);

+  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);

+  return _mm512_castpd_ph(result);

+}

+

+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {

+  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));

+  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));

+}

+

+// psin

+template <>

+EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = psin(low);

+  Packet16h highOut = psin(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pcos

+template <>

+EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = pcos(low);

+  Packet16h highOut = pcos(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// plog

+template <>

+EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = plog(low);

+  Packet16h highOut = plog(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// plog2

+template <>

+EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = plog2(low);

+  Packet16h highOut = plog2(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// plog1p

+template <>

+EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = plog1p(low);

+  Packet16h highOut = plog1p(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pexp

+template <>

+EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = pexp(low);

+  Packet16h highOut = pexp(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pexpm1

+template <>

+EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = pexpm1(low);

+  Packet16h highOut = pexpm1(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// ptanh

+template <>

+EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h lowOut = ptanh(low);

+  Packet16h highOut = ptanh(high);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pfrexp

+template <>

+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h exp1 = _mm256_undefined_si256();

+  Packet16h exp2 = _mm256_undefined_si256();

+

+  Packet16h lowOut = pfrexp(low, exp1);

+  Packet16h highOut = pfrexp(high, exp2);

+

+  exponent = combine2Packet16h(exp1, exp2);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+// pldexp

+template <>

+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {

+  Packet16h low;

+  Packet16h high;

+  extract2Packet16h(a, low, high);

+

+  Packet16h exp1;

+  Packet16h exp2;

+  extract2Packet16h(exponent, exp1, exp2);

+

+  Packet16h lowOut = pldexp(low, exp1);

+  Packet16h highOut = pldexp(high, exp2);

+

+  return combine2Packet16h(lowOut, highOut);

+}

+

+}  // end namespace internal

+}  // end namespace Eigen

+

+#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H

diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index b232a8f..e9a9307 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -49,11 +49,6 @@
   #include "MatrixProductMMA.h"
 #endif
 
-/**************************************************************************************************
- * TODO                                                                                           *
- * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could). *
- * - Check the possibility of transposing as GETREAL and GETIMAG when needed.                     *
- **************************************************************************************************/
 // IWYU pragma: private
 #include "../../InternalHeaderCheck.h"
 
@@ -120,6 +115,16 @@
                                             20, 21, 22, 23,
                                             28, 29, 30, 31};
 
+const static Packet16uc p16uc_GETREAL32b = {  0,  1,  2,  3,
+                                             16, 17, 18, 19,
+                                              8,  9, 10, 11,
+                                             24, 25, 26, 27};
+
+const static Packet16uc p16uc_GETIMAG32b = {  4,  5,  6,  7,
+                                             20, 21, 22, 23,
+                                             12, 13, 14, 15,
+                                             28, 29, 30, 31};
+
 /*********************************************
  * Single precision real and complex packing *
  * *******************************************/
@@ -440,6 +445,78 @@
 // General template for lhs & rhs complex packing.
 template<typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
 struct dhs_cpack {
+  template<bool transpose>
+  EIGEN_ALWAYS_INLINE void dhs_cblock(PacketBlock<PacketC,8>& cblock, PacketBlock<Packet,4>& block, Packet16uc permute)
+  {
+    if (transpose) {
+      block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, permute);
+      block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, permute);
+      block.packet[2] = vec_perm(cblock.packet[4].v, cblock.packet[5].v, permute);
+      block.packet[3] = vec_perm(cblock.packet[6].v, cblock.packet[7].v, permute);
+
+      Packet4f t0, t1, t2, t3;
+#ifdef EIGEN_VECTORIZE_VSX
+      t0 = reinterpret_cast<Packet>(vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+      t1 = reinterpret_cast<Packet>(vec_mergel(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+      t2 = reinterpret_cast<Packet>(vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+      t3 = reinterpret_cast<Packet>(vec_mergel(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+#else
+      t0 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_HI));
+      t1 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_LO));
+      t2 = reinterpret_cast<Packet>(vec_perm(block.packet[2], block.packet[3], p16uc_TRANSPOSE64_HI));
+      t3 = reinterpret_cast<Packet>(vec_perm(block.packet[2], block.packet[3], p16uc_TRANSPOSE64_LO));
+#endif
+
+      block.packet[0] = t0;
+      block.packet[1] = t1;
+      block.packet[2] = t2;
+      block.packet[3] = t3;
+    } else {
+      block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, permute);
+      block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, permute);
+      block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, permute);
+      block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, permute);
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(Scalar* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
+  {
+    PacketBlock<Packet,4> blockr, blocki;
+    PacketBlock<PacketC,8> cblock;
+
+    for(; i + vectorSize <= depth; i+=vectorSize)
+    {
+      if (UseLhs) {
+        bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
+      } else {
+        bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
+      }
+
+      if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
+      {
+        dhs_cblock<true>(cblock, blockr, p16uc_GETREAL32b);
+        dhs_cblock<true>(cblock, blocki, p16uc_GETIMAG32b);
+      } else {
+        dhs_cblock<false>(cblock, blockr, p16uc_GETREAL32);
+        dhs_cblock<false>(cblock, blocki, p16uc_GETIMAG32);
+      }
+
+      if(Conjugate)
+      {
+        blocki.packet[0] = -blocki.packet[0];
+        blocki.packet[1] = -blocki.packet[1];
+        blocki.packet[2] = -blocki.packet[2];
+        blocki.packet[3] = -blocki.packet[3];
+      }
+
+      storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
+      storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
+
+      rir += 4*vectorSize;
+      rii += 4*vectorSize;
+    }
+  }
+
   EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
     const Index vectorSize = quad_traits<Scalar>::vectorsize;
@@ -455,47 +532,8 @@
 
       rii = rir + vectorDelta;
 
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<Packet,4> blockr, blocki;
-        PacketBlock<PacketC,8> cblock;
+      dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
 
-        if (UseLhs) {
-          bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
-        } else {
-          bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
-        }
-
-        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
-        blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
-        blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
-        blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
-
-        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
-        blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
-        blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
-        blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
-
-        if(Conjugate)
-        {
-          blocki.packet[0] = -blocki.packet[0];
-          blocki.packet[1] = -blocki.packet[1];
-          blocki.packet[2] = -blocki.packet[2];
-          blocki.packet[3] = -blocki.packet[3];
-        }
-
-        if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
-        {
-          ptranspose(blockr);
-          ptranspose(blocki);
-        }
-
-        storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
-        storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
-
-        rir += 4*vectorSize;
-        rii += 4*vectorSize;
-      }
       for(; i < depth; i++)
       {
         PacketBlock<Packet,1> blockr, blocki;
@@ -592,6 +630,36 @@
 // General template for lhs & rhs packing.
 template<typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
 struct dhs_pack{
+  template<Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(Scalar* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
+  {
+    PacketBlock<Packet,4> block[n];
+
+    for(; i + n*vectorSize <= depth; i+=n*vectorSize)
+    {
+      for (Index k = 0; k < n; k++) {
+        if (UseLhs) {
+          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, 0, i + k*vectorSize);
+        } else {
+          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, i + k*vectorSize, 0);
+        }
+      }
+
+      if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
+      {
+        for (Index k = 0; k < n; k++) {
+          ptranspose(block[k]);
+        }
+      }
+
+      for (Index k = 0; k < n; k++) {
+        storeBlock<Scalar, Packet, 4>(blockA + ri + k*4*vectorSize, block[k]);
+      }
+
+      ri += n*4*vectorSize;
+    }
+  }
+
   EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
     const Index vectorSize = quad_traits<Scalar>::vectorsize;
@@ -604,24 +672,10 @@
 
       if(PanelMode) ri += vectorSize*offset;
 
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<Packet,4> block;
+      dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
+      dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
+      dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
 
-        if (UseLhs) {
-          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block, lhs2, 0, i);
-        } else {
-          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block, lhs2, i, 0);
-        }
-        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
-        {
-          ptranspose(block);
-        }
-
-        storeBlock<Scalar, Packet, 4>(blockA + ri, block);
-
-        ri += 4*vectorSize;
-      }
       for(; i < depth; i++)
       {
         if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
@@ -691,6 +745,39 @@
 template<typename DataMapper, int StorageOrder, bool PanelMode>
 struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true>
 {
+  template<Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(double* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
+  {
+    PacketBlock<Packet2d,2> block[n];
+
+    for(; i + n*vectorSize <= depth; i+=n*vectorSize)
+    {
+      for (Index k = 0; k < n; k++) {
+        if(StorageOrder == RowMajor)
+        {
+          block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize);
+          block[k].packet[1] = lhs2.template loadPacket<Packet2d>(1, i + k*vectorSize);
+        } else {
+          block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize + 0);
+          block[k].packet[1] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize + 1);
+        }
+      }
+
+      if(StorageOrder == RowMajor)
+      {
+        for (Index k = 0; k < n; k++) {
+          ptranspose(block[k]);
+        }
+      }
+
+      for (Index k = 0; k < n; k++) {
+        storeBlock<double, Packet2d, 2>(blockA + ri + k*2*vectorSize, block[k]);
+      }
+
+      ri += n*2*vectorSize;
+    }
+  }
+
   EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
     const Index vectorSize = quad_traits<double>::vectorsize;
@@ -703,24 +790,10 @@
 
       if(PanelMode) ri += vectorSize*offset;
 
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<Packet2d,2> block;
-        if(StorageOrder == RowMajor)
-        {
-          block.packet[0] = lhs2.template loadPacket<Packet2d>(0, i);
-          block.packet[1] = lhs2.template loadPacket<Packet2d>(1, i);
+      dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
+      dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
+      dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
 
-          ptranspose(block);
-        } else {
-          block.packet[0] = lhs2.template loadPacket<Packet2d>(0, i + 0);
-          block.packet[1] = lhs2.template loadPacket<Packet2d>(0, i + 1);
-        }
-
-        storeBlock<double, Packet2d, 2>(blockA + ri, block);
-
-        ri += 2*vectorSize;
-      }
       for(; i < depth; i++)
       {
         if(StorageOrder == RowMajor)
@@ -759,6 +832,53 @@
 template<typename DataMapper, int StorageOrder, bool PanelMode>
 struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false>
 {
+  template<Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(double* blockB, const DataMapper& rhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
+  {
+    PacketBlock<Packet2d,2> block1[n], block2[n];
+    PacketBlock<Packet2d,4> block3[n];
+
+    for(; i + n*vectorSize <= depth; i+=n*vectorSize)
+    {
+      for (Index k = 0; k < n; k++) {
+        if(StorageOrder == ColMajor)
+        {
+          block1[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 0);
+          block1[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 1);
+          block2[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 2);
+          block2[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 3);
+        } else {
+          block3[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 0, 0); //[a1 a2]
+          block3[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 0, 2); //[a3 a4]
+          block3[k].packet[2] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 1, 0); //[b1 b2]
+          block3[k].packet[3] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 1, 2); //[b3 b4]
+        }
+      }
+
+      if(StorageOrder == ColMajor)
+      {
+        for (Index k = 0; k < n; k++) {
+          ptranspose(block1[k]);
+          ptranspose(block2[k]);
+        }
+      }
+
+      for (Index k = 0; k < n; k++) {
+        if(StorageOrder == ColMajor)
+        {
+          pstore<double>(blockB + ri + k*4*vectorSize    , block1[k].packet[0]);
+          pstore<double>(blockB + ri + k*4*vectorSize + 2, block2[k].packet[0]);
+          pstore<double>(blockB + ri + k*4*vectorSize + 4, block1[k].packet[1]);
+          pstore<double>(blockB + ri + k*4*vectorSize + 6, block2[k].packet[1]);
+        } else {
+          storeBlock<double, Packet2d, 4>(blockB + ri + k*4*vectorSize, block3[k]);
+        }
+      }
+
+      ri += n*4*vectorSize;
+    }
+  }
+
   EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
   {
     const Index vectorSize = quad_traits<double>::vectorsize;
@@ -771,35 +891,10 @@
 
       if(PanelMode) ri += offset*(2*vectorSize);
 
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<Packet2d,4> block;
-        if(StorageOrder == ColMajor)
-        {
-          PacketBlock<Packet2d,2> block1, block2;
-          block1.packet[0] = rhs2.template loadPacket<Packet2d>(i, 0);
-          block1.packet[1] = rhs2.template loadPacket<Packet2d>(i, 1);
-          block2.packet[0] = rhs2.template loadPacket<Packet2d>(i, 2);
-          block2.packet[1] = rhs2.template loadPacket<Packet2d>(i, 3);
+      dhs_copy<4>(blockB, rhs2, i, ri, depth, vectorSize);
+      dhs_copy<2>(blockB, rhs2, i, ri, depth, vectorSize);
+      dhs_copy<1>(blockB, rhs2, i, ri, depth, vectorSize);
 
-          ptranspose(block1);
-          ptranspose(block2);
-
-          pstore<double>(blockB + ri    , block1.packet[0]);
-          pstore<double>(blockB + ri + 2, block2.packet[0]);
-          pstore<double>(blockB + ri + 4, block1.packet[1]);
-          pstore<double>(blockB + ri + 6, block2.packet[1]);
-        } else {
-          block.packet[0] = rhs2.template loadPacket<Packet2d>(i + 0, 0); //[a1 a2]
-          block.packet[1] = rhs2.template loadPacket<Packet2d>(i + 0, 2); //[a3 a4]
-          block.packet[2] = rhs2.template loadPacket<Packet2d>(i + 1, 0); //[b1 b2]
-          block.packet[3] = rhs2.template loadPacket<Packet2d>(i + 1, 2); //[b3 b4]
-
-          storeBlock<double, Packet2d, 4>(blockB + ri, block);
-        }
-
-        ri += 4*vectorSize;
-      }
       for(; i < depth; i++)
       {
         if(StorageOrder == ColMajor)
@@ -1296,6 +1391,54 @@
 template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
 struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
 {
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
+  {
+    PacketBlock<Packet,2> blockr, blocki;
+    PacketBlock<PacketC,4> cblock;
+
+    for(; i + vectorSize <= depth; i+=vectorSize)
+    {
+      if(StorageOrder == ColMajor)
+      {
+        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
+        cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
+
+        cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
+        cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
+
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
+        blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
+
+        blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
+        blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
+      } else {
+        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
+        cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
+
+        cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
+        cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
+
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
+        blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
+
+        blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+        blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
+      }
+
+      if(Conjugate)
+      {
+        blocki.packet[0] = -blocki.packet[0];
+        blocki.packet[1] = -blocki.packet[1];
+      }
+
+      storeBlock<double, Packet, 2>(blockAt + rir, blockr);
+      storeBlock<double, Packet, 2>(blockAt + rii, blocki);
+
+      rir += 2*vectorSize;
+      rii += 2*vectorSize;
+    }
+  }
+
   EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
   {
     const Index vectorSize = quad_traits<double>::vectorsize;
@@ -1311,50 +1454,8 @@
 
       rii = rir + vectorDelta;
 
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        PacketBlock<Packet,2> blockr, blocki;
-        PacketBlock<PacketC,4> cblock;
+      dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
 
-        if(StorageOrder == ColMajor)
-        {
-          cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
-          cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
-
-          cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
-          cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
-
-          blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
-          blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
-
-          blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
-          blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
-        } else {
-          cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
-          cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
-
-          cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
-          cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i
-
-          blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
-          blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
-
-          blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
-          blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
-        }
-
-        if(Conjugate)
-        {
-          blocki.packet[0] = -blocki.packet[0];
-          blocki.packet[1] = -blocki.packet[1];
-        }
-
-        storeBlock<double, Packet, 2>(blockAt + rir, blockr);
-        storeBlock<double, Packet, 2>(blockAt + rii, blocki);
-
-        rir += 2*vectorSize;
-        rii += 2*vectorSize;
-      }
       for(; i < depth; i++)
       {
         PacketBlock<Packet,1> blockr, blocki;
@@ -1410,6 +1511,35 @@
 template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
 struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
 {
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockBt, const DataMapper& rhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
+  {
+    for(; i < depth; i++)
+    {
+      PacketBlock<PacketC,4> cblock;
+      PacketBlock<Packet,2> blockr, blocki;
+
+      bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
+
+      blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
+      blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);
+
+      blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+      blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
+
+      if(Conjugate)
+      {
+        blocki.packet[0] = -blocki.packet[0];
+        blocki.packet[1] = -blocki.packet[1];
+      }
+
+      storeBlock<double, Packet, 2>(blockBt + rir, blockr);
+      storeBlock<double, Packet, 2>(blockBt + rii, blocki);
+
+      rir += 2*vectorSize;
+      rii += 2*vectorSize;
+    }
+  }
+
   EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
   {
     const Index vectorSize = quad_traits<double>::vectorsize;
@@ -1425,31 +1555,7 @@
 
       rii = rir + vectorDelta;
 
-      for(; i < depth; i++)
-      {
-        PacketBlock<PacketC,4> cblock;
-        PacketBlock<Packet,2> blockr, blocki;
-
-        bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
-
-        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
-        blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);
-
-        blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
-        blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
-
-        if(Conjugate)
-        {
-          blocki.packet[0] = -blocki.packet[0];
-          blocki.packet[1] = -blocki.packet[1];
-        }
-
-        storeBlock<double, Packet, 2>(blockBt + rir, blockr);
-        storeBlock<double, Packet, 2>(blockBt + rii, blocki);
-
-        rir += 2*vectorSize;
-        rii += 2*vectorSize;
-      }
+      dhs_ccopy(blockBt, rhs2, i, rir, rii, depth, vectorSize);
 
       rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
     }
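
Editor's note: the recurring refactor in MatrixProduct.h replaces a single `for(; i + vectorSize <= depth; ...)` packing loop with a helper templated on an unroll factor `n`, invoked as `dhs_copy<4>`, then `dhs_copy<2>`, then `dhs_copy<1>`, so progressively smaller strides drain whatever depth remains before the scalar tail loop. A reduced sketch of that control-flow idea follows; the names and the float element type are hypothetical, not the kernel's actual types.

// --- illustrative sketch (not part of the diff) ---
#include <cstdio>

// Process one "vector" of kWidth elements; stands in for a bload + storeBlock pair.
constexpr int kWidth = 4;
static void packOne(const float* src, float* dst) {
  for (int j = 0; j < kWidth; ++j) dst[j] = src[j];
}

// Unrolled copy: consumes n vectors per trip, like dhs_copy<n> in the patch.
template <int n>
static void copyN(const float* src, float* dst, int& i, int& ri, int depth) {
  for (; i + n * kWidth <= depth; i += n * kWidth) {
    for (int k = 0; k < n; ++k)  // n is a compile-time constant, so this is typically unrolled
      packOne(src + i + k * kWidth, dst + ri + k * kWidth);
    ri += n * kWidth;
  }
}

int main() {
  float src[22], dst[22];
  for (int i = 0; i < 22; ++i) src[i] = float(i);
  int i = 0, ri = 0;
  copyN<4>(src, dst, i, ri, 22);  // one 4x-unrolled trip covers 16 elements
  copyN<2>(src, dst, i, ri, 22);  // no full 2x trip remains here
  copyN<1>(src, dst, i, ri, 22);  // one more single vector (4 elements)
  for (; i < 22; ++i) dst[ri++] = src[i];  // scalar remainder, as in the patch
  std::printf("packed %d elements\n", ri);
}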
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index e83cba5..72e8c31 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -42,19 +42,13 @@
   __builtin_mma_xxsetaccz(acc);
 }
 
-#ifdef USE_PARTIAL_PACKETS
 template<typename DataMapper, typename Packet, bool full>
 EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements, __vector_quad* acc)
-#else
-template<typename DataMapper, typename Packet, const Index accCols, const Index accCols2>
-EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc)
-#endif
 {
   PacketBlock<Packet, 4> result;
   __builtin_mma_disassemble_acc(&result.packet, acc);
 
   PacketBlock<Packet, 4> tRes;
-#ifdef USE_PARTIAL_PACKETS
   if (full) {
     EIGEN_UNUSED_VARIABLE(elements);
     bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
@@ -65,11 +59,6 @@
     bscale<Packet, 4>(tRes, result, alpha);
     bstore_partial<DataMapper, Packet, 4>(tRes, data, i, elements);
   }
-#else
-  bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
-  bscale<Packet, 4, (accCols != accCols2)>(tRes, result, alpha, pMask);
-  bstore<DataMapper, Packet, 4>(tRes, data, i);
-#endif
 }
 
 template<typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
@@ -166,78 +155,118 @@
   ploadRhsMMA(lhs, lhsV);
 }
 
-#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+#define GEMM_MULTIPLE_COLS
+
+// Disable in GCC until unnecessary register moves are fixed
+//#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+#if EIGEN_COMP_LLVM
 #define VECTOR_PAIR_LOADS_LHS
 #endif
 
 // PEEL_MMA loop factor.
+#ifdef GEMM_MULTIPLE_COLS
+#define PEEL_MMA 8
+#else
+// Register spillage with GCC12+
+#if EIGEN_COMP_LLVM || (__GNUC__ < 12) || defined(VECTOR_PAIR_LOADS_LHS)
 #define PEEL_MMA 7
+#else
+#define PEEL_MMA 6
+#endif
+#endif
 
 #define MICRO_MMA_UNROLL(func) \
   func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
 
 #define MICRO_MMA_WORK(func, type, peel) \
-  func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
-  func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel)
+  if (accItr == 1) { \
+    func(0,type,peel,0,0) func(1,type,peel,1,0) func(2,type,peel,2,0) func(3,type,peel,3,0) \
+    func(4,type,peel,4,0) func(5,type,peel,5,0) func(6,type,peel,6,0) func(7,type,peel,7,0) \
+  } else if (accItr == 2) { \
+    func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,1,0) func(3,type,peel,1,1) \
+    func(4,type,peel,2,0) func(5,type,peel,2,1) func(6,type,peel,3,0) func(7,type,peel,3,1) \
+  } else { \
+    func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,0,2) func(3,type,peel,0,3) \
+    func(4,type,peel,1,0) func(5,type,peel,1,1) func(6,type,peel,1,2) func(7,type,peel,1,3) \
+  }
 
-#define MICRO_MMA_WORK_ONE(iter, type, peel) \
-  if (unroll_factor > iter) { \
-    pgerMMA<Packet, type, false>(&accZero##iter, rhsV[peel], lhsV##iter); \
+#define MICRO_MMA_WORK_ONE(iter, type, peel, left, right) \
+  if (unroll_factor > left) { \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV##left); \
   }
 
 #ifdef VECTOR_PAIR_LOADS_LHS
-#define MICRO_MMA_WORK_TWO(iter, type, peel) \
-  if (unroll_factor > iter) { \
-    pgerMMA<Packet, type, false>(&accZero##iter, rhsV[peel], lhsV2##iter.packet[peel & 1]); \
+#define MICRO_MMA_WORK_TWO(iter, type, peel, left, right) \
+  if (unroll_factor > left) { \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV2##left.packet[peel & 1]); \
   }
 
-#define MICRO_MMA_LOAD1_TWO(lhs_ptr, iter) \
-  if (unroll_factor > iter) { \
-    if (MICRO_NORMAL(iter)) { \
-      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##iter), plhsV##iter); \
-      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##iter.packet), &plhsV##iter); \
-      lhs_ptr##iter += accCols*2; \
+#define MICRO_MMA_LOAD1_TWO(lhs_ptr, left) \
+  if (unroll_factor > left) { \
+    if (MICRO_NORMAL(left)) { \
+      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##left), plhsV##left); \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##left.packet), &plhsV##left); \
+      lhs_ptr##left += accCols*2; \
     } else { \
-      lhsV2##iter.packet[0] = ploadLhs<Packet>(lhs_ptr##iter); \
-      lhsV2##iter.packet[1] = ploadLhs<Packet>(lhs_ptr##iter + accCols2); \
-      lhs_ptr##iter += accCols2*2; \
-      EIGEN_UNUSED_VARIABLE(plhsV##iter) \
+      lhsV2##left.packet[0] = ploadLhs<Packet>(lhs_ptr##left); \
+      lhsV2##left.packet[1] = ploadLhs<Packet>(lhs_ptr##left + accCols2); \
+      lhs_ptr##left += accCols2*2; \
+      EIGEN_UNUSED_VARIABLE(plhsV##left); \
     } \
   } else { \
-    EIGEN_UNUSED_VARIABLE(lhsV2##iter); \
-    EIGEN_UNUSED_VARIABLE(plhsV##iter) \
+    EIGEN_UNUSED_VARIABLE(lhsV2##left); \
+    EIGEN_UNUSED_VARIABLE(plhsV##left); \
   }
 
-#define MICRO_MMA_LOAD_TWO(iter) MICRO_MMA_LOAD1_TWO(lhs_ptr, iter)
+#define MICRO_MMA_LOAD_TWO(left) MICRO_MMA_LOAD1_TWO(lhs_ptr, left)
 #endif
 
+#define MICRO_MMA_UNROLL_ITER(func, val) \
+  func(val,0) \
+  if (accItr > 1) { \
+    func(val,1) \
+    if (accItr > 2) { \
+      func(val,2) \
+      func(val,3) \
+    } \
+  }
+
+#define MICRO_MMA_LOAD_ONE_RHS1(peel, right) \
+  ploadRhsMMA(rhs_ptr##right + (accRows * peel), rhsV##right[peel]);
+
+#define MICRO_MMA_LOAD_ONE_RHS(peel) \
+  MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_ONE_RHS1, peel)
+
 #define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
   if (PEEL_MMA > peel) { \
     Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
-    ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV[peel]); \
+    MICRO_MMA_LOAD_ONE_RHS(peel) \
     MICRO_MMA_UNROLL(funcl) \
     MICRO_MMA_WORK(funcw, type, peel) \
   }
 
 #ifndef VECTOR_PAIR_LOADS_LHS
 #define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
-  type rhsV[8]; \
+  type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
   MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \
   MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \
   MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \
   MICRO_MMA_TYPE_PEEL(funcw,funcl,type,6) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,7)
 #else
+#define MICRO_MMA_LOAD_TWO_RHS(peel1, right) \
+  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr##right + (accRows * peel1)), prhsV##peel1); \
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1);
+
 #define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
   if (PEEL_MMA > peel2) { \
     PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
     __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
     if (sizeof(type) == 16) { \
-      ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr + (accRows * peel1)), prhsV##peel1); \
-      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV[peel1]), &prhsV##peel1); \
+      MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_TWO_RHS, peel1) \
     } else { \
       EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
-      ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV[peel1]); \
-      ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV[peel2]); \
+      MICRO_MMA_LOAD_ONE_RHS(peel1) \
+      MICRO_MMA_LOAD_ONE_RHS(peel2) \
     } \
     MICRO_MMA_UNROLL(funcl2) \
     MICRO_MMA_WORK(funcw2, type, peel1) \
@@ -248,7 +277,7 @@
   }
 
 #define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
-  type rhsV[8]; \
+  type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
   __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \
   MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
   MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \
@@ -257,19 +286,25 @@
 #endif
 
 #define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
-  type rhsV[1]; \
+  type rhsV0[1], rhsV1[1], rhsV2[1], rhsV3[1]; \
   MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0)
 
+#define MICRO_MMA_UPDATE_RHS1(size, right) \
+  rhs_ptr##right += (accRows * size);
+
+#define MICRO_MMA_UPDATE_RHS(size) \
+  MICRO_MMA_UNROLL_ITER(MICRO_MMA_UPDATE_RHS1, size)
+
 #define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
   MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
-  rhs_ptr += (accRows * size);
+  MICRO_MMA_UPDATE_RHS(size)
 
 #ifndef VECTOR_PAIR_LOADS_LHS
 #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
 #else
 #define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
   MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
-  rhs_ptr += (accRows * size);
+  MICRO_MMA_UPDATE_RHS(size)
 
 #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA)
 #endif
@@ -277,7 +312,7 @@
 #define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
 
 #define MICRO_MMA_DST_PTR_ONE(iter) \
-  if (unroll_factor > iter) { \
+  if (unroll_factor * accItr > iter) { \
     bsetzeroMMA(&accZero##iter); \
   } else { \
     EIGEN_UNUSED_VARIABLE(accZero##iter); \
@@ -289,45 +324,69 @@
 
 #define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
 
-#ifdef USE_PARTIAL_PACKETS
-#define MICRO_MMA_STORE_ONE(iter) \
-  if (unroll_factor > iter) { \
-    storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(iter)>(row + iter*accCols, res, pAlpha, accCols2, &accZero##iter); \
+#define MICRO_MMA_STORE_ONE(iter, left, right) \
+  if (unroll_factor > left) { \
+    storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(left)>(row + left*accCols, res##right, pAlpha, accCols2, &accZero##iter); \
   }
-#else
-#define MICRO_MMA_STORE_ONE(iter) \
-  if (unroll_factor > iter) { \
-    storeAccumulator<DataMapper, Packet, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \
+
+#define MICRO_MMA_ITER_UNROLL(func) \
+  if (accItr == 1) { \
+    func(0,0,0) func(1,1,0) func(2,2,0) func(3,3,0) \
+    func(4,4,0) func(5,5,0) func(6,6,0) func(7,7,0) \
+  } else if (accItr == 2) { \
+    func(0,0,0) func(1,0,1) func(2,1,0) func(3,1,1) \
+    func(4,2,0) func(5,2,1) func(6,3,0) func(7,3,1) \
+  } else { \
+    func(0,0,0) func(1,0,1) func(2,0,2) func(3,0,3) \
+    func(4,1,0) func(5,1,1) func(6,1,2) func(7,1,3) \
   }
-#endif
 
-#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
+#define MICRO_MMA_STORE MICRO_MMA_ITER_UNROLL(MICRO_MMA_STORE_ONE)
 
-#ifdef USE_PARTIAL_PACKETS
-template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool full>
-#else
-template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
-#endif
+#define MICRO_MMA_EXTRA_ROWS(right) \
+  gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3##right, blockA, rhs_base + right*accRows*strideB, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+
+#define MICRO_MMA_EXTRA_ROWS1(val, right) \
+  MICRO_MMA_EXTRA_ROWS(right);
+
+template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool full, const Index accItr>
 EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
-  const DataMapper& res,
+  const DataMapper& res0,
+  const DataMapper& res1,
+  const DataMapper& res2,
+  const DataMapper& res3,
   const Scalar* lhs_base,
   const Scalar* rhs_base,
   Index depth,
   Index strideA,
+  Index strideB,
   Index offsetA,
   Index& row,
   const Packet& pAlpha,
-#ifdef USE_PARTIAL_PACKETS
   Index accCols2
-#else
-  const Packet& pMask
-#endif
   )
 {
-  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL, * rhs_ptr3 = NULL;
   const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
   __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
 
+  if (accItr > 1) {
+    rhs_ptr1 = rhs_base + (accRows * strideB);
+  } else {
+    EIGEN_UNUSED_VARIABLE(strideB);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr1);
+    EIGEN_UNUSED_VARIABLE(res1);
+  }
+  if (accItr > 2) {
+    rhs_ptr2 = rhs_base + (2 * accRows * strideB);
+    rhs_ptr3 = rhs_base + (3 * accRows * strideB);
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr2);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr3);
+    EIGEN_UNUSED_VARIABLE(res2);
+    EIGEN_UNUSED_VARIABLE(res3);
+  }
+
   MICRO_MMA_SRC_PTR
   MICRO_MMA_DST_PTR
 
@@ -347,17 +406,16 @@
   MICRO_UPDATE
 }
 
-#ifdef USE_PARTIAL_PACKETS
 #define MICRO_MMA_UNROLL_ITER2(N, M) \
-  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
+  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M, accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, strideB, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
   if (M) return;
-#else
-#define MICRO_MMA_UNROLL_ITER2(N, M) \
-  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \
-  if (M) return;
-#endif
 
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+#define MICRO_MMA_ROWS(n) \
+  while(row + n*accCols <= rows) { \
+    MICRO_MMA_UNROLL_ITER2(n, 0); \
+  }
+
+template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accItr>
 EIGEN_ALWAYS_INLINE void gemmMMA_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -373,45 +431,71 @@
   const Packet& pAlpha,
   const Packet& pMask)
 {
-  const DataMapper res3 = res.getSubMapper(0, col);
+  const DataMapper res30 = res.getSubMapper(0, col);
+  const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows*1) : res30;
+  const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows*2) : res30;
+  const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows*3) : res30;
 
   const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
   const Scalar* lhs_base = blockA + accCols*offsetA;
   Index row = 0;
 
 #define MAX_MMA_UNROLL 7
-  while(row + MAX_MMA_UNROLL*accCols <= rows) {
-    MICRO_MMA_UNROLL_ITER2(MAX_MMA_UNROLL, 0);
+
+#if MAX_MMA_UNROLL < 2
+  if (1) {
+#elif MAX_MMA_UNROLL < 4
+  if (accItr <= 2) {
+#else
+  if (accItr == 1) {
+#endif
+    MICRO_MMA_ROWS(MAX_MMA_UNROLL);
+  } else if (accItr == 2) {
+    MICRO_MMA_ROWS(4);
+  } else {
+    MICRO_MMA_ROWS(2);
   }
   switch( (rows-row)/accCols ) {
 #if MAX_MMA_UNROLL > 7
     case 7:
-      MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7)
+      if (accItr == 1) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7)
+      }
       break;
 #endif
 #if MAX_MMA_UNROLL > 6
     case 6:
-      MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6)
+      if (accItr == 1) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6)
+      }
       break;
 #endif
 #if MAX_MMA_UNROLL > 5
     case 5:
-      MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5)
+      if (accItr == 1) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5)
+      }
       break;
 #endif
 #if MAX_MMA_UNROLL > 4
     case 4:
-      MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4)
+      if (accItr == 1) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4)
+      }
       break;
 #endif
 #if MAX_MMA_UNROLL > 3
     case 3:
-      MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3)
+      if (accItr <= 2) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3)
+      }
       break;
 #endif
 #if MAX_MMA_UNROLL > 2
     case 2:
-      MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2)
+      if (accItr <= 2) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2)
+      }
       break;
 #endif
 #if MAX_MMA_UNROLL > 1
@@ -426,10 +510,16 @@
 
   if(remaining_rows > 0)
   {
-    gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+    MICRO_MMA_UNROLL_ITER(MICRO_MMA_EXTRA_ROWS1, 0)
   }
 }
 
+#define MICRO_MMA_COLS(n) \
+  for(; col + n*accRows <= cols; col += n*accRows) \
+  { \
+    gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); \
+  }
+
 template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
 void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
@@ -444,10 +534,11 @@
       typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
 
       Index col = 0;
-      for(; col + accRows <= cols; col += accRows)
-      {
-        gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
-      }
+#ifdef GEMM_MULTIPLE_COLS
+      MICRO_MMA_COLS(4);
+      MICRO_MMA_COLS(2);
+#endif
+      MICRO_MMA_COLS(1);
 
       if (col != cols)
       {
@@ -459,62 +550,88 @@
 #define advanceCols ((RhsIsReal) ? 1 : 2)
 
 // PEEL_COMPLEX_MMA loop factor.
+#ifdef GEMM_MULTIPLE_COLS
+#define PEEL_COMPLEX_MMA 4
+#else
 #define PEEL_COMPLEX_MMA 3
+#endif
 
 #define MICRO_COMPLEX_MMA_UNROLL(func) \
   func(0) func(1) func(2) func(3)
 
 #define MICRO_COMPLEX_MMA_WORK(func, type, peel) \
-  func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel)
+  if (accItr == 1) { \
+    func(0,type,peel,0,0) func(1,type,peel,1,0) func(2,type,peel,2,0) func(3,type,peel,3,0) \
+  } else if (accItr == 2) { \
+    func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,1,0) func(3,type,peel,1,1) \
+  } else { \
+    func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,0,2) func(3,type,peel,0,3) \
+  }
 
-#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
-  if (unroll_factor > iter) { \
-    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV[peel], rhsVi[peel]); \
+#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel, left, right) \
+  if (unroll_factor > left) { \
+    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##left, lhsVi##left, rhsV##right[peel], rhsVi##right[peel]); \
   }
 
 #ifdef VECTOR_PAIR_LOADS_LHS
-#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel) \
-  if (unroll_factor > iter) { \
-    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV[peel], rhsVi[peel]); \
+#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel, left, right) \
+  if (unroll_factor > left) { \
+    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##left.packet[peel & 1], lhsVi2##left.packet[peel & 1], rhsV##right[peel], rhsVi##right[peel]); \
   }
 
-#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter) \
-  if (!LhsIsReal && (unroll_factor > iter)) { \
-    if (MICRO_NORMAL(iter)) { \
-      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##iter + imag_delta), plhsVi##iter); \
-      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##iter.packet), &plhsVi##iter); \
+#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left) \
+  if (!LhsIsReal && (unroll_factor > left)) { \
+    if (MICRO_NORMAL(left)) { \
+      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##left + imag_delta), plhsVi##left); \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##left.packet), &plhsVi##left); \
     } else { \
-      lhsVi2##iter.packet[0] = ploadLhs<Packet>(lhs_ptr_real##iter + imag_delta2); \
-      lhsVi2##iter.packet[1] = ploadLhs<Packet>(lhs_ptr_real##iter + imag_delta2 + accCols2); \
-      EIGEN_UNUSED_VARIABLE(plhsVi##iter) \
+      lhsVi2##left.packet[0] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2); \
+      lhsVi2##left.packet[1] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2 + accCols2); \
+      EIGEN_UNUSED_VARIABLE(plhsVi##left); \
     } \
   } else { \
-    EIGEN_UNUSED_VARIABLE(lhsVi2##iter); \
-    EIGEN_UNUSED_VARIABLE(plhsVi##iter) \
+    EIGEN_UNUSED_VARIABLE(lhsVi2##left); \
+    EIGEN_UNUSED_VARIABLE(plhsVi##left); \
   } \
-  MICRO_MMA_LOAD1_TWO(lhs_ptr_real, iter)
+  MICRO_MMA_LOAD1_TWO(lhs_ptr_real, left)
 
-#define MICRO_COMPLEX_MMA_LOAD_TWO(iter) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter)
+#define MICRO_COMPLEX_MMA_LOAD_TWO(left) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left)
 #endif
 
+#define MICRO_COMPLEX_MMA_LOAD_RHS1(peel, right) \
+  ploadRhsMMA(rhs_ptr_real##right + (accRows * peel), rhsV##right[peel]); \
+  if (!RhsIsReal) { \
+    ploadRhsMMA(rhs_ptr_imag##right + (accRows * peel), rhsVi##right[peel]); \
+  }
+
+#define MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
+  MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_RHS1, peel)
+
 #define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
   if (PEEL_COMPLEX_MMA > peel) { \
     Packet lhsV0, lhsV1, lhsV2, lhsV3; \
     Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
-    ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV[peel]); \
-    if(!RhsIsReal) { \
-      ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi[peel]); \
-    } \
+    MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
     MICRO_COMPLEX_MMA_UNROLL(funcl) \
     MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
   }
 
 #ifndef VECTOR_PAIR_LOADS_LHS
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
-  type rhsV[4], rhsVi[4]; \
+  type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
   MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \
   MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3)
 #else
+#define MICRO_COMPLEX_MMA_LOAD_TWO_RHS(peel1, right) \
+  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real##right + (accRows * peel1)), prhsV##peel1); \
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1); \
+  if(!RhsIsReal) { \
+    ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag##right + (accRows * peel1)), prhsVi##peel1); \
+    __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi##right[peel1]), &prhsVi##peel1); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
+  }
+
 #define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
   if (PEEL_COMPLEX_MMA > peel2) { \
     PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23; \
@@ -522,23 +639,12 @@
     __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
     __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
     if (sizeof(type) == 16) { \
-      ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real + (accRows * peel1)), prhsV##peel1); \
-      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV[peel1]), &prhsV##peel1); \
-      if(!RhsIsReal) { \
-        ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag + (accRows * peel1)), prhsVi##peel1); \
-        __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi[peel1]), &prhsVi##peel1); \
-      } else { \
-        EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
-      } \
+      MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_TWO_RHS, peel1) \
     } else { \
       EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
       EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
-      ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV[peel1]); \
-      ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV[peel2]); \
-      if(!RhsIsReal) { \
-        ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi[peel1]); \
-        ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi[peel2]); \
-      } \
+      MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel1); \
+      MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel2); \
     } \
     MICRO_COMPLEX_MMA_UNROLL(funcl2) \
     MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
@@ -550,7 +656,7 @@
   }
 
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
-  type rhsV[4], rhsVi[4]; \
+  type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
   __vector_pair prhsV0, prhsV2; \
   __vector_pair prhsVi0, prhsVi2; \
   MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
@@ -558,21 +664,26 @@
 #endif
 
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
-  type rhsV[1], rhsVi[1]; \
+  type rhsV0[1], rhsVi0[1], rhsV1[1], rhsVi1[1], rhsV2[1], rhsVi2[1], rhsV3[1], rhsVi3[1]; \
   MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0)
 
+#define MICRO_COMPLEX_MMA_UPDATE_RHS1(size, right) \
+  rhs_ptr_real##right += (accRows * size); \
+  if(!RhsIsReal) rhs_ptr_imag##right += (accRows * size);
+
+#define MICRO_COMPLEX_MMA_UPDATE_RHS(size) \
+  MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_UPDATE_RHS1, size)
+
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
   MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, RhsPacket) \
-  rhs_ptr_real += (accRows * size); \
-  if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
+  MICRO_COMPLEX_MMA_UPDATE_RHS(size);
 
 #ifndef VECTOR_PAIR_LOADS_LHS
 #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
 #else
 #define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
   MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
-  rhs_ptr_real += (accRows * size); \
-  if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
+  MICRO_COMPLEX_MMA_UPDATE_RHS(size);
 
 #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
 #endif
@@ -580,7 +691,7 @@
 #define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
 
 #define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
-  if (unroll_factor > iter) { \
+  if (unroll_factor * accItr > iter) { \
     bsetzeroMMA(&accReal##iter); \
     bsetzeroMMA(&accImag##iter); \
   } else { \
@@ -594,16 +705,34 @@
 
 #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
 
-#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
-  if (unroll_factor > iter) { \
-    storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
+#define MICRO_COMPLEX_MMA_STORE_ONE(iter, left, right) \
+  if (unroll_factor > left) { \
+    storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (left + 1)) ? accCols : accCols2>(row + left*accCols, res##right, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
   }
 
-#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
+#define MICRO_COMPLEX_MMA_ITER_UNROLL(func) \
+  if (accItr == 1) { \
+    func(0,0,0) func(1,1,0) func(2,2,0) func(3,3,0) \
+  } else if (accItr == 2) { \
+    func(0,0,0) func(1,0,1) func(2,1,0) func(3,1,1) \
+  } else { \
+    func(0,0,0) func(1,0,1) func(2,0,2) func(3,0,3) \
+  }
 
-template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_ITER_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
+
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS(right) \
+  gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3##right, blockA, rhs_base + right*accRows*(RhsIsReal ? 1 : 2)*strideB, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS1(val, right) \
+  MICRO_COMPLEX_MMA_EXTRA_ROWS(right);
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
 EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
-  const DataMapper& res,
+  const DataMapper& res0,
+  const DataMapper& res1,
+  const DataMapper& res2,
+  const DataMapper& res3,
   const Scalar* lhs_base,
   const Scalar* rhs_base,
   Index depth,
@@ -615,14 +744,48 @@
   const Packet& pAlphaImag,
   const Packet& pMask)
 {
-  const Scalar* rhs_ptr_real = rhs_base;
-  const Scalar* rhs_ptr_imag = NULL;
+  const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL, * rhs_ptr_real3 = NULL;
+  const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL, * rhs_ptr_imag3 = NULL;
   const Index imag_delta = accCols*strideA;
   const Index imag_delta2 = accCols2*strideA;
+
   if(!RhsIsReal) {
-    rhs_ptr_imag = rhs_base + accRows*strideB;
+    rhs_ptr_imag0 = rhs_base + accRows*strideB;
   } else {
-    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag0);
+  }
+  if (accItr > 1) {
+    if(!RhsIsReal) {
+      rhs_ptr_real1 = rhs_base + (2*accRows*strideB);
+      rhs_ptr_imag1 = rhs_base + (3*accRows*strideB);
+    } else {
+      rhs_ptr_real1 = rhs_base + accRows*strideB;
+      EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
+    }
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_real1);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
+    EIGEN_UNUSED_VARIABLE(res1);
+  }
+  if (accItr > 2) {
+    if(!RhsIsReal) {
+      rhs_ptr_real2 = rhs_base + (4*accRows*strideB);
+      rhs_ptr_imag2 = rhs_base + (5*accRows*strideB);
+      rhs_ptr_real3 = rhs_base + (6*accRows*strideB);
+      rhs_ptr_imag3 = rhs_base + (7*accRows*strideB);
+    } else {
+      rhs_ptr_real2 = rhs_base + (2*accRows*strideB);
+      rhs_ptr_real3 = rhs_base + (3*accRows*strideB);
+      EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
+      EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
+    }
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_real2);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_real3);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
+    EIGEN_UNUSED_VARIABLE(res2);
+    EIGEN_UNUSED_VARIABLE(res3);
   }
   const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
   const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
@@ -651,10 +814,15 @@
 }
 
 #define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \
-  gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
+  gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
   if (M) return;
 
-template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+#define MICRO_COMPLEX_MMA_ROWS(n) \
+  while(row + n*accCols <= rows) { \
+    MICRO_COMPLEX_MMA_UNROLL_ITER2(n, 0); \
+  }
+
+template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
 EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
   const DataMapper& res,
   const Scalar* blockA,
@@ -671,35 +839,50 @@
   const Packet& pAlphaImag,
   const Packet& pMask)
 {
-  const DataMapper res3 = res.getSubMapper(0, col);
+  const DataMapper res30 = res.getSubMapper(0, col);
+  const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows*1) : res30;
+  const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows*2) : res30;
+  const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows*3) : res30;
 
   const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
   const Scalar* lhs_base = blockA + accCols*offsetA;
   Index row = 0;
 
 #define MAX_COMPLEX_MMA_UNROLL 4
-  while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
-    MICRO_COMPLEX_MMA_UNROLL_ITER2(MAX_COMPLEX_MMA_UNROLL, 0);
+
+#if MAX_COMPLEX_MMA_UNROLL < 2
+  if (1) {
+#elif MAX_COMPLEX_MMA_UNROLL < 4
+  if (accItr <= 2) {
+#else
+  if (accItr == 1) {
+#endif
+    MICRO_COMPLEX_MMA_ROWS(MAX_COMPLEX_MMA_UNROLL);
+  } else if (accItr == 2) {
+    MICRO_COMPLEX_MMA_ROWS(2);
+  } else {
+    MICRO_COMPLEX_MMA_ROWS(1);
   }
   switch( (rows-row)/accCols ) {
-#if MAX_COMPLEX_MMA_UNROLL > 4
-    case 4:
-      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4)
-      break;
-#endif
 #if MAX_COMPLEX_MMA_UNROLL > 3
     case 3:
-      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
+      if (accItr == 1) {
+        MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
+      }
       break;
 #endif
 #if MAX_COMPLEX_MMA_UNROLL > 2
     case 2:
-      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
+      if (accItr == 1) {
+        MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
+      }
       break;
 #endif
 #if MAX_COMPLEX_MMA_UNROLL > 1
     case 1:
-      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
+      if (accItr <= 2) {
+        MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
+      }
       break;
 #endif
     default:
@@ -709,10 +892,16 @@
 
   if(remaining_rows > 0)
   {
-    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+    MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_EXTRA_ROWS1, 0)
   }
 }
 
+#define MICRO_COMPLEX_MMA_COLS(n) \
+  for(; col + n*accRows <= cols; col += n*accRows) \
+  { \
+    gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); \
+  }
+
 template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
@@ -731,10 +920,11 @@
       typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
 
       Index col = 0;
-      for(; col + accRows <= cols; col += accRows)
-      {
-        gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
-      }
+#ifdef GEMM_MULTIPLE_COLS
+      MICRO_COMPLEX_MMA_COLS(4);
+      MICRO_COMPLEX_MMA_COLS(2);
+#endif
+      MICRO_COMPLEX_MMA_COLS(1);
 
       if (col != cols)
       {
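
For context on the hunk above: with GEMM_MULTIPLE_COLS defined, the complex GEMM column loop now dispatches progressively narrower panel widths (accItr = 4, then 2, then 1) instead of a single accRows-wide sweep. A minimal, self-contained sketch of that sweep, with process_panel standing in for gemmMMA_complex_cols<..., accItr> (hypothetical names, not Eigen code):

    #include <cstdio>

    // Hypothetical stand-in for gemmMMA_complex_cols<..., accItr = n>.
    static void process_panel(long col, long n) {
      std::printf("col=%ld accItr=%ld\n", col, n);
    }

    // Mirrors MICRO_COMPLEX_MMA_COLS(4); MICRO_COMPLEX_MMA_COLS(2);
    // MICRO_COMPLEX_MMA_COLS(1); -- widest panels first, narrower mop-up after.
    static long sweep_columns(long cols, long accRows) {
      long col = 0;
      for (; col + 4 * accRows <= cols; col += 4 * accRows) process_panel(col, 4);
      for (; col + 2 * accRows <= cols; col += 2 * accRows) process_panel(col, 2);
      for (; col + 1 * accRows <= cols; col += 1 * accRows) process_panel(col, 1);
      return col;  // col != cols leaves a remainder for the extra-columns path
    }

    int main() { sweep_columns(11, 2); }  // cols 0..7 via accItr 4, 8..9 via accItr 1, col 10 left over
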
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index ed4edc1..27d9a82 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -86,6 +86,8 @@
     typedef packet_type half;                                              \
   };
 
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
@@ -100,6 +102,7 @@
   struct is_arithmetic<packet_type> { \
     enum { value = true };            \
   };
+SYCL_ARITHMETIC(cl::sycl::cl_half8)
 SYCL_ARITHMETIC(cl::sycl::cl_float4)
 SYCL_ARITHMETIC(cl::sycl::cl_double2)
 #undef SYCL_ARITHMETIC
@@ -111,6 +114,7 @@
     enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
     typedef packet_type half;                                            \
   };
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
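
The SYCL_PACKET_TRAITS invocations above stamp out packet_traits specializations mapping Eigen::half onto an 8-lane cl_half8 packet. A self-contained sketch of the pattern with stand-in types (the real specialization lives in Eigen::internal and inherits many more Has* flags from default_packet_traits):

    // Stand-ins for cl::sycl::cl_half8 / Eigen::half.
    struct half8 { /* eight half-precision lanes */ };
    struct half_scalar {};

    struct default_traits { enum { HasAdd = 1, HasMul = 1 /* ... */ }; };
    template <typename Scalar> struct packet_traits : default_traits {};

    // Roughly what SYCL_PACKET_TRAITS(cl_half8, 1, Eigen::half, 8) expands to:
    template <> struct packet_traits<half_scalar> : default_traits {
      typedef half8 type;  // packet used when vectorizing half expressions
      enum { Vectorizable = 1, AlignedOnScalar = 1, size = 8 };
      typedef half8 half;  // no narrower half-packet is provided
    };

    int main() { static_assert(packet_traits<half_scalar>::size == 8, "8 lanes"); }
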
 
diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
index bdbb21f..a8adc46 100644
--- a/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -38,6 +38,7 @@
     return cl::sycl::log(a);                                           \
   }
 
+SYCL_PLOG(cl::sycl::cl_half8)
 SYCL_PLOG(cl::sycl::cl_float4)
 SYCL_PLOG(cl::sycl::cl_double2)
 #undef SYCL_PLOG
@@ -49,6 +50,7 @@
     return cl::sycl::log1p(a);                                           \
   }
 
+SYCL_PLOG1P(cl::sycl::cl_half8)
 SYCL_PLOG1P(cl::sycl::cl_float4)
 SYCL_PLOG1P(cl::sycl::cl_double2)
 #undef SYCL_PLOG1P
@@ -60,6 +62,7 @@
     return cl::sycl::log10(a);                                           \
   }
 
+SYCL_PLOG10(cl::sycl::cl_half8)
 SYCL_PLOG10(cl::sycl::cl_float4)
 SYCL_PLOG10(cl::sycl::cl_double2)
 #undef SYCL_PLOG10
@@ -71,6 +74,8 @@
     return cl::sycl::exp(a);                                           \
   }
 
+SYCL_PEXP(cl::sycl::cl_half8)
+SYCL_PEXP(cl::sycl::cl_half)
 SYCL_PEXP(cl::sycl::cl_float4)
 SYCL_PEXP(cl::sycl::cl_float)
 SYCL_PEXP(cl::sycl::cl_double2)
@@ -83,6 +88,7 @@
     return cl::sycl::expm1(a);                                           \
   }
 
+SYCL_PEXPM1(cl::sycl::cl_half8)
 SYCL_PEXPM1(cl::sycl::cl_float4)
 SYCL_PEXPM1(cl::sycl::cl_double2)
 #undef SYCL_PEXPM1
@@ -94,6 +100,7 @@
     return cl::sycl::sqrt(a);                                           \
   }
 
+SYCL_PSQRT(cl::sycl::cl_half8)
 SYCL_PSQRT(cl::sycl::cl_float4)
 SYCL_PSQRT(cl::sycl::cl_double2)
 #undef SYCL_PSQRT
@@ -105,6 +112,7 @@
     return cl::sycl::rsqrt(a);                                           \
   }
 
+SYCL_PRSQRT(cl::sycl::cl_half8)
 SYCL_PRSQRT(cl::sycl::cl_float4)
 SYCL_PRSQRT(cl::sycl::cl_double2)
 #undef SYCL_PRSQRT
@@ -117,6 +125,7 @@
     return cl::sycl::sin(a);                                           \
   }
 
+SYCL_PSIN(cl::sycl::cl_half8)
 SYCL_PSIN(cl::sycl::cl_float4)
 SYCL_PSIN(cl::sycl::cl_double2)
 #undef SYCL_PSIN
@@ -129,6 +138,7 @@
     return cl::sycl::cos(a);                                           \
   }
 
+SYCL_PCOS(cl::sycl::cl_half8)
 SYCL_PCOS(cl::sycl::cl_float4)
 SYCL_PCOS(cl::sycl::cl_double2)
 #undef SYCL_PCOS
@@ -141,6 +151,7 @@
     return cl::sycl::tan(a);                                           \
   }
 
+SYCL_PTAN(cl::sycl::cl_half8)
 SYCL_PTAN(cl::sycl::cl_float4)
 SYCL_PTAN(cl::sycl::cl_double2)
 #undef SYCL_PTAN
@@ -153,6 +164,7 @@
     return cl::sycl::asin(a);                                           \
   }
 
+SYCL_PASIN(cl::sycl::cl_half8)
 SYCL_PASIN(cl::sycl::cl_float4)
 SYCL_PASIN(cl::sycl::cl_double2)
 #undef SYCL_PASIN
@@ -165,6 +177,7 @@
     return cl::sycl::acos(a);                                           \
   }
 
+SYCL_PACOS(cl::sycl::cl_half8)
 SYCL_PACOS(cl::sycl::cl_float4)
 SYCL_PACOS(cl::sycl::cl_double2)
 #undef SYCL_PACOS
@@ -177,6 +190,7 @@
     return cl::sycl::atan(a);                                           \
   }
 
+SYCL_PATAN(cl::sycl::cl_half8)
 SYCL_PATAN(cl::sycl::cl_float4)
 SYCL_PATAN(cl::sycl::cl_double2)
 #undef SYCL_PATAN
@@ -189,6 +203,7 @@
     return cl::sycl::sinh(a);                                           \
   }
 
+SYCL_PSINH(cl::sycl::cl_half8)
 SYCL_PSINH(cl::sycl::cl_float4)
 SYCL_PSINH(cl::sycl::cl_double2)
 #undef SYCL_PSINH
@@ -201,6 +216,7 @@
     return cl::sycl::cosh(a);                                           \
   }
 
+SYCL_PCOSH(cl::sycl::cl_half8)
 SYCL_PCOSH(cl::sycl::cl_float4)
 SYCL_PCOSH(cl::sycl::cl_double2)
 #undef SYCL_PCOSH
@@ -213,6 +229,7 @@
     return cl::sycl::tanh(a);                                           \
   }
 
+SYCL_PTANH(cl::sycl::cl_half8)
 SYCL_PTANH(cl::sycl::cl_float4)
 SYCL_PTANH(cl::sycl::cl_double2)
 #undef SYCL_PTANH
@@ -224,6 +241,7 @@
     return cl::sycl::ceil(a);                                           \
   }
 
+SYCL_PCEIL(cl::sycl::cl_half8)
 SYCL_PCEIL(cl::sycl::cl_float4)
 SYCL_PCEIL(cl::sycl::cl_double2)
 #undef SYCL_PCEIL
@@ -235,6 +253,7 @@
     return cl::sycl::round(a);                                           \
   }
 
+SYCL_PROUND(cl::sycl::cl_half8)
 SYCL_PROUND(cl::sycl::cl_float4)
 SYCL_PROUND(cl::sycl::cl_double2)
 #undef SYCL_PROUND
@@ -246,6 +265,7 @@
     return cl::sycl::rint(a);                                           \
   }
 
+SYCL_PRINT(cl::sycl::cl_half8)
 SYCL_PRINT(cl::sycl::cl_float4)
 SYCL_PRINT(cl::sycl::cl_double2)
 #undef SYCL_PRINT
@@ -257,6 +277,7 @@
     return cl::sycl::floor(a);                                           \
   }
 
+SYCL_FLOOR(cl::sycl::cl_half8)
 SYCL_FLOOR(cl::sycl::cl_float4)
 SYCL_FLOOR(cl::sycl::cl_double2)
 #undef SYCL_FLOOR
@@ -268,6 +289,7 @@
     return expr;                                                       \
   }
 
+SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
 #undef SYCL_PMIN
@@ -279,6 +301,7 @@
     return expr;                                                       \
   }
 
+SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
 #undef SYCL_PMAX
@@ -292,6 +315,7 @@
                                      cl::sycl::rounding_mode::automatic>()); \
   }
 
+SYCL_PLDEXP(cl::sycl::cl_half8)
 SYCL_PLDEXP(cl::sycl::cl_float4)
 SYCL_PLDEXP(cl::sycl::cl_double2)
 #undef SYCL_PLDEXP
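
All of the SYCL_P* additions above follow one pattern: each Eigen packet op forwards to the corresponding cl::sycl built-in, which is already overloaded per vector type, so extending coverage to cl_half8 costs one macro invocation per function. A stand-alone sketch of the expansion for plog (stand-in names; the real code sits in Eigen::internal and calls cl::sycl::log):

    struct vec8h { /* stand-in for cl::sycl::cl_half8 */ };
    vec8h sycl_log(const vec8h& a) { return a; }  // stands in for cl::sycl::log

    template <typename Packet> Packet plog(const Packet& a);

    // Roughly what SYCL_PLOG(cl::sycl::cl_half8) expands to:
    template <> vec8h plog<vec8h>(const vec8h& a) { return sycl_log(a); }

    int main() { vec8h v; plog(v); }
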
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index fbdcdc0..4b0b1c6 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -44,9 +44,34 @@
 SYCL_PLOAD(cl::sycl::cl_float4, )
 SYCL_PLOAD(cl::sycl::cl_double2, u)
 SYCL_PLOAD(cl::sycl::cl_double2, )
-
 #undef SYCL_PLOAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+    pload<cl::sycl::cl_half8>(
+        const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::cl_half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+ploadu<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::cl_half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
 #define SYCL_PSTORE(scalar, packet_type, alignment)             \
   template <>                                                   \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
@@ -59,9 +84,28 @@
 SYCL_PSTORE(float, cl::sycl::cl_float4, u)
 SYCL_PSTORE(double, cl::sycl::cl_double2, )
 SYCL_PSTORE(double, cl::sycl::cl_double2, u)
-
 #undef SYCL_PSTORE
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::cl_half*>(to));
+  from.store(0, ptr);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::cl_half*>(to));
+  from.store(0, ptr);
+}
+
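
A note on the pload/ploadu/pstore specializations just above: SYCL 2020's vec::load/store take a multi_ptr rather than a raw pointer, so the raw Eigen::half* is first adapted with address_space_cast into the generic address space. A sketch assuming a SYCL 2020 implementation (uses the standard sycl::half8 alias rather than Eigen's types):

    #include <sycl/sycl.hpp>

    sycl::half8 load8(const sycl::half* from) {
      auto ptr = sycl::address_space_cast<
          sycl::access::address_space::generic_space,
          sycl::access::decorated::no>(from);
      sycl::half8 v{};
      v.load(0, ptr);  // offset counts whole vectors: 0 = first 8 halves
      return v;
    }
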
 #define SYCL_PSET1(packet_type)                                         \
   template <>                                                           \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@@ -70,6 +114,7 @@
   }
 
 // global space
+SYCL_PSET1(cl::sycl::cl_half8)
 SYCL_PSET1(cl::sycl::cl_float4)
 SYCL_PSET1(cl::sycl::cl_double2)
 
@@ -87,6 +132,58 @@
 };
 
 template <>
+struct get_base_packet<cl::sycl::cl_half8> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
+      sycl_multi_pointer from) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[3]),
+                              static_cast<cl::sycl::half>(from[3]));
+  }
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
+      sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
+                              static_cast<cl::sycl::half>(from[1 * stride]),
+                              static_cast<cl::sycl::half>(from[2 * stride]),
+                              static_cast<cl::sycl::half>(from[3 * stride]),
+                              static_cast<cl::sycl::half>(from[4 * stride]),
+                              static_cast<cl::sycl::half>(from[5 * stride]),
+                              static_cast<cl::sycl::half>(from[6 * stride]),
+                              static_cast<cl::sycl::half>(from[7 * stride]));
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
+    auto tmp = stride;
+    to[0] = Eigen::half(from.s0());
+    to[tmp] = Eigen::half(from.s1());
+    to[tmp += stride] = Eigen::half(from.s2());
+    to[tmp += stride] = Eigen::half(from.s3());
+    to[tmp += stride] = Eigen::half(from.s4());
+    to[tmp += stride] = Eigen::half(from.s5());
+    to[tmp += stride] = Eigen::half(from.s6());
+    to[tmp += stride] = Eigen::half(from.s7());
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
+      const cl::sycl::half& a) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a), static_cast<cl::sycl::half>(a + 1),
+                              static_cast<cl::sycl::half>(a + 2),
+                              static_cast<cl::sycl::half>(a + 3),
+                              static_cast<cl::sycl::half>(a + 4),
+                              static_cast<cl::sycl::half>(a + 5),
+                              static_cast<cl::sycl::half>(a + 6),
+                              static_cast<cl::sycl::half>(a + 7));
+  }
+};
+
+template <>
 struct get_base_packet<cl::sycl::cl_float4> {
   template <typename sycl_multi_pointer>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(
@@ -152,6 +249,7 @@
     return get_base_packet<packet_type>::get_ploaddup(from);               \
   }
 
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_half8)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
 
@@ -165,9 +263,14 @@
   }
 SYCL_PLSET(cl::sycl::cl_float4)
 SYCL_PLSET(cl::sycl::cl_double2)
-
 #undef SYCL_PLSET
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
+  return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half &) a);
+}
+
 #define SYCL_PGATHER_SPECILIZE(scalar, packet_type)                            \
   template <>                                                                  \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type                            \
@@ -176,9 +279,9 @@
     return get_base_packet<packet_type>::get_pgather(from, stride);            \
   }
 
+SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
-
 #undef SYCL_PGATHER_SPECILIZE
 
 #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)                        \
@@ -189,6 +292,7 @@
     get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
   }
 
+SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
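
The gather/scatter specializations above (routed through get_base_packet<cl_half8>) reduce to a strided walk over eight scalars; a plain-array sketch, with float standing in for cl::sycl::half:

    void gather8(const float* from, long stride, float* packet) {
      for (int i = 0; i < 8; ++i) packet[i] = from[i * stride];  // get_pgather
    }
    void scatter8(float* to, const float* packet, long stride) {
      for (int i = 0; i < 8; ++i) to[i * stride] = packet[i];    // set_pscatter's
    }                                                            // running offset
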
 
@@ -201,11 +305,17 @@
     return cl::sycl::mad(a, b, c);                                        \
   }
 
+SYCL_PMAD(cl::sycl::cl_half8)
 SYCL_PMAD(cl::sycl::cl_float4)
 SYCL_PMAD(cl::sycl::cl_double2)
 #undef SYCL_PMAD
 
 template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0());
+}
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
   return a.x();
@@ -217,6 +327,13 @@
 }
 
 template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5()
+                     + a.s6() + a.s7());
+}
+
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
   return a.x() + a.y() + a.z() + a.w();
@@ -229,6 +346,17 @@
 }
 
 template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmax(
+          cl::sycl::fmax(
+            cl::sycl::fmax(a.s0(), a.s1()),
+            cl::sycl::fmax(a.s2(), a.s3())),
+          cl::sycl::fmax(
+            cl::sycl::fmax(a.s4(), a.s5()),
+            cl::sycl::fmax(a.s6(), a.s7()))));
+}
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
   return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()),
@@ -241,6 +369,17 @@
 }
 
 template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmin(
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s0(), a.s1()),
+          cl::sycl::fmin(a.s2(), a.s3())),
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s4(), a.s5()),
+          cl::sycl::fmin(a.s6(), a.s7()))));
+}
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
   return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()),
@@ -253,6 +392,12 @@
 }
 
 template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
+                     a.s6() * a.s7());
+}
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
   return a.x() * a.y() * a.z() * a.w();
@@ -264,6 +409,14 @@
 }
 
 template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
+                            cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
+                            cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+                            cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
+}
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()),
@@ -300,6 +453,9 @@
     return sycl_pcmp_##OP<TYPE>(a, b);                                         \
   }
 
+SYCL_PCMP(le, cl::sycl::cl_half8)
+SYCL_PCMP(lt, cl::sycl::cl_half8)
+SYCL_PCMP(eq, cl::sycl::cl_half8)
 SYCL_PCMP(le, cl::sycl::cl_float4)
 SYCL_PCMP(lt, cl::sycl::cl_float4)
 SYCL_PCMP(eq, cl::sycl::cl_float4)
@@ -309,6 +465,121 @@
 #undef SYCL_PCMP
 
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+  cl::sycl::cl_half tmp = kernel.packet[0].s1();
+  kernel.packet[0].s1() = kernel.packet[1].s0();
+  kernel.packet[1].s0() = tmp;
+
+  tmp = kernel.packet[0].s2();
+  kernel.packet[0].s2() = kernel.packet[2].s0();
+  kernel.packet[2].s0() = tmp;
+
+  tmp = kernel.packet[0].s3();
+  kernel.packet[0].s3() = kernel.packet[3].s0();
+  kernel.packet[3].s0() = tmp;
+
+  tmp = kernel.packet[0].s4();
+  kernel.packet[0].s4() = kernel.packet[4].s0();
+  kernel.packet[4].s0() = tmp;
+
+  tmp = kernel.packet[0].s5();
+  kernel.packet[0].s5() = kernel.packet[5].s0();
+  kernel.packet[5].s0() = tmp;
+
+  tmp = kernel.packet[0].s6();
+  kernel.packet[0].s6() = kernel.packet[6].s0();
+  kernel.packet[6].s0() = tmp;
+
+  tmp = kernel.packet[0].s7();
+  kernel.packet[0].s7() = kernel.packet[7].s0();
+  kernel.packet[7].s0() = tmp;
+
+  tmp = kernel.packet[1].s2();
+  kernel.packet[1].s2() = kernel.packet[2].s1();
+  kernel.packet[2].s1() = tmp;
+
+  tmp = kernel.packet[1].s3();
+  kernel.packet[1].s3() = kernel.packet[3].s1();
+  kernel.packet[3].s1() = tmp;
+
+  tmp = kernel.packet[1].s4();
+  kernel.packet[1].s4() = kernel.packet[4].s1();
+  kernel.packet[4].s1() = tmp;
+
+  tmp = kernel.packet[1].s5();
+  kernel.packet[1].s5() = kernel.packet[5].s1();
+  kernel.packet[5].s1() = tmp;
+
+  tmp = kernel.packet[1].s6();
+  kernel.packet[1].s6() = kernel.packet[6].s1();
+  kernel.packet[6].s1() = tmp;
+
+  tmp = kernel.packet[1].s7();
+  kernel.packet[1].s7() = kernel.packet[7].s1();
+  kernel.packet[7].s1() = tmp;
+
+  tmp = kernel.packet[2].s3();
+  kernel.packet[2].s3() = kernel.packet[3].s2();
+  kernel.packet[3].s2() = tmp;
+
+  tmp = kernel.packet[2].s4();
+  kernel.packet[2].s4() = kernel.packet[4].s2();
+  kernel.packet[4].s2() = tmp;
+
+  tmp = kernel.packet[2].s5();
+  kernel.packet[2].s5() = kernel.packet[5].s2();
+  kernel.packet[5].s2() = tmp;
+
+  tmp = kernel.packet[2].s6();
+  kernel.packet[2].s6() = kernel.packet[6].s2();
+  kernel.packet[6].s2() = tmp;
+
+  tmp = kernel.packet[2].s7();
+  kernel.packet[2].s7() = kernel.packet[7].s2();
+  kernel.packet[7].s2() = tmp;
+
+  tmp = kernel.packet[3].s4();
+  kernel.packet[3].s4() = kernel.packet[4].s3();
+  kernel.packet[4].s3() = tmp;
+
+  tmp = kernel.packet[3].s5();
+  kernel.packet[3].s5() = kernel.packet[5].s3();
+  kernel.packet[5].s3() = tmp;
+
+  tmp = kernel.packet[3].s6();
+  kernel.packet[3].s6() = kernel.packet[6].s3();
+  kernel.packet[6].s3() = tmp;
+
+  tmp = kernel.packet[3].s7();
+  kernel.packet[3].s7() = kernel.packet[7].s3();
+  kernel.packet[7].s3() = tmp;
+
+  tmp = kernel.packet[4].s5();
+  kernel.packet[4].s5() = kernel.packet[5].s4();
+  kernel.packet[5].s4() = tmp;
+
+  tmp = kernel.packet[4].s6();
+  kernel.packet[4].s6() = kernel.packet[6].s4();
+  kernel.packet[6].s4() = tmp;
+
+  tmp = kernel.packet[4].s7();
+  kernel.packet[4].s7() = kernel.packet[7].s4();
+  kernel.packet[7].s4() = tmp;
+
+  tmp = kernel.packet[5].s6();
+  kernel.packet[5].s6() = kernel.packet[6].s5();
+  kernel.packet[6].s5() = tmp;
+
+  tmp = kernel.packet[5].s7();
+  kernel.packet[5].s7() = kernel.packet[7].s5();
+  kernel.packet[7].s5() = tmp;
+
+  tmp = kernel.packet[6].s7();
+  kernel.packet[6].s7() = kernel.packet[7].s6();
+  kernel.packet[7].s6() = tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
     PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
   float tmp = kernel.packet[0].y();
   kernel.packet[0].y() = kernel.packet[1].x();
@@ -343,6 +614,19 @@
 }
 
 template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
+    const cl::sycl::cl_half8& thenPacket,
+    const cl::sycl::cl_half8& elsePacket) {
+  cl::sycl::cl_short8 condition(
+      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
+      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
+      ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+      ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
     const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
     const cl::sycl::cl_float4& thenPacket,
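
On the new cl_half8 pblend above: cl::sycl::select(a, b, c) picks b in lanes where the MSB of c is set, hence the `ifPacket.select[i] ? 0 : -1` encoding (0 keeps thenPacket, -1 routes to elsePacket). A plain-array sketch of the same lane selection:

    #include <cstdint>

    void blend8(const float* thenP, const float* elseP, const bool* sel, float* out) {
      for (int i = 0; i < 8; ++i) {
        std::int16_t cond = sel[i] ? 0 : -1;        // -1 sets the MSB
        out[i] = (cond < 0) ? elseP[i] : thenP[i];  // mirrors cl::sycl::select
      }
    }
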
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index bb382a7..d05e7d1 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -825,7 +825,7 @@
 
   // For regular block expressions, simply forward along the InnerPanel argument,
   // which is set when calling row/column expressions.
-  static constexpr bool is_inner_panel(bool inner_panel) { return inner_panel; };
+  static constexpr bool is_inner_panel(bool inner_panel) { return inner_panel; }
   
   // Only enable non-const base function if XprType is not const (otherwise we get a duplicate definition).
   template<typename T = XprType, typename EnableIf=std::enable_if_t<!std::is_const<T>::value>>
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 8f64863..1cfff1b 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -1432,7 +1432,7 @@
   typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
   typedef Transform<typename Other::Scalar,Dim,Projective,Options> ResultType;
-  static ResultType run(const Other& other,const TransformType& tr)
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other,const TransformType& tr)
   { return ResultType(other * tr.matrix()); }
 };
 
@@ -1443,7 +1443,7 @@
   typedef Transform<typename Other::Scalar,Dim,AffineCompact,Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
   typedef Transform<typename Other::Scalar,Dim,Projective,Options> ResultType;
-  static ResultType run(const Other& other,const TransformType& tr)
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other,const TransformType& tr)
   {
     ResultType res;
     res.matrix().noalias() = other.template block<HDim,Dim>(0,0) * tr.matrix();
@@ -1459,7 +1459,7 @@
   typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
   typedef TransformType ResultType;
-  static ResultType run(const Other& other,const TransformType& tr)
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other,const TransformType& tr)
   {
     ResultType res;
     res.affine().noalias() = other * tr.matrix();
@@ -1475,7 +1475,7 @@
   typedef Transform<typename Other::Scalar,Dim,AffineCompact,Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
   typedef TransformType ResultType;
-  static ResultType run(const Other& other,const TransformType& tr)
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other,const TransformType& tr)
   {
     ResultType res;
     res.matrix().noalias() = other.template block<Dim,Dim>(0,0) * tr.matrix();
@@ -1491,7 +1491,7 @@
   typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
   typedef TransformType ResultType;
-  static ResultType run(const Other& other, const TransformType& tr)
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other, const TransformType& tr)
   {
     TransformType res;
     if(Mode!=int(AffineCompact))
@@ -1513,7 +1513,7 @@
   typedef Transform<Scalar,Dim,LhsMode,LhsOptions> Lhs;
   typedef Transform<Scalar,Dim,RhsMode,RhsOptions> Rhs;
   typedef Transform<Scalar,Dim,ResultMode,LhsOptions> ResultType;
-  static ResultType run(const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs)
   {
     ResultType res;
     res.linear() = lhs.linear() * rhs.linear();
@@ -1529,7 +1529,7 @@
   typedef Transform<Scalar,Dim,LhsMode,LhsOptions> Lhs;
   typedef Transform<Scalar,Dim,RhsMode,RhsOptions> Rhs;
   typedef Transform<Scalar,Dim,Projective> ResultType;
-  static ResultType run(const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs)
   {
     return ResultType( lhs.matrix() * rhs.matrix() );
   }
@@ -1541,7 +1541,7 @@
   typedef Transform<Scalar,Dim,AffineCompact,LhsOptions> Lhs;
   typedef Transform<Scalar,Dim,Projective,RhsOptions> Rhs;
   typedef Transform<Scalar,Dim,Projective> ResultType;
-  static ResultType run(const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs)
   {
     ResultType res;
     res.matrix().template topRows<Dim>() = lhs.matrix() * rhs.matrix();
@@ -1556,7 +1556,7 @@
   typedef Transform<Scalar,Dim,Projective,LhsOptions> Lhs;
   typedef Transform<Scalar,Dim,AffineCompact,RhsOptions> Rhs;
   typedef Transform<Scalar,Dim,Projective> ResultType;
-  static ResultType run(const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs)
   {
     ResultType res(lhs.matrix().template leftCols<Dim>() * rhs.matrix());
     res.matrix().col(Dim) += lhs.matrix().col(Dim);
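
Why the run() methods above need EIGEN_DEVICE_FUNC: under NVCC/HIP the macro expands to __host__ __device__, so Transform composition becomes callable from kernels; without it, a device-marked operator would call a host-only static helper and fail to compile. A self-contained sketch with a stand-in macro and stand-in types (not Eigen's actual classes):

    #if defined(__CUDACC__) || defined(__HIPCC__)
    #define DEVICE_FUNC __host__ __device__
    #else
    #define DEVICE_FUNC
    #endif

    struct Xform { float m[4]; };

    struct compose_impl {  // plays the role of transform_transform_product_impl
      static DEVICE_FUNC Xform run(const Xform& a, const Xform& b) {
        Xform r{};
        for (int i = 0; i < 4; ++i) r.m[i] = a.m[i] * b.m[i];  // placeholder math
        return r;
      }
    };

    // Usable from device code only because run() also carries the attribute.
    DEVICE_FUNC Xform compose(const Xform& a, const Xform& b) {
      return compose_impl::run(a, b);
    }

    int main() { Xform a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; compose(a, b); }
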
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index ced7595..f9f7979 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -274,8 +274,8 @@
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   manageErrorCode(error);
-  m_analysisIsOk = true;
-  m_factorizationIsOk = true;
+  m_analysisIsOk = m_info == Eigen::Success;
+  m_factorizationIsOk = m_info == Eigen::Success;
   m_isInitialized = true;
   return derived();
 }
@@ -296,7 +296,7 @@
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
   manageErrorCode(error);
-  m_analysisIsOk = true;
+  m_analysisIsOk = m_info == Eigen::Success;
   m_factorizationIsOk = false;
   m_isInitialized = true;
   return derived();
@@ -316,7 +316,7 @@
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
   manageErrorCode(error);
-  m_factorizationIsOk = true;
+  m_factorizationIsOk = m_info == Eigen::Success;
   return derived();
 }
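
Caller-visible effect of the Pardiso fix above (a sketch; requires MKL's PARDISO, and A/b are assumed to be a compatible system): previously the stage flags were set to true even when PARDISO returned an error, so info() could report failure while solve() still ran on the broken factorization.

    #include <Eigen/Dense>
    #include <Eigen/PardisoSupport>
    #include <Eigen/SparseCore>

    Eigen::VectorXd checked_solve(const Eigen::SparseMatrix<double>& A,
                                  const Eigen::VectorXd& b) {
      Eigen::PardisoLU<Eigen::SparseMatrix<double>> solver;
      solver.compute(A);                        // analyze + factorize
      if (solver.info() != Eigen::Success)      // stage flags now agree with this
        return Eigen::VectorXd::Zero(b.size()); // bail out instead of solving
      return solver.solve(b);
    }
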
 
diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h
index 5cac825..2e53114 100644
--- a/Eigen/src/SparseCore/SparseTriangularView.h
+++ b/Eigen/src/SparseCore/SparseTriangularView.h
@@ -152,8 +152,8 @@
         }
       }
 
-//       inline Index row() const { return (ArgType::Flags&RowMajorBit ? Base::outer() : this->index()); }
-//       inline Index col() const { return (ArgType::Flags&RowMajorBit ? this->index() : Base::outer()); }
+      inline Index row() const { return (ArgType::Flags&RowMajorBit ? Base::outer() : this->index()); }
+      inline Index col() const { return (ArgType::Flags&RowMajorBit ? this->index() : Base::outer()); }
       inline StorageIndex index() const
       {
         if(HasUnitDiag && m_returnOne)  return internal::convert_index<StorageIndex>(Base::outer());
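
A user-level path that exercises the restored row()/col() overloads: sparse assignment walks the triangular view's InnerIterator, whose coordinates previously fell back to the base-class versions that ignore the overridden index() shown above (notably around the unit-diagonal handling). A sketch, assuming A is an initialized SparseMatrix<double>:

    #include <Eigen/SparseCore>

    Eigen::SparseMatrix<double> lower_part(const Eigen::SparseMatrix<double>& A) {
      return A.triangularView<Eigen::Lower>();  // materializes via the iterator
    }
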
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
index 22cf346..da89857 100644
--- a/debug/msvc/eigen.natvis
+++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-
-  <!-- Fixed x Fixed Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
-      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
-      <Expand>
-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
-          <Rank>2</Rank>
-          <Size>$i==0 ? $T2 : $T3</Size>
-          <ValuePointer>m_storage.m_data.array</ValuePointer>
-        </ArrayItems>
-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
-          <Direction>Backward</Direction>
-          <Rank>2</Rank>
-          <Size>$i==0 ? $T2 : $T3</Size>
-          <ValuePointer>m_storage.m_data.array</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- 2 x 2 Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>
-      <DisplayString>[2, 2] (fixed matrix)</DisplayString>
-      <Expand>
-        <Synthetic Name="[row 0]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
-        </Synthetic>        
-      </Expand>
-  </Type>
-  
-  <!-- 3 x 3 Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>
-      <DisplayString>[3, 3] (fixed matrix)</DisplayString>
-      <Expand>
-        <Synthetic Name="[row 0]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 2]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
-        </Synthetic>        
-      </Expand>
-  </Type>
-  
-  <!-- 4 x 4 Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>
-      <DisplayString>[4, 4] (fixed matrix)</DisplayString>
-      <Expand>
-        <Synthetic Name="[row 0]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 2]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 3]" Condition="Flags%2">
-          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
-        </Synthetic>
-        <Synthetic Name="[row 3]" Condition="!(Flags%2)">
-          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
-        </Synthetic>        
-      </Expand>
-  </Type>  
-  
-  <!-- Dynamic x Dynamic Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">      
-      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
-      <Expand>
-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
-          <Rank>2</Rank>
-          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
-          <Direction>Backward</Direction>
-          <Rank>2</Rank>
-          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Fixed x Dynamic Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
-      <Expand>
-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
-          <Rank>2</Rank>
-          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
-          <Direction>Backward</Direction>
-          <Rank>2</Rank>
-          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Dynamic x Fixed Matrix -->
-  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
-      <Expand>
-        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
-          <Rank>2</Rank>
-          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
-          <Direction>Backward</Direction>
-          <Rank>2</Rank>
-          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Dynamic Column Vector -->
-  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
-      <Expand>
-        <Item Name="[size]">m_storage.m_cols</Item>
-        <ArrayItems>
-          <Size>m_storage.m_cols</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Dynamic Row Vector -->
-  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>
-      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
-      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
-      <Expand>
-        <Item Name="[size]">m_storage.m_rows</Item>
-        <ArrayItems>
-          <Size>m_storage.m_rows</Size>
-          <ValuePointer>m_storage.m_data</ValuePointer>
-        </ArrayItems>
-      </Expand>
-  </Type>
-  
-  <!-- Fixed Vector -->
-  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>
-      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
-      <Expand>
-        <Item Name="[x]">m_storage.m_data.array[0]</Item>
-      </Expand>
-  </Type>
-  
-  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>
-      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
-      <Expand>
-        <Item Name="[x]">m_storage.m_data.array[0]</Item>
-        <Item Name="[y]">m_storage.m_data.array[1]</Item>
-      </Expand>
-  </Type>
-  
-  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>
-      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
-      <Expand>
-        <Item Name="[x]">m_storage.m_data.array[0]</Item>
-        <Item Name="[y]">m_storage.m_data.array[1]</Item>
-        <Item Name="[z]">m_storage.m_data.array[2]</Item>
-      </Expand>
-  </Type>
-  
-    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">
-      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>
-      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>
-      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
-      <Expand>
-        <Item Name="[x]">m_storage.m_data.array[0]</Item>
-        <Item Name="[y]">m_storage.m_data.array[1]</Item>
-        <Item Name="[z]">m_storage.m_data.array[2]</Item>
-        <Item Name="[w]">m_storage.m_data.array[3]</Item>
-      </Expand>
-  </Type>
-
-</AutoVisualizer>
+<?xml version="1.0" encoding="utf-8"?>
+
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+  <!-- Fixed x Fixed Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,*,*,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
+      <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : $T3</Size>
+          <ValuePointer>m_storage.m_data.array</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : $T3</Size>
+          <ValuePointer>m_storage.m_data.array</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+
+  <!-- 2 x 2 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,2,2,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,2,2,*,*,*&gt;"/>
+      <DisplayString>[2, 2] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+      </Expand>
+  </Type>
+
+  <!-- 3 x 3 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,3,3,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,3,3,*,*,*&gt;"/>
+      <DisplayString>[3, 3] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
+        </Synthetic>
+      </Expand>
+  </Type>
+
+  <!-- 4 x 4 Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,4,4,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,4,4,*,*,*&gt;"/>
+      <DisplayString>[4, 4] (fixed matrix)</DisplayString>
+      <Expand>
+        <Synthetic Name="[row 0]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 3]" Condition="Flags%2">
+          <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
+        </Synthetic>
+        <Synthetic Name="[row 3]" Condition="!(Flags%2)">
+          <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
+        </Synthetic>
+      </Expand>
+  </Type>
+
+  <!-- Dynamic x Dynamic Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,-1,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+
+  <!-- Fixed x Dynamic Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,*,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,*,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+
+  <!-- Dynamic x Fixed Matrix -->
+  <Type Name="Eigen::Matrix&lt;*,-1,*,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,*,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
+      <Expand>
+        <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+        <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+          <Direction>Backward</Direction>
+          <Rank>2</Rank>
+          <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+
+  <!-- Dynamic Column Vector -->
+  <Type Name="Eigen::Matrix&lt;*,1,-1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,1,-1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
+      <Expand>
+        <Item Name="[size]">m_storage.m_cols</Item>
+        <ArrayItems>
+          <Size>m_storage.m_cols</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+
+  <!-- Dynamic Row Vector -->
+  <Type Name="Eigen::Matrix&lt;*,-1,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,-1,1,*,*,*&gt;"/>
+      <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+      <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
+      <Expand>
+        <Item Name="[size]">m_storage.m_rows</Item>
+        <ArrayItems>
+          <Size>m_storage.m_rows</Size>
+          <ValuePointer>m_storage.m_data</ValuePointer>
+        </ArrayItems>
+      </Expand>
+  </Type>
+
+  <!-- Fixed Vector -->
+  <Type Name="Eigen::Matrix&lt;*,1,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Array&lt;*,1,1,*,*,*&gt;"/>
+      <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+      </Expand>
+  </Type>
+
+  <Type Name="Eigen::Matrix&lt;*,2,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,2,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,2,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,2,*,*,*&gt;"/>
+      <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+      </Expand>
+  </Type>
+
+  <Type Name="Eigen::Matrix&lt;*,3,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,3,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,3,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,3,*,*,*&gt;"/>
+      <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+        <Item Name="[z]">m_storage.m_data.array[2]</Item>
+      </Expand>
+  </Type>
+
+    <Type Name="Eigen::Matrix&lt;*,4,1,*,*,*&gt;">
+      <AlternativeType Name="Eigen::Matrix&lt;*,1,4,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,4,1,*,*,*&gt;"/>
+      <AlternativeType Name="Eigen::Array&lt;*,1,4,*,*,*&gt;"/>
+      <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+      <Expand>
+        <Item Name="[x]">m_storage.m_data.array[0]</Item>
+        <Item Name="[y]">m_storage.m_data.array[1]</Item>
+        <Item Name="[z]">m_storage.m_data.array[2]</Item>
+        <Item Name="[w]">m_storage.m_data.array[3]</Item>
+      </Expand>
+  </Type>
+
+</AutoVisualizer>
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 273c10d..35ef580 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@
-; ***************************************************************
-; * Eigen Visualizer
-; *
-; * Author: Hauke Heibel <hauke.heibel@gmail.com>
-; *
-; * Support the enhanced debugging of the following Eigen
-; * types (*: any, +:fixed dimension) :
-; *
-; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
-; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
-; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
-; * - Eigen::Matrix<*,-1,-1,*,*,*>
-; * - Eigen::Matrix<*,+,-1,*,*,*>
-; * - Eigen::Matrix<*,-1,+,*,*,*>
-; * - Eigen::Matrix<*,+,+,*,*,*>
-; *
-; * Matrices are displayed properly independently of the memory
-; * alignment (RowMajor vs. ColMajor).
-; *
-; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
-; * that your original autoexp.dat file is copied to a safe 
-; * place before proceeding with its modification.
-; ***************************************************************
-
-[Visualizer]
-
-; Fixed size 4-vectors
-Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1],
-         z : ($c.m_storage.m_data.array)[2],
-         w : ($c.m_storage.m_data.array)[3]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        4,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 4),
-        ")"
-      )
-   )
-}
-
-; Fixed size 3-vectors
-Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1],
-         z : ($c.m_storage.m_data.array)[2]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        3,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 3),
-        ")"
-      )
-   )
-}
-
-; Fixed size 2-vectors
-Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0],
-         y : ($c.m_storage.m_data.array)[1]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        2,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 2),
-        ")"
-      )
-   )
-}
-
-; Fixed size 1-vectors
-Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
-   children
-   (
-      #(
-        [internals]: [$c,!],
-         x : ($c.m_storage.m_data.array)[0]
-      )
-   )
-
-   preview
-   (
-      #(
-        "[",
-        1,
-        "](",
-        #array(expr: $e.m_storage.m_data.array[$i], size: 1),
-        ")"
-      )
-   )
-}
-
-; Dynamic matrices (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,-1,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.m_storage.m_rows,
-         cols: $c.m_storage.m_cols,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.m_storage.m_rows,
-         ",",
-           $c.m_storage.m_cols,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.m_storage.m_rows*$c.m_storage.m_cols
-           ),
-         ")"
-      )
-   )
-}
-
-; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,-1,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.RowsAtCompileTime,
-         cols: $c.m_storage.m_cols,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.RowsAtCompileTime,
-         ",",
-           $c.m_storage.m_cols,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols
-           ),
-         ")"
-      )
-   )
-}
-
-; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,*,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.m_storage.m_rows,
-         cols: $c.ColsAtCompileTime,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data)[$i],
-                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.m_storage.m_rows,
-         ",",
-           $c.ColsAtCompileTime,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data)[$i],g],
-            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime
-           ),
-         ")"
-      )
-   )
-}
-
-; Fixed size matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,*,*,*,*>{
-  children
-   (
-      #(
-         [internals]: [$c,!],
-         rows: $c.RowsAtCompileTime,
-         cols: $c.ColsAtCompileTime,
-         ; Check for RowMajorBit
-         #if ($c.Flags & 0x1) (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
-             )
-         ) #else (
-             #array(
-                rank: 2,
-                base: 0,
-                expr: ($c.m_storage.m_data.array)[$i],
-                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
-             )
-         )
-      )
-   )
-
-   preview
-   (
-     #(
-         "[",
-           $c.RowsAtCompileTime,
-         ",",
-           $c.ColsAtCompileTime,
-         "](",
-           #array(
-            expr :    [($c.m_storage.m_data.array)[$i],g],
-            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime
-           ),
-         ")"
-      )
-   )
-}
+; ***************************************************************
+; * Eigen Visualizer
+; *
+; * Author: Hauke Heibel <hauke.heibel@gmail.com>
+; *
+; * Support the enhanced debugging of the following Eigen
+; * types (*: any, +:fixed dimension) :
+; *
+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
+; * - Eigen::Matrix<*,-1,-1,*,*,*>
+; * - Eigen::Matrix<*,+,-1,*,*,*>
+; * - Eigen::Matrix<*,-1,+,*,*,*>
+; * - Eigen::Matrix<*,+,+,*,*,*>
+; *
+; * Matrices are displayed properly independently of the memory
+; * alignment (RowMajor vs. ColMajor).
+; *
+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
+; * that your original autoexp.dat file is copied to a safe 
+; * place before proceeding with its modification.
+; ***************************************************************
+
+[Visualizer]
+
+; Fixed size 4-vectors
+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1],
+         z : ($c.m_storage.m_data.array)[2],
+         w : ($c.m_storage.m_data.array)[3]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        4,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 4),
+        ")"
+      )
+   )
+}
+
+; Fixed size 3-vectors
+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1],
+         z : ($c.m_storage.m_data.array)[2]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        3,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 3),
+        ")"
+      )
+   )
+}
+
+; Fixed size 2-vectors
+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0],
+         y : ($c.m_storage.m_data.array)[1]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        2,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 2),
+        ")"
+      )
+   )
+}
+
+; Fixed size 1-vectors
+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
+   children
+   (
+      #(
+        [internals]: [$c,!],
+         x : ($c.m_storage.m_data.array)[0]
+      )
+   )
+
+   preview
+   (
+      #(
+        "[",
+        1,
+        "](",
+        #array(expr: $e.m_storage.m_data.array[$i], size: 1),
+        ")"
+      )
+   )
+}
+
+; Dynamic matrices (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,-1,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.m_storage.m_rows,
+         cols: $c.m_storage.m_cols,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.m_storage.m_rows,
+         ",",
+           $c.m_storage.m_cols,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.m_storage.m_rows*$c.m_storage.m_cols
+           ),
+         ")"
+      )
+   )
+}
+
+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,-1,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.RowsAtCompileTime,
+         cols: $c.m_storage.m_cols,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.RowsAtCompileTime,
+         ",",
+           $c.m_storage.m_cols,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.RowsAtCompileTime*$c.m_storage.m_cols
+           ),
+         ")"
+      )
+   )
+}
+
+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,*,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.m_storage.m_rows,
+         cols: $c.ColsAtCompileTime,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)], 
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data)[$i],
+                size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.m_storage.m_rows,
+         ",",
+           $c.ColsAtCompileTime,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data)[$i],g],
+            size :    $c.m_storage.m_rows*$c.ColsAtCompileTime
+           ),
+         ")"
+      )
+   )
+}
+
+; Fixed size matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,*,*,*,*>{
+  children
+   (
+      #(
+         [internals]: [$c,!],
+         rows: $c.RowsAtCompileTime,
+         cols: $c.ColsAtCompileTime,
+         ; Check for RowMajorBit
+         #if ($c.Flags & 0x1) (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)], 
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+             )
+         ) #else (
+             #array(
+                rank: 2,
+                base: 0,
+                expr: ($c.m_storage.m_data.array)[$i],
+                size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+             )
+         )
+      )
+   )
+
+   preview
+   (
+     #(
+         "[",
+           $c.RowsAtCompileTime,
+         ",",
+           $c.ColsAtCompileTime,
+         "](",
+           #array(
+            expr :    [($c.m_storage.m_data.array)[$i],g],
+            size :    $c.RowsAtCompileTime*$c.ColsAtCompileTime
+           ),
+         ")"
+      )
+   )
+}
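
Note on the hunk above: the RowMajorBit branches in these visualizers all rely on the same index identity. The debugger's #array enumerates the linear index $i in column-major order (row = $i % rows, col = $i / rows), so row-major storage must be addressed at row*cols + col, which is exactly the expr in each #if branch. A minimal host-side sketch of that remapping, with hypothetical rows/cols values (illustration only, not part of the patch):

// Sketch: the expression data[($i % rows)*cols + ($i - $i % rows)/rows]
// converts a column-major enumeration index into a row-major storage offset.
#include <cassert>
#include <vector>

int main() {
  const int rows = 3, cols = 4;  // hypothetical fixed sizes
  std::vector<int> row_major(rows * cols);
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      row_major[r * cols + c] = r * 100 + c;  // tag each element with (r,c)

  for (int i = 0; i < rows * cols; ++i) {  // $i as the debugger enumerates it
    const int r = i % rows;                // $i % rows
    const int c = (i - i % rows) / rows;   // equals $i / rows
    assert(row_major[r * cols + c] == r * 100 + c);
  }
}
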
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 9e102b7..298619e 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -746,6 +746,11 @@
     // check sparse-triangular to dense
     refMat3 = m2.template triangularView<StrictlyUpper>();
     VERIFY_IS_APPROX(refMat3, DenseMatrix(refMat2.template triangularView<StrictlyUpper>()));
+
+    // check sparse triangular view iteration-based evaluation
+    m2.setZero();
+    VERIFY_IS_CWISE_EQUAL(m2.template triangularView<UnitLower>().toDense(), DenseMatrix::Identity(rows, cols));
+    VERIFY_IS_CWISE_EQUAL(m2.template triangularView<UnitUpper>().toDense(), DenseMatrix::Identity(rows, cols));
   }
   
   // test selfadjointView
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index 9cdf2c0..d487333 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -1 +1 @@
-#include "../../../Eigen/ThreadPool"
\ No newline at end of file
+#include "../../../Eigen/ThreadPool"  // IWYU pragma: export
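
The one-line change above adds an IWYU export annotation: it tells include-what-you-use that this wrapper header deliberately re-exports everything from the header it includes, so code using the legacy unsupported/ include path is not told to include Eigen/ThreadPool directly. The pattern in isolation, with hypothetical file names:

// legacy/Wrapper.h -- keeps an old include path alive.
// Without the pragma, IWYU treats the include as an implementation detail
// and asks users of Wrapper.h to include new/Impl.h themselves; with it,
// Wrapper.h officially provides everything new/Impl.h declares.
#include "new/Impl.h"  // IWYU pragma: export
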
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index cb74031..480e3ac 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -1118,7 +1118,7 @@
                                               : globalContractDimOffset + privateOffsetC)
                              : OutScalar(0);
 
-        outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+        outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
         privateOffsetNC += Properties::LocalThreadSizeNC;
       }
       privateOffsetC += Properties::LocalThreadSizeC;
@@ -1263,7 +1263,7 @@
     StorageIndex localid = itemID.get_local_id(0);
     OutScalar accumulator = OutScalar(0);
     for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
-      accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+      accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
     }
     auto out_scratch_ptr = scratch_ptr + localid;
     *out_scratch_ptr = accumulator;
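
The two hunks above replace cl::sycl::mad, which is only defined for the built-in floating-point types, with Eigen::internal::pmadd, which dispatches through Eigen's packet-math layer and therefore also covers Eigen::half. For plain scalars the generic fallback amounts to a fused multiply-add written out; a simplified sketch of the idea (illustrative name and signature, not the actual Eigen declaration):

// pmadd-style generic fallback: for scalar types it reduces to a*b + c, so
// any type with operator* and operator+ (including a half-precision wrapper)
// works unchanged; specializations can map it to a hardware fma/mad instead.
template <typename T>
T pmadd_like(const T& a, const T& b, const T& c) {
  return a * b + c;
}
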
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 1d40ae5..d41baf2 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -151,6 +151,7 @@
   ei_add_test(cxx11_tensor_argmax_sycl)
   ei_add_test(cxx11_tensor_custom_op_sycl)
   ei_add_test(cxx11_tensor_scan_sycl)
+  ei_add_test(cxx11_tensor_of_float16_sycl)
   set(EIGEN_SYCL OFF)
 endif()
 
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
index 41ea3cf..8f4e095 100644
--- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -32,9 +32,9 @@
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
   in.setRandom();
-  in *= in.constant(100.0);
-  in(0, 0, 0) = -1000.0;
-  in(1, 1, 1) = 1000.0;
+  in *= in.constant(static_cast<DataType>(100.0));
+  in(0, 0, 0) = static_cast<DataType>(-1000.0);
+  in(1, 1, 1) = static_cast<DataType>(1000.0);
 
   std::size_t in_bytes = in.size() * sizeof(DataType);
   std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
@@ -93,7 +93,7 @@
             ix[3] = l;
             // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
             // = 10.0
-            tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+            tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? -1.0 : 10.0);
           }
         }
       }
@@ -132,7 +132,7 @@
             ix[2] = k;
             ix[3] = l;
             // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
-            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+            tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0);
           }
         }
       }
@@ -180,7 +180,7 @@
             ix[2] = k;
             ix[3] = l;
             // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
-            tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+            tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? 1.0 : -10.0);
           }
         }
       }
@@ -219,7 +219,7 @@
             ix[2] = k;
             ix[3] = l;
             // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
-            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+            tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0);
           }
         }
       }
@@ -252,6 +252,7 @@
 
 EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<half>(device));
     CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 20f84b8..6ca5ac7 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -139,6 +139,7 @@
 
 EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_broadcast_test_per_device<half>(device));
     CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index be7afd2..c8668d8 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -28,26 +28,135 @@
 // Functions used to compare the TensorMap implementation on the device with
 // the equivalent on the host
 namespace SYCL {
-template <typename T> T abs(T x) { 
+
+template <typename T> T abs(T x) {
   return cl::sycl::abs(x);
 }
+template <> Eigen::half abs(Eigen::half x) {
+  return Eigen::half(cl::sycl::fabs(static_cast<cl::sycl::half>(x)));
+}
 
-template <> float abs(float x) { 
+template <> float abs(float x) {
   return cl::sycl::fabs(x);
 }
 
-template <> double abs(double x) { 
+template <> double abs(double x) {
   return cl::sycl::fabs(x);
 }
 
 template <typename T> T square(T x) { return x * x; }
 template <typename T> T cube(T x) { return x * x * x; }
 template <typename T> T inverse(T x) { return T(1) / x; }
-template <typename T> T cwiseMax(T x, T y) { 
-return cl::sycl::max(x, y);
+template <typename T> T cwiseMax(T x, T y) {
+  return cl::sycl::max(x, y);
 }
-template <typename T> T cwiseMin(T x, T y) { 
-  return cl::sycl::min(x, y); 
+template <> Eigen::half cwiseMax(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::max(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T cwiseMin(T x, T y) {
+  return cl::sycl::min(x, y);
+}
+template <> Eigen::half cwiseMin(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::min(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T sqrt(T x) {
+  return cl::sycl::sqrt(x);
+}
+template <> Eigen::half sqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::sqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T rsqrt(T x) {
+  return cl::sycl::rsqrt(x);
+}
+template <> Eigen::half rsqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::rsqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T tanh(T x) {
+  return cl::sycl::tanh(x);
+}
+template <> Eigen::half tanh(Eigen::half x) {
+  return Eigen::half(cl::sycl::tanh(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T exp(T x) {
+  return cl::sycl::exp(x);
+}
+template <> Eigen::half exp(Eigen::half x) {
+  return Eigen::half(cl::sycl::exp(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T expm1(T x) {
+  return cl::sycl::expm1(x);
+}
+template <> Eigen::half expm1(Eigen::half x) {
+  return Eigen::half(cl::sycl::expm1(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log(T x) {
+  return cl::sycl::log(x);
+}
+template <> Eigen::half log(Eigen::half x) {
+  return Eigen::half(cl::sycl::log(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T ceil(T x) {
+  return cl::sycl::ceil(x);
+}
+template <> Eigen::half ceil(Eigen::half x) {
+  return Eigen::half(cl::sycl::ceil(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T floor(T x) {
+  return cl::sycl::floor(x);
+}
+template <> Eigen::half floor(Eigen::half x) {
+  return Eigen::half(cl::sycl::floor(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T round(T x) {
+  return cl::sycl::round(x);
+}
+template <> Eigen::half round(Eigen::half x) {
+  return Eigen::half(cl::sycl::round(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log1p(T x) {
+  return cl::sycl::log1p(x);
+}
+template <> Eigen::half log1p(Eigen::half x) {
+  return Eigen::half(cl::sycl::log1p(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T sign(T x) {
+  return cl::sycl::sign(x);
+}
+template <> Eigen::half sign(Eigen::half x) {
+  return Eigen::half(cl::sycl::sign(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isnan(T x) {
+  return cl::sycl::isnan(x);
+}
+template <> Eigen::half isnan(Eigen::half x) {
+  return Eigen::half(cl::sycl::isnan(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isfinite(T x) {
+  return cl::sycl::isfinite(x);
+}
+template <> Eigen::half isfinite(Eigen::half x) {
+  return Eigen::half(cl::sycl::isfinite(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isinf(T x) {
+  return cl::sycl::isinf(x);
+}
+template <> Eigen::half isinf(Eigen::half x) {
+  return Eigen::half(cl::sycl::isinf(static_cast<cl::sycl::half>(x)));
 }
 }
 
@@ -157,15 +266,14 @@
 #define DECLARE_UNARY_STRUCT(FUNC)                                 \
   struct op_##FUNC {                                               \
     template <typename T>                                          \
-    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) {   \
-      return cl::sycl::FUNC(x);                                    \
+    auto operator()(const T& x) -> decltype(SYCL::FUNC(x)) {   \
+      return SYCL::FUNC(x);                                    \
     }                                                              \
     template <typename T>                                          \
     auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
       return x.FUNC();                                             \
     }                                                              \
-  };
-
+};
 
 DECLARE_UNARY_STRUCT(sqrt)
 DECLARE_UNARY_STRUCT(rsqrt)
@@ -390,8 +498,10 @@
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
-    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
-    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
-    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+    CALL_SUBTEST_1(test_builtin_unary_sycl<half>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<half>(sycl_device));
+    CALL_SUBTEST_3(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_4(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_5(test_integer_builtin_binary_sycl<int>(sycl_device));
   }
 }
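
Every Eigen::half specialization added in the hunk above follows the same three steps: convert to cl::sycl::half, call the SYCL built-in, convert back. If the list keeps growing, the boilerplate could be folded into a macro; a hypothetical sketch (not part of the patch):

// Hypothetical helper collapsing the repeated Eigen::half specializations.
#define SYCL_HALF_UNARY(FUNC)                                            \
  template <>                                                            \
  Eigen::half FUNC(Eigen::half x) {                                      \
    return Eigen::half(cl::sycl::FUNC(static_cast<cl::sycl::half>(x)));  \
  }

SYCL_HALF_UNARY(sqrt)  // expands to the sqrt specialization above
SYCL_HALF_UNARY(exp)
#undef SYCL_HALF_UNARY

Note that abs maps to cl::sycl::fabs for half, so that one would still need its hand-written specialization.
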
diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
index 1e70931..b018da8 100644
--- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -619,5 +619,6 @@
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_chipping_test_per_device<half>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
index 765991b..18dc95b 100644
--- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -175,6 +175,7 @@
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConcat_perDevice<half>(device));
     CALL_SUBTEST(tensorConcat_perDevice<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 2f75ef8..66a13e8 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -82,6 +82,7 @@
 
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<half>(device));
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
     CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
index db1c020..083c004 100644
--- a/unsupported/test/cxx11_tensor_image_op_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
@@ -95,6 +95,7 @@
 
 EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) { 
+    CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
    CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
index 521ae0c..75c2c0e 100644
--- a/unsupported/test/cxx11_tensor_inflation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -131,6 +131,7 @@
 EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_inflation_test_per_device<half>(device));
     CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
index 9546b91..f556d84 100644
--- a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -121,6 +121,7 @@
 EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<half>(device));
     CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp
index 029653e..84d638e 100644
--- a/unsupported/test/cxx11_tensor_math_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_math_sycl.cpp
@@ -100,6 +100,7 @@
 
 EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index bf001b4..a1545d4 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -381,6 +381,7 @@
 EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_morphing_test_per_device<half>(device));
     CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_of_float16_sycl.cpp b/unsupported/test/cxx11_tensor_of_float16_sycl.cpp
new file mode 100644
index 0000000..1624acc
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_sycl.cpp
@@ -0,0 +1,406 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023
+// Alejandro Acosta  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+#define EIGEN_SYCL_HALF_SUPPORT
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::SyclDevice;
+
+void test_gpu_numext(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  bool* d_res_half = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
+  bool* d_res_float = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
+
+  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float(d_float, num_elem);
+  Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
+  Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+  Tensor<bool, 1> half_prec(num_elem);
+  Tensor<bool, 1> full_prec(num_elem);
+
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half,num_elem * sizeof(bool));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float,num_elem * sizeof(bool));
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking numext " << i << std::endl;
+    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_conversion(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  Eigen::half* d_half = static_cast<Eigen::half*>(sycl_device.allocate(num_elem * sizeof(Eigen::half)));
+  float* d_conv = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random();
+  gpu_half.device(sycl_device) = gpu_float.cast<Eigen::half>();
+  gpu_conv.device(sycl_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  sycl_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+}
+
+void test_gpu_unary(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.abs();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_elementwise(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float1 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_float2 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_res_half = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_res_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+
+  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
+  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random();
+  gpu_float2.device(sycl_device) = gpu_float2.random();
+  gpu_res_float.device(sycl_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+  gpu_res_half.device(sycl_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half,num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float,num_elem * sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
+  }
+}
+
+void test_gpu_trancendental(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float3 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res1_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res1_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(sycl_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+  gpu_float3.device(sycl_device) = gpu_float3.random();
+  gpu_res1_float.device(sycl_device) = gpu_float1.exp().cast<Eigen::half>();
+  gpu_res2_float.device(sycl_device) = gpu_float2.log().cast<Eigen::half>();
+  gpu_res3_float.device(sycl_device) = gpu_float3.log1p().cast<Eigen::half>();
+  gpu_res4_float.device(sycl_device) = gpu_float3.expm1().cast<Eigen::half>();
+
+  gpu_res1_half.device(sycl_device) = gpu_float1.cast<Eigen::half>();
+  gpu_res1_half.device(sycl_device) = gpu_res1_half.exp();
+
+  gpu_res2_half.device(sycl_device) = gpu_float2.cast<Eigen::half>();
+  gpu_res2_half.device(sycl_device) = gpu_res2_half.log();
+
+  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(sycl_device) = gpu_res3_half.log1p();
+
+  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(sycl_device) = gpu_res3_half.expm1();
+
+  Tensor<float, 1> input1(num_elem);
+  Tensor<Eigen::half, 1> half_prec1(num_elem);
+  Tensor<Eigen::half, 1> full_prec1(num_elem);
+  Tensor<float, 1> input2(num_elem);
+  Tensor<Eigen::half, 1> half_prec2(num_elem);
+  Tensor<Eigen::half, 1> full_prec2(num_elem);
+  Tensor<float, 1> input3(num_elem);
+  Tensor<Eigen::half, 1> half_prec3(num_elem);
+  Tensor<Eigen::half, 1> full_prec3(num_elem);
+  sycl_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1
+      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+    else
+      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+  }
+}
+
+void test_gpu_contractions(const Eigen::SyclDevice &sycl_device) {
+  int rows = 23;
+  int cols = 23;
+  int num_elem = rows*cols;
+
+  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+      d_res_half, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+      d_res_float, rows, cols);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(sycl_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+
+  typedef typename Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  gpu_res_float.device(sycl_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+
+  Tensor<Eigen::half, 2> half_prec(rows, cols);
+  Tensor<Eigen::half, 2> full_prec(rows, cols);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+      }
+    }
+  }
+}
+
+void test_gpu_reductions(const Eigen::SyclDevice &sycl_device, int size1, int size2, int redux) {
+   std::cout << "Reducing " << size1 << " by " << size2
+             << " tensor along dim " << redux << std::endl;
+
+  int num_elem = size1*size2;
+  int result_size = (redux == 1 ? size1 : size2);
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, result_size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, result_size);
+
+  gpu_float.device(sycl_device) = gpu_float.random() * 2.0f;
+
+  Eigen::array<int, 1> redux_dim = {redux};
+  gpu_res_float.device(sycl_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
+
+  Tensor<Eigen::half, 1> half_prec(result_size);
+  Tensor<Eigen::half, 1> full_prec(result_size);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < result_size; ++i) {
+    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_reductions(const Eigen::SyclDevice &sycl_device) {
+  test_gpu_reductions(sycl_device, 13, 13, 0);
+  test_gpu_reductions(sycl_device, 13, 13, 1);
+
+  test_gpu_reductions(sycl_device, 35, 36, 0);
+  test_gpu_reductions(sycl_device, 35, 36, 1);
+
+  test_gpu_reductions(sycl_device, 36, 35, 0);
+  test_gpu_reductions(sycl_device, 36, 35, 1);
+}
+
+void test_gpu_full_reductions(const Eigen::SyclDevice &sycl_device) {
+  int size = 13;
+  int num_elem = size*size;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size, size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+      d_res_half);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+      d_res_float);
+
+  gpu_float.device(sycl_device) = gpu_float.random();
+
+  gpu_res_float.device(sycl_device) = gpu_float.sum().cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum();
+
+  Tensor<Eigen::half, 0> half_prec;
+  Tensor<Eigen::half, 0> full_prec;
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_res_float.device(sycl_device) = gpu_float.maximum().cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().maximum();
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+}
+
+void test_gpu_forced_evals(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+      d_res_half1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+      d_res_half2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  Eigen::array<int, 1> no_bcast;
+  no_bcast[0] = 1;
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.abs();
+  gpu_res_half1.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half2.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+
+  Tensor<float, 1> half_prec1(num_elem);
+  Tensor<float, 1> half_prec2(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_sycl)
+{
+  for (const auto& s : Eigen::get_sycl_supported_devices()) {
+    QueueInterface queueInterface(s);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+
+    CALL_SUBTEST_1(test_gpu_numext(sycl_device));
+    CALL_SUBTEST_1(test_gpu_conversion(sycl_device));
+    CALL_SUBTEST_1(test_gpu_unary(sycl_device));
+    CALL_SUBTEST_1(test_gpu_elementwise(sycl_device));
+    CALL_SUBTEST_1(test_gpu_trancendental(sycl_device));
+    CALL_SUBTEST_2(test_gpu_contractions(sycl_device));
+    CALL_SUBTEST_3(test_gpu_reductions(sycl_device));
+    CALL_SUBTEST_4(test_gpu_full_reductions(sycl_device));
+    CALL_SUBTEST_5(test_gpu_forced_evals(sycl_device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
index 727a9ff..9b16bb1 100644
--- a/unsupported/test/cxx11_tensor_padding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -152,6 +152,7 @@
 EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<half>(device));
     CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
index 7f92bec..1216ad2 100644
--- a/unsupported/test/cxx11_tensor_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -244,6 +244,7 @@
 EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_patch_test_per_device<half>(device));
     CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
index 14a7c48..dcdfdd3 100644
--- a/unsupported/test/cxx11_tensor_random_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -78,6 +78,7 @@
 EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_random_test_per_device<half>(device));
     CALL_SUBTEST(sycl_random_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_random_test_per_device<double>(device));
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
index dd30c23..5ed007f 100644
--- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -248,6 +248,7 @@
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
 #endif
-    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<half>(device));
+    CALL_SUBTEST_6(sycl_reverse_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
index ca4e8b5..70c4116 100644
--- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -112,6 +112,7 @@
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_shuffling_test_per_device<half>(device));
     CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
index 8d99a48..dc86dfe 100644
--- a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -217,6 +217,7 @@
 EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
 {
 for (const auto& device :Eigen::get_sycl_supported_devices()) {
+  CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<half>(device));
   CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
 }
 }