Update Eigen to commit:e8515f78ac098329ab9f8cab21c87caede090a3f
CHANGELOG
=========
e8515f78a - Fix sparse triangular view iterator
6d829e766 - Fix extra semicolon in XprHelper
ba47341a1 - [SYCL-2020] Enabling half precision support for SYCL.
92a77a596 - Fix call to static functions from device by adding EIGEN_DEVICE_FUNC attribute to run methods
8f858a4ea - Export ThreadPool symbols from legacy header.
4e598ad25 - New panel modes for GEMM MMA (real & complex).
2c64a655f - Stage will not be ok if pardiso returned error
PiperOrigin-RevId: 571952024
Change-Id: Ic4ba7448c04e842e42dfb20a19f7cc146a00f66f
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index fc11174..faa3853 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,870 +1,870 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-//
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
-#define EIGEN_PACKET_MATH_FP16_AVX512_H
-
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
+#define EIGEN_PACKET_MATH_FP16_AVX512_H
+
// IWYU pragma: private
-#include "../../InternalHeaderCheck.h"
-
-namespace Eigen {
-
-namespace internal {
-
-typedef __m512h Packet32h;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
-
-template <>
-struct is_arithmetic<Packet8h> {
- enum { value = true };
-};
-
-template <>
-struct packet_traits<half> : default_packet_traits {
- typedef Packet32h type;
- typedef Packet16h half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 32,
-
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
- HasSetLinear = 0,
- HasLog = 1,
- HasLog1p = 1,
- HasExp = 1,
- HasExpm1 = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- // These ones should be implemented in future
- HasBessel = 0,
- HasNdtri = 0,
- HasSin = EIGEN_FAST_MATH,
- HasCos = EIGEN_FAST_MATH,
- HasTanh = EIGEN_FAST_MATH,
- HasErf = 0, // EIGEN_FAST_MATH,
- HasBlend = 0,
- HasRound = 1,
- HasFloor = 1,
- HasCeil = 1,
- HasRint = 1
- };
-};
-
-template <>
-struct unpacket_traits<Packet32h> {
- typedef Eigen::half type;
- typedef Packet16h half;
- enum {
- size = 32,
- alignment = Aligned64,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-template <>
-struct unpacket_traits<Packet16h> {
- typedef Eigen::half type;
- typedef Packet8h half;
- enum {
- size = 16,
- alignment = Aligned32,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-template <>
-struct unpacket_traits<Packet8h> {
- typedef Eigen::half type;
- typedef Packet8h half;
- enum {
- size = 8,
- alignment = Aligned16,
- vectorizable = true,
- masked_load_available = false,
- masked_store_available = false
- };
-};
-
-// Memory functions
-
-// pset1
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
- return _mm512_set1_ph(static_cast<_Float16>(from));
-}
-
-// pset1frombits
-template <>
-EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
- return _mm512_castsi512_ph(_mm512_set1_epi16(from));
-}
-
-// pfirst
-
-template <>
-EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- return half_impl::raw_uint16_to_half(
- static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
-#else
- Eigen::half dest[32];
- _mm512_storeu_ph(dest, from);
- return dest[0];
-#endif
-}
-
-// pload
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
-}
-
-// ploadu
-
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
-}
-
-// pstore
-
-template <>
-EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
-}
-
-// pstoreu
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
-}
-
-// ploaddup
-template <>
-EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
- __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
- return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
- 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
- a);
-}
-
-// ploadquad
-template <>
-EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
- __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
- return _mm512_permutexvar_ph(
- _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
- a);
-}
-
-// pabs
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
- return _mm512_abs_ph(a);
-}
-
-// psignbit
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
- return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
-}
-
-// pmin
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_min_ph(a, b);
-}
-
-// pmax
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_max_ph(a, b);
-}
-
-// plset
-template <>
-EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
- return _mm512_add_ph(_mm512_set1_ph(a),
- _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
- 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
- 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
-}
-
-// por
-
-template <>
-EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pxor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pand
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
-}
-
-// pandnot
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
- return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
-}
-
-// pselect
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
- __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
- return _mm512_mask_blend_ph(mask32, a, b);
-}
-
-// pcmp_eq
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_le
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
-}
-
-// pcmp_lt_or_nan
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
- __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
- return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
-}
-
-// padd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_add_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// psub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_sub_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pmul
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_mul_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pdiv
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
- return _mm512_div_ph(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
- return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
- return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
-}
-
-// pround
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
- // Work-around for default std::round rounding mode.
-
- // Mask for the sign bit
- const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
- // The largest half-preicision float less than 0.5
- const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
-
- return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-
-// print
-
-template <>
-EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
-}
-
-// pceil
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
-}
-
-// pfloor
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
- return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
-}
-
-// predux
-template <>
-EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
- return (half)_mm512_reduce_add_ph(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
- return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
-}
-
-template <>
-EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
- return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
-}
-
-// predux_half_dowto4
-template <>
-EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
- __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
-
- return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#else
- Eigen::half data[32];
- _mm512_storeu_ph(data, a);
-
- __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
- __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
-
- return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#endif
-}
-
-// predux_max
-
-// predux_min
-
-// predux_mul
-
-#ifdef EIGEN_VECTORIZE_FMA
-
-// pmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmadd
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fnmadd_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-// pnmsub
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
- return _mm512_fnmsub_ph(a, b, c);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
- return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
- return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
-}
-
-#endif
-
-// pnegate
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
- return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
-}
-
-// pconj
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
- return a;
-}
-
-// psqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
- return _mm512_sqrt_ph(a);
-}
-
-// prsqrt
-
-template <>
-EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
- return _mm512_rsqrt_ph(a);
-}
-
-// preciprocal
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
- return _mm512_rcp_ph(a);
-}
-
-// ptranspose
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
- __m512i t[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 16; i++) {
- t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
- t[2 * i + 1] =
- _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
- }
-
- __m512i p[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 8; i++) {
- p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
- p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
- p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
- p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
- }
-
- __m512i q[32];
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 4; i++) {
- q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
- q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
- q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
- q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
- q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
- q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
- q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
- q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
- }
-
- __m512i f[32];
-
-#define PACKET32H_TRANSPOSE_HELPER(X, Y) \
- do { \
- f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
- f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
- f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
- f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
- f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
- f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
- f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
- f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
- } while (false);
-
- PACKET32H_TRANSPOSE_HELPER(0, 0);
- PACKET32H_TRANSPOSE_HELPER(1, 1);
- PACKET32H_TRANSPOSE_HELPER(2, 2);
- PACKET32H_TRANSPOSE_HELPER(3, 3);
-
- PACKET32H_TRANSPOSE_HELPER(1, 0);
- PACKET32H_TRANSPOSE_HELPER(2, 0);
- PACKET32H_TRANSPOSE_HELPER(3, 0);
- PACKET32H_TRANSPOSE_HELPER(2, 1);
- PACKET32H_TRANSPOSE_HELPER(3, 1);
- PACKET32H_TRANSPOSE_HELPER(3, 2);
-
- PACKET32H_TRANSPOSE_HELPER(0, 1);
- PACKET32H_TRANSPOSE_HELPER(0, 2);
- PACKET32H_TRANSPOSE_HELPER(0, 3);
- PACKET32H_TRANSPOSE_HELPER(1, 2);
- PACKET32H_TRANSPOSE_HELPER(1, 3);
- PACKET32H_TRANSPOSE_HELPER(2, 3);
-
-#undef PACKET32H_TRANSPOSE_HELPER
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 32; i++) {
- a.packet[i] = _mm512_castsi512_ph(f[i]);
- }
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
- __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
- t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
- t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
- t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
- t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
-
- p0 = _mm512_unpacklo_epi32(t0, t2);
- p1 = _mm512_unpackhi_epi32(t0, t2);
- p2 = _mm512_unpacklo_epi32(t1, t3);
- p3 = _mm512_unpackhi_epi32(t1, t3);
-
- a0 = p0;
- a1 = p1;
- a2 = p2;
- a3 = p3;
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
-
- a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
-
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
-
- a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
-
- a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
- a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
-
- a.packet[0] = _mm512_castsi512_ph(a0);
- a.packet[1] = _mm512_castsi512_ph(a1);
- a.packet[2] = _mm512_castsi512_ph(a2);
- a.packet[3] = _mm512_castsi512_ph(a3);
-}
-
-// preverse
-
-template <>
-EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
- return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
- a);
-}
-
-// pscatter
-
-template <>
-EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
- EIGEN_ALIGN64 half aux[32];
- pstore(aux, from);
-
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < 32; i++) {
- to[stride * i] = aux[i];
- }
-}
-
-// pgather
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
- return _mm512_castsi512_ph(_mm512_set_epi16(
- from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
- from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
- from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
- from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
- from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
- from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
- from[1 * stride].x, from[0 * stride].x));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
-
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
- __m512d result = _mm512_undefined_pd();
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
- return _mm512_castpd_ph(result);
-}
-
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
- a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
- b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
-}
-
-// psin
-template <>
-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = psin(low);
- Packet16h highOut = psin(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pcos
-template <>
-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pcos(low);
- Packet16h highOut = pcos(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog
-template <>
-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog(low);
- Packet16h highOut = plog(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog2
-template <>
-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog2(low);
- Packet16h highOut = plog2(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// plog1p
-template <>
-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = plog1p(low);
- Packet16h highOut = plog1p(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pexp(low);
- Packet16h highOut = pexp(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pexpm1
-template <>
-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = pexpm1(low);
- Packet16h highOut = pexpm1(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// ptanh
-template <>
-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h lowOut = ptanh(low);
- Packet16h highOut = ptanh(high);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pfrexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h exp1 = _mm256_undefined_si256();
- Packet16h exp2 = _mm256_undefined_si256();
-
- Packet16h lowOut = pfrexp(low, exp1);
- Packet16h highOut = pfrexp(high, exp2);
-
- exponent = combine2Packet16h(exp1, exp2);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-// pldexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
- Packet16h low;
- Packet16h high;
- extract2Packet16h(a, low, high);
-
- Packet16h exp1;
- Packet16h exp2;
- extract2Packet16h(exponent, exp1, exp2);
-
- Packet16h lowOut = pldexp(low, exp1);
- Packet16h highOut = pldexp(high, exp2);
-
- return combine2Packet16h(lowOut, highOut);
-}
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_FP16_AVX512_H
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+typedef __m512h Packet32h;
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+
+template <>
+struct is_arithmetic<Packet8h> {
+ enum { value = true };
+};
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+ typedef Packet32h type;
+ typedef Packet16h half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 32,
+
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
+ HasSetLinear = 0,
+ HasLog = 1,
+ HasLog1p = 1,
+ HasExp = 1,
+ HasExpm1 = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ // These ones should be implemented in future
+ HasBessel = 0,
+ HasNdtri = 0,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasTanh = EIGEN_FAST_MATH,
+ HasErf = 0, // EIGEN_FAST_MATH,
+ HasBlend = 0,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet32h> {
+ typedef Eigen::half type;
+ typedef Packet16h half;
+ enum {
+ size = 32,
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+struct unpacket_traits<Packet16h> {
+ typedef Eigen::half type;
+ typedef Packet8h half;
+ enum {
+ size = 16,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+template <>
+struct unpacket_traits<Packet8h> {
+ typedef Eigen::half type;
+ typedef Packet8h half;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+};
+
+// Memory functions
+
+// pset1
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
+ return _mm512_set1_ph(static_cast<_Float16>(from));
+}
+
+// pset1frombits
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
+ return _mm512_castsi512_ph(_mm512_set1_epi16(from));
+}
+
+// pfirst
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ return half_impl::raw_uint16_to_half(
+ static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
+#else
+ Eigen::half dest[32];
+ _mm512_storeu_ph(dest, from);
+ return dest[0];
+#endif
+}
+
+// pload
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
+}
+
+// ploadu
+
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
+}
+
+// pstore
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
+}
+
+// pstoreu
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
+}
+
+// ploaddup
+template <>
+EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
+ __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
+ return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+ 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
+ a);
+}
+
+// ploadquad
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
+ __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
+ return _mm512_permutexvar_ph(
+ _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
+ a);
+}
+
+// pabs
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
+ return _mm512_abs_ph(a);
+}
+
+// psignbit
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
+ return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
+}
+
+// pmin
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_min_ph(a, b);
+}
+
+// pmax
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_max_ph(a, b);
+}
+
+// plset
+template <>
+EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
+ return _mm512_add_ph(_mm512_set1_ph(a),
+ _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
+ 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
+ 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+}
+
+// por
+
+template <>
+EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pxor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pand
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+// pandnot
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
+ return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
+}
+
+// pselect
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+ return _mm512_mask_blend_ph(mask32, a, b);
+}
+
+// pcmp_eq
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_le
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+}
+
+// pcmp_lt_or_nan
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
+}
+
+// padd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_add_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// psub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_sub_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pmul
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_mul_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pdiv
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
+ return _mm512_div_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+}
+
+// pround
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
+ // Work-around for default std::round rounding mode.
+
+ // Mask for the sign bit
+ const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
+ // The largest half-preicision float less than 0.5
+ const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
+
+ return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+// print
+
+template <>
+EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+// pceil
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+// pfloor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// predux
+template <>
+EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
+ return (half)_mm512_reduce_add_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
+ return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
+ return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
+}
+
+// predux_half_dowto4
+template <>
+EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
+ __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+
+ return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#else
+ Eigen::half data[32];
+ _mm512_storeu_ph(data, a);
+
+ __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
+ __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
+
+ return Packet16h(padd<Packet16h>(lowHalf, highHalf));
+#endif
+}
+
+// predux_max
+
+// predux_min
+
+// predux_mul
+
+#ifdef EIGEN_VECTORIZE_FMA
+
+// pmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fnmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+// pnmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+ return _mm512_fnmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+ return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+ return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+}
+
+#endif
+
+// pnegate
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
+ return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
+}
+
+// pconj
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
+ return a;
+}
+
+// psqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
+ return _mm512_sqrt_ph(a);
+}
+
+// prsqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
+ return _mm512_rsqrt_ph(a);
+}
+
+// preciprocal
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
+ return _mm512_rcp_ph(a);
+}
+
+// ptranspose
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
+ __m512i t[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 16; i++) {
+ t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+ t[2 * i + 1] =
+ _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+ }
+
+ __m512i p[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 8; i++) {
+ p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
+ p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
+ p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
+ p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
+ }
+
+ __m512i q[32];
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 4; i++) {
+ q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
+ q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
+ q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
+ q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
+ q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
+ q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
+ q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
+ q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
+ }
+
+ __m512i f[32];
+
+#define PACKET32H_TRANSPOSE_HELPER(X, Y) \
+ do { \
+ f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
+ f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
+ f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
+ f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
+ f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
+ f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
+ f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
+ f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
+ } while (false);
+
+ PACKET32H_TRANSPOSE_HELPER(0, 0);
+ PACKET32H_TRANSPOSE_HELPER(1, 1);
+ PACKET32H_TRANSPOSE_HELPER(2, 2);
+ PACKET32H_TRANSPOSE_HELPER(3, 3);
+
+ PACKET32H_TRANSPOSE_HELPER(1, 0);
+ PACKET32H_TRANSPOSE_HELPER(2, 0);
+ PACKET32H_TRANSPOSE_HELPER(3, 0);
+ PACKET32H_TRANSPOSE_HELPER(2, 1);
+ PACKET32H_TRANSPOSE_HELPER(3, 1);
+ PACKET32H_TRANSPOSE_HELPER(3, 2);
+
+ PACKET32H_TRANSPOSE_HELPER(0, 1);
+ PACKET32H_TRANSPOSE_HELPER(0, 2);
+ PACKET32H_TRANSPOSE_HELPER(0, 3);
+ PACKET32H_TRANSPOSE_HELPER(1, 2);
+ PACKET32H_TRANSPOSE_HELPER(1, 3);
+ PACKET32H_TRANSPOSE_HELPER(2, 3);
+
+#undef PACKET32H_TRANSPOSE_HELPER
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 32; i++) {
+ a.packet[i] = _mm512_castsi512_ph(f[i]);
+ }
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
+ __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
+ t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+ t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+ t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+ t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+
+ p0 = _mm512_unpacklo_epi32(t0, t2);
+ p1 = _mm512_unpackhi_epi32(t0, t2);
+ p2 = _mm512_unpacklo_epi32(t1, t3);
+ p3 = _mm512_unpackhi_epi32(t1, t3);
+
+ a0 = p0;
+ a1 = p1;
+ a2 = p2;
+ a3 = p3;
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
+
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
+
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
+
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
+
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
+
+ a.packet[0] = _mm512_castsi512_ph(a0);
+ a.packet[1] = _mm512_castsi512_ph(a1);
+ a.packet[2] = _mm512_castsi512_ph(a2);
+ a.packet[3] = _mm512_castsi512_ph(a3);
+}
+
+// preverse
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
+ return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
+ a);
+}
+
+// pscatter
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
+ EIGEN_ALIGN64 half aux[32];
+ pstore(aux, from);
+
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < 32; i++) {
+ to[stride * i] = aux[i];
+ }
+}
+
+// pgather
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
+ return _mm512_castsi512_ph(_mm512_set_epi16(
+ from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
+ from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
+ from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
+ from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
+ from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
+ from[1 * stride].x, from[0 * stride].x));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
+
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+ __m512d result = _mm512_undefined_pd();
+ result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
+ result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
+ return _mm512_castpd_ph(result);
+}
+
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+ a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
+ b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
+}
+
+// psin
+template <>
+EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = psin(low);
+ Packet16h highOut = psin(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pcos
+template <>
+EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pcos(low);
+ Packet16h highOut = pcos(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog
+template <>
+EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog(low);
+ Packet16h highOut = plog(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog2
+template <>
+EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog2(low);
+ Packet16h highOut = plog2(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// plog1p
+template <>
+EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = plog1p(low);
+ Packet16h highOut = plog1p(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pexp(low);
+ Packet16h highOut = pexp(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pexpm1
+template <>
+EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = pexpm1(low);
+ Packet16h highOut = pexpm1(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// ptanh
+template <>
+EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h lowOut = ptanh(low);
+ Packet16h highOut = ptanh(high);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pfrexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h exp1 = _mm256_undefined_si256();
+ Packet16h exp2 = _mm256_undefined_si256();
+
+ Packet16h lowOut = pfrexp(low, exp1);
+ Packet16h highOut = pfrexp(high, exp2);
+
+ exponent = combine2Packet16h(exp1, exp2);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+// pldexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+ Packet16h low;
+ Packet16h high;
+ extract2Packet16h(a, low, high);
+
+ Packet16h exp1;
+ Packet16h exp2;
+ extract2Packet16h(exponent, exp1, exp2);
+
+ Packet16h lowOut = pldexp(low, exp1);
+ Packet16h highOut = pldexp(high, exp2);
+
+ return combine2Packet16h(lowOut, highOut);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index b232a8f..e9a9307 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -49,11 +49,6 @@
#include "MatrixProductMMA.h"
#endif
-/**************************************************************************************************
- * TODO *
- * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could). *
- * - Check the possibility of transposing as GETREAL and GETIMAG when needed. *
- **************************************************************************************************/
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
@@ -120,6 +115,16 @@
20, 21, 22, 23,
28, 29, 30, 31};
+const static Packet16uc p16uc_GETREAL32b = { 0, 1, 2, 3,
+ 16, 17, 18, 19,
+ 8, 9, 10, 11,
+ 24, 25, 26, 27};
+
+const static Packet16uc p16uc_GETIMAG32b = { 4, 5, 6, 7,
+ 20, 21, 22, 23,
+ 12, 13, 14, 15,
+ 28, 29, 30, 31};
+
/*********************************************
* Single precision real and complex packing *
* *******************************************/
@@ -440,6 +445,78 @@
// General template for lhs & rhs complex packing.
template<typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
struct dhs_cpack {
+ template<bool transpose>
+ EIGEN_ALWAYS_INLINE void dhs_cblock(PacketBlock<PacketC,8>& cblock, PacketBlock<Packet,4>& block, Packet16uc permute)
+ {
+ if (transpose) {
+ block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, permute);
+ block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, permute);
+ block.packet[2] = vec_perm(cblock.packet[4].v, cblock.packet[5].v, permute);
+ block.packet[3] = vec_perm(cblock.packet[6].v, cblock.packet[7].v, permute);
+
+ Packet4f t0, t1, t2, t3;
+#ifdef EIGEN_VECTORIZE_VSX
+ t0 = reinterpret_cast<Packet>(vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+ t1 = reinterpret_cast<Packet>(vec_mergel(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+ t2 = reinterpret_cast<Packet>(vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+ t3 = reinterpret_cast<Packet>(vec_mergel(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+#else
+ t0 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_HI));
+ t1 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_LO));
+ t2 = reinterpret_cast<Packet>(vec_perm(block.packet[2], block.packet[3], p16uc_TRANSPOSE64_HI));
+ t3 = reinterpret_cast<Packet>(vec_perm(block.packet[2], block.packet[3], p16uc_TRANSPOSE64_LO));
+#endif
+
+ block.packet[0] = t0;
+ block.packet[1] = t1;
+ block.packet[2] = t2;
+ block.packet[3] = t3;
+ } else {
+ block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, permute);
+ block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, permute);
+ block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, permute);
+ block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, permute);
+ }
+ }
+
+ EIGEN_ALWAYS_INLINE void dhs_ccopy(Scalar* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
+ {
+ PacketBlock<Packet,4> blockr, blocki;
+ PacketBlock<PacketC,8> cblock;
+
+ for(; i + vectorSize <= depth; i+=vectorSize)
+ {
+ if (UseLhs) {
+ bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
+ } else {
+ bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
+ }
+
+ if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
+ {
+ dhs_cblock<true>(cblock, blockr, p16uc_GETREAL32b);
+ dhs_cblock<true>(cblock, blocki, p16uc_GETIMAG32b);
+ } else {
+ dhs_cblock<false>(cblock, blockr, p16uc_GETREAL32);
+ dhs_cblock<false>(cblock, blocki, p16uc_GETIMAG32);
+ }
+
+ if(Conjugate)
+ {
+ blocki.packet[0] = -blocki.packet[0];
+ blocki.packet[1] = -blocki.packet[1];
+ blocki.packet[2] = -blocki.packet[2];
+ blocki.packet[3] = -blocki.packet[3];
+ }
+
+ storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
+ storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
+
+ rir += 4*vectorSize;
+ rii += 4*vectorSize;
+ }
+ }
+
EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
const Index vectorSize = quad_traits<Scalar>::vectorsize;
@@ -455,47 +532,8 @@
rii = rir + vectorDelta;
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- PacketBlock<Packet,4> blockr, blocki;
- PacketBlock<PacketC,8> cblock;
+ dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
- if (UseLhs) {
- bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
- } else {
- bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
- }
-
- blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
- blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
- blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
- blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
-
- blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
- blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
- blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
- blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
-
- if(Conjugate)
- {
- blocki.packet[0] = -blocki.packet[0];
- blocki.packet[1] = -blocki.packet[1];
- blocki.packet[2] = -blocki.packet[2];
- blocki.packet[3] = -blocki.packet[3];
- }
-
- if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
- {
- ptranspose(blockr);
- ptranspose(blocki);
- }
-
- storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
- storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
-
- rir += 4*vectorSize;
- rii += 4*vectorSize;
- }
for(; i < depth; i++)
{
PacketBlock<Packet,1> blockr, blocki;
@@ -592,6 +630,36 @@
// General template for lhs & rhs packing.
template<typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
struct dhs_pack{
+ template<Index n>
+ EIGEN_ALWAYS_INLINE void dhs_copy(Scalar* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
+ {
+ PacketBlock<Packet,4> block[n];
+
+ for(; i + n*vectorSize <= depth; i+=n*vectorSize)
+ {
+ for (Index k = 0; k < n; k++) {
+ if (UseLhs) {
+ bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, 0, i + k*vectorSize);
+ } else {
+ bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, i + k*vectorSize, 0);
+ }
+ }
+
+ if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
+ {
+ for (Index k = 0; k < n; k++) {
+ ptranspose(block[k]);
+ }
+ }
+
+ for (Index k = 0; k < n; k++) {
+ storeBlock<Scalar, Packet, 4>(blockA + ri + k*4*vectorSize, block[k]);
+ }
+
+ ri += n*4*vectorSize;
+ }
+ }
+
EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
const Index vectorSize = quad_traits<Scalar>::vectorsize;
@@ -604,24 +672,10 @@
if(PanelMode) ri += vectorSize*offset;
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- PacketBlock<Packet,4> block;
+ dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
+ dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
+ dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
- if (UseLhs) {
- bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block, lhs2, 0, i);
- } else {
- bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block, lhs2, i, 0);
- }
- if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
- {
- ptranspose(block);
- }
-
- storeBlock<Scalar, Packet, 4>(blockA + ri, block);
-
- ri += 4*vectorSize;
- }
for(; i < depth; i++)
{
if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
@@ -691,6 +745,39 @@
template<typename DataMapper, int StorageOrder, bool PanelMode>
struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true>
{
+ template<Index n>
+ EIGEN_ALWAYS_INLINE void dhs_copy(double* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
+ {
+ PacketBlock<Packet2d,2> block[n];
+
+ for(; i + n*vectorSize <= depth; i+=n*vectorSize)
+ {
+ for (Index k = 0; k < n; k++) {
+ if(StorageOrder == RowMajor)
+ {
+ block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize);
+ block[k].packet[1] = lhs2.template loadPacket<Packet2d>(1, i + k*vectorSize);
+ } else {
+ block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize + 0);
+ block[k].packet[1] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize + 1);
+ }
+ }
+
+ if(StorageOrder == RowMajor)
+ {
+ for (Index k = 0; k < n; k++) {
+ ptranspose(block[k]);
+ }
+ }
+
+ for (Index k = 0; k < n; k++) {
+ storeBlock<double, Packet2d, 2>(blockA + ri + k*2*vectorSize, block[k]);
+ }
+
+ ri += n*2*vectorSize;
+ }
+ }
+
EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
const Index vectorSize = quad_traits<double>::vectorsize;
@@ -703,24 +790,10 @@
if(PanelMode) ri += vectorSize*offset;
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- PacketBlock<Packet2d,2> block;
- if(StorageOrder == RowMajor)
- {
- block.packet[0] = lhs2.template loadPacket<Packet2d>(0, i);
- block.packet[1] = lhs2.template loadPacket<Packet2d>(1, i);
+ dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
+ dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
+ dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
- ptranspose(block);
- } else {
- block.packet[0] = lhs2.template loadPacket<Packet2d>(0, i + 0);
- block.packet[1] = lhs2.template loadPacket<Packet2d>(0, i + 1);
- }
-
- storeBlock<double, Packet2d, 2>(blockA + ri, block);
-
- ri += 2*vectorSize;
- }
for(; i < depth; i++)
{
if(StorageOrder == RowMajor)
@@ -759,6 +832,53 @@
template<typename DataMapper, int StorageOrder, bool PanelMode>
struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false>
{
+ template<Index n>
+ EIGEN_ALWAYS_INLINE void dhs_copy(double* blockB, const DataMapper& rhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
+ {
+ PacketBlock<Packet2d,2> block1[n], block2[n];
+ PacketBlock<Packet2d,4> block3[n];
+
+ for(; i + n*vectorSize <= depth; i+=n*vectorSize)
+ {
+ for (Index k = 0; k < n; k++) {
+ if(StorageOrder == ColMajor)
+ {
+ block1[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 0);
+ block1[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 1);
+ block2[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 2);
+ block2[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 3);
+ } else {
+ block3[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 0, 0); //[a1 a2]
+ block3[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 0, 2); //[a3 a4]
+ block3[k].packet[2] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 1, 0); //[b1 b2]
+ block3[k].packet[3] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 1, 2); //[b3 b4]
+ }
+ }
+
+ if(StorageOrder == ColMajor)
+ {
+ for (Index k = 0; k < n; k++) {
+ ptranspose(block1[k]);
+ ptranspose(block2[k]);
+ }
+ }
+
+ for (Index k = 0; k < n; k++) {
+ if(StorageOrder == ColMajor)
+ {
+ pstore<double>(blockB + ri + k*4*vectorSize , block1[k].packet[0]);
+ pstore<double>(blockB + ri + k*4*vectorSize + 2, block2[k].packet[0]);
+ pstore<double>(blockB + ri + k*4*vectorSize + 4, block1[k].packet[1]);
+ pstore<double>(blockB + ri + k*4*vectorSize + 6, block2[k].packet[1]);
+ } else {
+ storeBlock<double, Packet2d, 4>(blockB + ri + k*4*vectorSize, block3[k]);
+ }
+ }
+
+ ri += n*4*vectorSize;
+ }
+ }
+
EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
const Index vectorSize = quad_traits<double>::vectorsize;
@@ -771,35 +891,10 @@
if(PanelMode) ri += offset*(2*vectorSize);
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- PacketBlock<Packet2d,4> block;
- if(StorageOrder == ColMajor)
- {
- PacketBlock<Packet2d,2> block1, block2;
- block1.packet[0] = rhs2.template loadPacket<Packet2d>(i, 0);
- block1.packet[1] = rhs2.template loadPacket<Packet2d>(i, 1);
- block2.packet[0] = rhs2.template loadPacket<Packet2d>(i, 2);
- block2.packet[1] = rhs2.template loadPacket<Packet2d>(i, 3);
+ dhs_copy<4>(blockB, rhs2, i, ri, depth, vectorSize);
+ dhs_copy<2>(blockB, rhs2, i, ri, depth, vectorSize);
+ dhs_copy<1>(blockB, rhs2, i, ri, depth, vectorSize);
- ptranspose(block1);
- ptranspose(block2);
-
- pstore<double>(blockB + ri , block1.packet[0]);
- pstore<double>(blockB + ri + 2, block2.packet[0]);
- pstore<double>(blockB + ri + 4, block1.packet[1]);
- pstore<double>(blockB + ri + 6, block2.packet[1]);
- } else {
- block.packet[0] = rhs2.template loadPacket<Packet2d>(i + 0, 0); //[a1 a2]
- block.packet[1] = rhs2.template loadPacket<Packet2d>(i + 0, 2); //[a3 a4]
- block.packet[2] = rhs2.template loadPacket<Packet2d>(i + 1, 0); //[b1 b2]
- block.packet[3] = rhs2.template loadPacket<Packet2d>(i + 1, 2); //[b3 b4]
-
- storeBlock<double, Packet2d, 4>(blockB + ri, block);
- }
-
- ri += 4*vectorSize;
- }
for(; i < depth; i++)
{
if(StorageOrder == ColMajor)
@@ -1296,6 +1391,54 @@
template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
{
+ EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
+ {
+ PacketBlock<Packet,2> blockr, blocki;
+ PacketBlock<PacketC,4> cblock;
+
+ for(; i + vectorSize <= depth; i+=vectorSize)
+ {
+ if(StorageOrder == ColMajor)
+ {
+ cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
+ cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
+
+ cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
+ cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
+
+ blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
+ blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
+
+ blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
+ blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
+ } else {
+ cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
+ cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
+
+ cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
+ cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
+
+ blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
+ blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
+
+ blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+ blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
+ }
+
+ if(Conjugate)
+ {
+ blocki.packet[0] = -blocki.packet[0];
+ blocki.packet[1] = -blocki.packet[1];
+ }
+
+ storeBlock<double, Packet, 2>(blockAt + rir, blockr);
+ storeBlock<double, Packet, 2>(blockAt + rii, blocki);
+
+ rir += 2*vectorSize;
+ rii += 2*vectorSize;
+ }
+ }
+
EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
const Index vectorSize = quad_traits<double>::vectorsize;
@@ -1311,50 +1454,8 @@
rii = rir + vectorDelta;
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- PacketBlock<Packet,2> blockr, blocki;
- PacketBlock<PacketC,4> cblock;
+ dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
- if(StorageOrder == ColMajor)
- {
- cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
- cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
-
- cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
- cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
-
- blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
- blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
-
- blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
- blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
- } else {
- cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
- cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
-
- cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
- cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i
-
- blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
- blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
-
- blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
- blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
- }
-
- if(Conjugate)
- {
- blocki.packet[0] = -blocki.packet[0];
- blocki.packet[1] = -blocki.packet[1];
- }
-
- storeBlock<double, Packet, 2>(blockAt + rir, blockr);
- storeBlock<double, Packet, 2>(blockAt + rii, blocki);
-
- rir += 2*vectorSize;
- rii += 2*vectorSize;
- }
for(; i < depth; i++)
{
PacketBlock<Packet,1> blockr, blocki;
@@ -1410,6 +1511,35 @@
template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
{
+ EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockBt, const DataMapper& rhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
+ {
+ for(; i < depth; i++)
+ {
+ PacketBlock<PacketC,4> cblock;
+ PacketBlock<Packet,2> blockr, blocki;
+
+ bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
+
+ blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
+ blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);
+
+ blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+ blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
+
+ if(Conjugate)
+ {
+ blocki.packet[0] = -blocki.packet[0];
+ blocki.packet[1] = -blocki.packet[1];
+ }
+
+ storeBlock<double, Packet, 2>(blockBt + rir, blockr);
+ storeBlock<double, Packet, 2>(blockBt + rii, blocki);
+
+ rir += 2*vectorSize;
+ rii += 2*vectorSize;
+ }
+ }
+
EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
const Index vectorSize = quad_traits<double>::vectorsize;
@@ -1425,31 +1555,7 @@
rii = rir + vectorDelta;
- for(; i < depth; i++)
- {
- PacketBlock<PacketC,4> cblock;
- PacketBlock<Packet,2> blockr, blocki;
-
- bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
-
- blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
- blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);
-
- blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
- blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
-
- if(Conjugate)
- {
- blocki.packet[0] = -blocki.packet[0];
- blocki.packet[1] = -blocki.packet[1];
- }
-
- storeBlock<double, Packet, 2>(blockBt + rir, blockr);
- storeBlock<double, Packet, 2>(blockBt + rii, blocki);
-
- rir += 2*vectorSize;
- rii += 2*vectorSize;
- }
+ dhs_ccopy(blockBt, rhs2, i, rir, rii, depth, vectorSize);
rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
}
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index e83cba5..72e8c31 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -42,19 +42,13 @@
__builtin_mma_xxsetaccz(acc);
}
-#ifdef USE_PARTIAL_PACKETS
template<typename DataMapper, typename Packet, bool full>
EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements, __vector_quad* acc)
-#else
-template<typename DataMapper, typename Packet, const Index accCols, const Index accCols2>
-EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc)
-#endif
{
PacketBlock<Packet, 4> result;
__builtin_mma_disassemble_acc(&result.packet, acc);
PacketBlock<Packet, 4> tRes;
-#ifdef USE_PARTIAL_PACKETS
if (full) {
EIGEN_UNUSED_VARIABLE(elements);
bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
@@ -65,11 +59,6 @@
bscale<Packet, 4>(tRes, result, alpha);
bstore_partial<DataMapper, Packet, 4>(tRes, data, i, elements);
}
-#else
- bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
- bscale<Packet, 4, (accCols != accCols2)>(tRes, result, alpha, pMask);
- bstore<DataMapper, Packet, 4>(tRes, data, i);
-#endif
}
template<typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
@@ -166,78 +155,118 @@
ploadRhsMMA(lhs, lhsV);
}
-#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+#define GEMM_MULTIPLE_COLS
+
+// Disable in GCC until unnecessary register moves are fixed
+//#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+#if EIGEN_COMP_LLVM
#define VECTOR_PAIR_LOADS_LHS
#endif
// PEEL_MMA loop factor.
+#ifdef GEMM_MULTIPLE_COLS
+#define PEEL_MMA 8
+#else
+// Register spillage with GCC12+
+#if EIGEN_COMP_LLVM || (__GNUC__ < 12) || defined(VECTOR_PAIR_LOADS_LHS)
#define PEEL_MMA 7
+#else
+#define PEEL_MMA 6
+#endif
+#endif
#define MICRO_MMA_UNROLL(func) \
func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
#define MICRO_MMA_WORK(func, type, peel) \
- func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
- func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel)
+ if (accItr == 1) { \
+ func(0,type,peel,0,0) func(1,type,peel,1,0) func(2,type,peel,2,0) func(3,type,peel,3,0) \
+ func(4,type,peel,4,0) func(5,type,peel,5,0) func(6,type,peel,6,0) func(7,type,peel,7,0) \
+ } else if (accItr == 2) { \
+ func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,1,0) func(3,type,peel,1,1) \
+ func(4,type,peel,2,0) func(5,type,peel,2,1) func(6,type,peel,3,0) func(7,type,peel,3,1) \
+ } else { \
+ func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,0,2) func(3,type,peel,0,3) \
+ func(4,type,peel,1,0) func(5,type,peel,1,1) func(6,type,peel,1,2) func(7,type,peel,1,3) \
+ }
-#define MICRO_MMA_WORK_ONE(iter, type, peel) \
- if (unroll_factor > iter) { \
- pgerMMA<Packet, type, false>(&accZero##iter, rhsV[peel], lhsV##iter); \
+#define MICRO_MMA_WORK_ONE(iter, type, peel, left, right) \
+ if (unroll_factor > left) { \
+ pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV##left); \
}
#ifdef VECTOR_PAIR_LOADS_LHS
-#define MICRO_MMA_WORK_TWO(iter, type, peel) \
- if (unroll_factor > iter) { \
- pgerMMA<Packet, type, false>(&accZero##iter, rhsV[peel], lhsV2##iter.packet[peel & 1]); \
+#define MICRO_MMA_WORK_TWO(iter, type, peel, left, right) \
+ if (unroll_factor > left) { \
+ pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV2##left.packet[peel & 1]); \
}
-#define MICRO_MMA_LOAD1_TWO(lhs_ptr, iter) \
- if (unroll_factor > iter) { \
- if (MICRO_NORMAL(iter)) { \
- ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##iter), plhsV##iter); \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##iter.packet), &plhsV##iter); \
- lhs_ptr##iter += accCols*2; \
+#define MICRO_MMA_LOAD1_TWO(lhs_ptr, left) \
+ if (unroll_factor > left) { \
+ if (MICRO_NORMAL(left)) { \
+ ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##left), plhsV##left); \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##left.packet), &plhsV##left); \
+ lhs_ptr##left += accCols*2; \
} else { \
- lhsV2##iter.packet[0] = ploadLhs<Packet>(lhs_ptr##iter); \
- lhsV2##iter.packet[1] = ploadLhs<Packet>(lhs_ptr##iter + accCols2); \
- lhs_ptr##iter += accCols2*2; \
- EIGEN_UNUSED_VARIABLE(plhsV##iter) \
+ lhsV2##left.packet[0] = ploadLhs<Packet>(lhs_ptr##left); \
+ lhsV2##left.packet[1] = ploadLhs<Packet>(lhs_ptr##left + accCols2); \
+ lhs_ptr##left += accCols2*2; \
+ EIGEN_UNUSED_VARIABLE(plhsV##left); \
} \
} else { \
- EIGEN_UNUSED_VARIABLE(lhsV2##iter); \
- EIGEN_UNUSED_VARIABLE(plhsV##iter) \
+ EIGEN_UNUSED_VARIABLE(lhsV2##left); \
+ EIGEN_UNUSED_VARIABLE(plhsV##left); \
}
-#define MICRO_MMA_LOAD_TWO(iter) MICRO_MMA_LOAD1_TWO(lhs_ptr, iter)
+#define MICRO_MMA_LOAD_TWO(left) MICRO_MMA_LOAD1_TWO(lhs_ptr, left)
#endif
+#define MICRO_MMA_UNROLL_ITER(func, val) \
+ func(val,0) \
+ if (accItr > 1) { \
+ func(val,1) \
+ if (accItr > 2) { \
+ func(val,2) \
+ func(val,3) \
+ } \
+ }
+
+#define MICRO_MMA_LOAD_ONE_RHS1(peel, right) \
+ ploadRhsMMA(rhs_ptr##right + (accRows * peel), rhsV##right[peel]);
+
+#define MICRO_MMA_LOAD_ONE_RHS(peel) \
+ MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_ONE_RHS1, peel)
+
#define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
if (PEEL_MMA > peel) { \
Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
- ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV[peel]); \
+ MICRO_MMA_LOAD_ONE_RHS(peel) \
MICRO_MMA_UNROLL(funcl) \
MICRO_MMA_WORK(funcw, type, peel) \
}
#ifndef VECTOR_PAIR_LOADS_LHS
#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
- type rhsV[8]; \
+ type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \
MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \
MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \
MICRO_MMA_TYPE_PEEL(funcw,funcl,type,6) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,7)
#else
+#define MICRO_MMA_LOAD_TWO_RHS(peel1, right) \
+ ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr##right + (accRows * peel1)), prhsV##peel1); \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1);
+
#define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
if (PEEL_MMA > peel2) { \
PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
__vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
if (sizeof(type) == 16) { \
- ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr + (accRows * peel1)), prhsV##peel1); \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV[peel1]), &prhsV##peel1); \
+ MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_TWO_RHS, peel1) \
} else { \
EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
- ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV[peel1]); \
- ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV[peel2]); \
+ MICRO_MMA_LOAD_ONE_RHS(peel1) \
+ MICRO_MMA_LOAD_ONE_RHS(peel2) \
} \
MICRO_MMA_UNROLL(funcl2) \
MICRO_MMA_WORK(funcw2, type, peel1) \
@@ -248,7 +277,7 @@
}
#define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
- type rhsV[8]; \
+ type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
__vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \
MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \
@@ -257,19 +286,25 @@
#endif
#define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
- type rhsV[1]; \
+ type rhsV0[1], rhsV1[1], rhsV2[1], rhsV3[1]; \
MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0)
+#define MICRO_MMA_UPDATE_RHS1(size, right) \
+ rhs_ptr##right += (accRows * size);
+
+#define MICRO_MMA_UPDATE_RHS(size) \
+ MICRO_MMA_UNROLL_ITER(MICRO_MMA_UPDATE_RHS1, size)
+
#define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
- rhs_ptr += (accRows * size);
+ MICRO_MMA_UPDATE_RHS(size)
#ifndef VECTOR_PAIR_LOADS_LHS
#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
#else
#define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
- rhs_ptr += (accRows * size);
+ MICRO_MMA_UPDATE_RHS(size)
#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA)
#endif
@@ -277,7 +312,7 @@
#define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
#define MICRO_MMA_DST_PTR_ONE(iter) \
- if (unroll_factor > iter) { \
+ if (unroll_factor * accItr > iter) { \
bsetzeroMMA(&accZero##iter); \
} else { \
EIGEN_UNUSED_VARIABLE(accZero##iter); \
@@ -289,45 +324,69 @@
#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
-#ifdef USE_PARTIAL_PACKETS
-#define MICRO_MMA_STORE_ONE(iter) \
- if (unroll_factor > iter) { \
- storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(iter)>(row + iter*accCols, res, pAlpha, accCols2, &accZero##iter); \
+#define MICRO_MMA_STORE_ONE(iter, left, right) \
+ if (unroll_factor > left) { \
+ storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(left)>(row + left*accCols, res##right, pAlpha, accCols2, &accZero##iter); \
}
-#else
-#define MICRO_MMA_STORE_ONE(iter) \
- if (unroll_factor > iter) { \
- storeAccumulator<DataMapper, Packet, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \
+
+#define MICRO_MMA_ITER_UNROLL(func) \
+ if (accItr == 1) { \
+ func(0,0,0) func(1,1,0) func(2,2,0) func(3,3,0) \
+ func(4,4,0) func(5,5,0) func(6,6,0) func(7,7,0) \
+ } else if (accItr == 2) { \
+ func(0,0,0) func(1,0,1) func(2,1,0) func(3,1,1) \
+ func(4,2,0) func(5,2,1) func(6,3,0) func(7,3,1) \
+ } else { \
+ func(0,0,0) func(1,0,1) func(2,0,2) func(3,0,3) \
+ func(4,1,0) func(5,1,1) func(6,1,2) func(7,1,3) \
}
-#endif
-#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
+#define MICRO_MMA_STORE MICRO_MMA_ITER_UNROLL(MICRO_MMA_STORE_ONE)
-#ifdef USE_PARTIAL_PACKETS
-template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool full>
-#else
-template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
-#endif
+#define MICRO_MMA_EXTRA_ROWS(right) \
+ gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3##right, blockA, rhs_base + right*accRows*strideB, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+
+#define MICRO_MMA_EXTRA_ROWS1(val, right) \
+ MICRO_MMA_EXTRA_ROWS(right);
+
+template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool full, const Index accItr>
EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
- const DataMapper& res,
+ const DataMapper& res0,
+ const DataMapper& res1,
+ const DataMapper& res2,
+ const DataMapper& res3,
const Scalar* lhs_base,
const Scalar* rhs_base,
Index depth,
Index strideA,
+ Index strideB,
Index offsetA,
Index& row,
const Packet& pAlpha,
-#ifdef USE_PARTIAL_PACKETS
Index accCols2
-#else
- const Packet& pMask
-#endif
)
{
- const Scalar* rhs_ptr = rhs_base;
+ const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL, * rhs_ptr3 = NULL;
const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
__vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+ if (accItr > 1) {
+ rhs_ptr1 = rhs_base + (accRows * strideB);
+ } else {
+ EIGEN_UNUSED_VARIABLE(strideB);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr1);
+ EIGEN_UNUSED_VARIABLE(res1);
+ }
+ if (accItr > 2) {
+ rhs_ptr2 = rhs_base + (2 * accRows * strideB);
+ rhs_ptr3 = rhs_base + (3 * accRows * strideB);
+ } else {
+ EIGEN_UNUSED_VARIABLE(rhs_ptr2);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr3);
+ EIGEN_UNUSED_VARIABLE(res2);
+ EIGEN_UNUSED_VARIABLE(res3);
+ }
+
MICRO_MMA_SRC_PTR
MICRO_MMA_DST_PTR
@@ -347,17 +406,16 @@
MICRO_UPDATE
}
-#ifdef USE_PARTIAL_PACKETS
#define MICRO_MMA_UNROLL_ITER2(N, M) \
- gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
+ gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M, accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, strideB, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
if (M) return;
-#else
-#define MICRO_MMA_UNROLL_ITER2(N, M) \
- gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \
- if (M) return;
-#endif
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+#define MICRO_MMA_ROWS(n) \
+ while(row + n*accCols <= rows) { \
+ MICRO_MMA_UNROLL_ITER2(n, 0); \
+ }
+
+template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accItr>
EIGEN_ALWAYS_INLINE void gemmMMA_cols(
const DataMapper& res,
const Scalar* blockA,
@@ -373,45 +431,71 @@
const Packet& pAlpha,
const Packet& pMask)
{
- const DataMapper res3 = res.getSubMapper(0, col);
+ const DataMapper res30 = res.getSubMapper(0, col);
+ const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows*1) : res30;
+ const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows*2) : res30;
+ const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows*3) : res30;
const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
const Scalar* lhs_base = blockA + accCols*offsetA;
Index row = 0;
#define MAX_MMA_UNROLL 7
- while(row + MAX_MMA_UNROLL*accCols <= rows) {
- MICRO_MMA_UNROLL_ITER2(MAX_MMA_UNROLL, 0);
+
+#if MAX_MMA_UNROLL < 2
+ if (1) {
+#elif MAX_MMA_UNROLL < 4
+ if (accItr <= 2) {
+#else
+ if (accItr == 1) {
+#endif
+ MICRO_MMA_ROWS(MAX_MMA_UNROLL);
+ } else if (accItr == 2) {
+ MICRO_MMA_ROWS(4);
+ } else {
+ MICRO_MMA_ROWS(2);
}
switch( (rows-row)/accCols ) {
#if MAX_MMA_UNROLL > 7
case 7:
- MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7)
+ if (accItr == 1) {
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7)
+ }
break;
#endif
#if MAX_MMA_UNROLL > 6
case 6:
- MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6)
+ if (accItr == 1) {
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6)
+ }
break;
#endif
#if MAX_MMA_UNROLL > 5
case 5:
- MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5)
+ if (accItr == 1) {
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5)
+ }
break;
#endif
#if MAX_MMA_UNROLL > 4
case 4:
- MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4)
+ if (accItr == 1) {
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4)
+ }
break;
#endif
#if MAX_MMA_UNROLL > 3
case 3:
- MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3)
+ if (accItr <= 2) {
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3)
+ }
break;
#endif
#if MAX_MMA_UNROLL > 2
case 2:
- MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2)
+ if (accItr <= 2) {
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2)
+ }
break;
#endif
#if MAX_MMA_UNROLL > 1
@@ -426,10 +510,16 @@
if(remaining_rows > 0)
{
- gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+ MICRO_MMA_UNROLL_ITER(MICRO_MMA_EXTRA_ROWS1, 0)
}
}
+#define MICRO_MMA_COLS(n) \
+ for(; col + n*accRows <= cols; col += n*accRows) \
+ { \
+ gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); \
+ }
+
template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
@@ -444,10 +534,11 @@
typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
Index col = 0;
- for(; col + accRows <= cols; col += accRows)
- {
- gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
- }
+#ifdef GEMM_MULTIPLE_COLS
+ MICRO_MMA_COLS(4);
+ MICRO_MMA_COLS(2);
+#endif
+ MICRO_MMA_COLS(1);
if (col != cols)
{
@@ -459,62 +550,88 @@
#define advanceCols ((RhsIsReal) ? 1 : 2)
// PEEL_COMPLEX_MMA loop factor.
+#ifdef GEMM_MULTIPLE_COLS
+#define PEEL_COMPLEX_MMA 4
+#else
#define PEEL_COMPLEX_MMA 3
+#endif
#define MICRO_COMPLEX_MMA_UNROLL(func) \
func(0) func(1) func(2) func(3)
#define MICRO_COMPLEX_MMA_WORK(func, type, peel) \
- func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel)
+ if (accItr == 1) { \
+ func(0,type,peel,0,0) func(1,type,peel,1,0) func(2,type,peel,2,0) func(3,type,peel,3,0) \
+ } else if (accItr == 2) { \
+ func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,1,0) func(3,type,peel,1,1) \
+ } else { \
+ func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,0,2) func(3,type,peel,0,3) \
+ }
-#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
- if (unroll_factor > iter) { \
- pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV[peel], rhsVi[peel]); \
+#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel, left, right) \
+ if (unroll_factor > left) { \
+ pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##left, lhsVi##left, rhsV##right[peel], rhsVi##right[peel]); \
}
#ifdef VECTOR_PAIR_LOADS_LHS
-#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel) \
- if (unroll_factor > iter) { \
- pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV[peel], rhsVi[peel]); \
+#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel, left, right) \
+ if (unroll_factor > left) { \
+ pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##left.packet[peel & 1], lhsVi2##left.packet[peel & 1], rhsV##right[peel], rhsVi##right[peel]); \
}
-#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter) \
- if (!LhsIsReal && (unroll_factor > iter)) { \
- if (MICRO_NORMAL(iter)) { \
- ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##iter + imag_delta), plhsVi##iter); \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##iter.packet), &plhsVi##iter); \
+#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left) \
+ if (!LhsIsReal && (unroll_factor > left)) { \
+ if (MICRO_NORMAL(left)) { \
+ ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##left + imag_delta), plhsVi##left); \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##left.packet), &plhsVi##left); \
} else { \
- lhsVi2##iter.packet[0] = ploadLhs<Packet>(lhs_ptr_real##iter + imag_delta2); \
- lhsVi2##iter.packet[1] = ploadLhs<Packet>(lhs_ptr_real##iter + imag_delta2 + accCols2); \
- EIGEN_UNUSED_VARIABLE(plhsVi##iter) \
+ lhsVi2##left.packet[0] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2); \
+ lhsVi2##left.packet[1] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2 + accCols2); \
+ EIGEN_UNUSED_VARIABLE(plhsVi##left); \
} \
} else { \
- EIGEN_UNUSED_VARIABLE(lhsVi2##iter); \
- EIGEN_UNUSED_VARIABLE(plhsVi##iter) \
+ EIGEN_UNUSED_VARIABLE(lhsVi2##left); \
+ EIGEN_UNUSED_VARIABLE(plhsVi##left); \
} \
- MICRO_MMA_LOAD1_TWO(lhs_ptr_real, iter)
+ MICRO_MMA_LOAD1_TWO(lhs_ptr_real, left)
-#define MICRO_COMPLEX_MMA_LOAD_TWO(iter) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter)
+#define MICRO_COMPLEX_MMA_LOAD_TWO(left) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left)
#endif
+#define MICRO_COMPLEX_MMA_LOAD_RHS1(peel, right) \
+ ploadRhsMMA(rhs_ptr_real##right + (accRows * peel), rhsV##right[peel]); \
+ if (!RhsIsReal) { \
+ ploadRhsMMA(rhs_ptr_imag##right + (accRows * peel), rhsVi##right[peel]); \
+ }
+
+#define MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
+ MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_RHS1, peel)
+
#define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
if (PEEL_COMPLEX_MMA > peel) { \
Packet lhsV0, lhsV1, lhsV2, lhsV3; \
Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
- ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV[peel]); \
- if(!RhsIsReal) { \
- ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi[peel]); \
- } \
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
MICRO_COMPLEX_MMA_UNROLL(funcl) \
MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
}
#ifndef VECTOR_PAIR_LOADS_LHS
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
- type rhsV[4], rhsVi[4]; \
+ type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \
MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3)
#else
+#define MICRO_COMPLEX_MMA_LOAD_TWO_RHS(peel1, right) \
+ ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real##right + (accRows * peel1)), prhsV##peel1); \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1); \
+ if(!RhsIsReal) { \
+ ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag##right + (accRows * peel1)), prhsVi##peel1); \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi##right[peel1]), &prhsVi##peel1); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
+ }
+
#define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
if (PEEL_COMPLEX_MMA > peel2) { \
PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23; \
@@ -522,23 +639,12 @@
__vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
__vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
if (sizeof(type) == 16) { \
- ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real + (accRows * peel1)), prhsV##peel1); \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV[peel1]), &prhsV##peel1); \
- if(!RhsIsReal) { \
- ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag + (accRows * peel1)), prhsVi##peel1); \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi[peel1]), &prhsVi##peel1); \
- } else { \
- EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
- } \
+ MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_TWO_RHS, peel1) \
} else { \
EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
- ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV[peel1]); \
- ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV[peel2]); \
- if(!RhsIsReal) { \
- ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi[peel1]); \
- ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi[peel2]); \
- } \
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel1); \
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel2); \
} \
MICRO_COMPLEX_MMA_UNROLL(funcl2) \
MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
@@ -550,7 +656,7 @@
}
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
- type rhsV[4], rhsVi[4]; \
+ type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
__vector_pair prhsV0, prhsV2; \
__vector_pair prhsVi0, prhsVi2; \
MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
@@ -558,21 +664,26 @@
#endif
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
- type rhsV[1], rhsVi[1]; \
+ type rhsV0[1], rhsVi0[1], rhsV1[1], rhsVi1[1], rhsV2[1], rhsVi2[1], rhsV3[1], rhsVi3[1]; \
MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0)
+#define MICRO_COMPLEX_MMA_UPDATE_RHS1(size, right) \
+ rhs_ptr_real##right += (accRows * size); \
+ if(!RhsIsReal) rhs_ptr_imag##right += (accRows * size);
+
+#define MICRO_COMPLEX_MMA_UPDATE_RHS(size) \
+ MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_UPDATE_RHS1, size)
+
#define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, RhsPacket) \
- rhs_ptr_real += (accRows * size); \
- if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
+ MICRO_COMPLEX_MMA_UPDATE_RHS(size);
#ifndef VECTOR_PAIR_LOADS_LHS
#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
#else
#define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
- rhs_ptr_real += (accRows * size); \
- if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
+ MICRO_COMPLEX_MMA_UPDATE_RHS(size);
#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
#endif
@@ -580,7 +691,7 @@
#define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
- if (unroll_factor > iter) { \
+ if (unroll_factor * accItr > iter) { \
bsetzeroMMA(&accReal##iter); \
bsetzeroMMA(&accImag##iter); \
} else { \
@@ -594,16 +705,34 @@
#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
-#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
- if (unroll_factor > iter) { \
- storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
+#define MICRO_COMPLEX_MMA_STORE_ONE(iter, left, right) \
+ if (unroll_factor > left) { \
+ storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (left + 1)) ? accCols : accCols2>(row + left*accCols, res##right, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
}
-#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
+#define MICRO_COMPLEX_MMA_ITER_UNROLL(func) \
+ if (accItr == 1) { \
+ func(0,0,0) func(1,1,0) func(2,2,0) func(3,3,0) \
+ } else if (accItr == 2) { \
+ func(0,0,0) func(1,0,1) func(2,1,0) func(3,1,1) \
+ } else { \
+ func(0,0,0) func(1,0,1) func(2,0,2) func(3,0,3) \
+ }
-template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_ITER_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
+
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS(right) \
+ gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3##right, blockA, rhs_base + right*accRows*(RhsIsReal ? 1 : 2)*strideB, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS1(val, right) \
+ MICRO_COMPLEX_MMA_EXTRA_ROWS(right);
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
- const DataMapper& res,
+ const DataMapper& res0,
+ const DataMapper& res1,
+ const DataMapper& res2,
+ const DataMapper& res3,
const Scalar* lhs_base,
const Scalar* rhs_base,
Index depth,
@@ -615,14 +744,48 @@
const Packet& pAlphaImag,
const Packet& pMask)
{
- const Scalar* rhs_ptr_real = rhs_base;
- const Scalar* rhs_ptr_imag = NULL;
+ const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL, * rhs_ptr_real3 = NULL;
+ const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL, * rhs_ptr_imag3 = NULL;
const Index imag_delta = accCols*strideA;
const Index imag_delta2 = accCols2*strideA;
+
if(!RhsIsReal) {
- rhs_ptr_imag = rhs_base + accRows*strideB;
+ rhs_ptr_imag0 = rhs_base + accRows*strideB;
} else {
- EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag0);
+ }
+ if (accItr > 1) {
+ if(!RhsIsReal) {
+ rhs_ptr_real1 = rhs_base + (2*accRows*strideB);
+ rhs_ptr_imag1 = rhs_base + (3*accRows*strideB);
+ } else {
+ rhs_ptr_real1 = rhs_base + accRows*strideB;
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
+ }
+ } else {
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_real1);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
+ EIGEN_UNUSED_VARIABLE(res1);
+ }
+ if (accItr > 2) {
+ if(!RhsIsReal) {
+ rhs_ptr_real2 = rhs_base + (4*accRows*strideB);
+ rhs_ptr_imag2 = rhs_base + (5*accRows*strideB);
+ rhs_ptr_real3 = rhs_base + (6*accRows*strideB);
+ rhs_ptr_imag3 = rhs_base + (7*accRows*strideB);
+ } else {
+ rhs_ptr_real2 = rhs_base + (2*accRows*strideB);
+ rhs_ptr_real3 = rhs_base + (3*accRows*strideB);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
+ }
+ } else {
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_real2);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_real3);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
+ EIGEN_UNUSED_VARIABLE(res2);
+ EIGEN_UNUSED_VARIABLE(res3);
}
const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
@@ -651,10 +814,15 @@
}
#define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \
- gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
+ gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
if (M) return;
-template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+#define MICRO_COMPLEX_MMA_ROWS(n) \
+ while(row + n*accCols <= rows) { \
+ MICRO_COMPLEX_MMA_UNROLL_ITER2(n, 0); \
+ }
+
+template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
const DataMapper& res,
const Scalar* blockA,
@@ -671,35 +839,50 @@
const Packet& pAlphaImag,
const Packet& pMask)
{
- const DataMapper res3 = res.getSubMapper(0, col);
+ const DataMapper res30 = res.getSubMapper(0, col);
+ const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows*1) : res30;
+ const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows*2) : res30;
+ const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows*3) : res30;
const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
const Scalar* lhs_base = blockA + accCols*offsetA;
Index row = 0;
#define MAX_COMPLEX_MMA_UNROLL 4
- while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
- MICRO_COMPLEX_MMA_UNROLL_ITER2(MAX_COMPLEX_MMA_UNROLL, 0);
+
+#if MAX_COMPLEX_MMA_UNROLL < 2
+ if (1) {
+#elif MAX_COMPLEX_MMA_UNROLL < 4
+ if (accItr <= 2) {
+#else
+ if (accItr == 1) {
+#endif
+ MICRO_COMPLEX_MMA_ROWS(MAX_COMPLEX_MMA_UNROLL);
+ } else if (accItr == 2) {
+ MICRO_COMPLEX_MMA_ROWS(2);
+ } else {
+ MICRO_COMPLEX_MMA_ROWS(1);
}
switch( (rows-row)/accCols ) {
-#if MAX_COMPLEX_MMA_UNROLL > 4
- case 4:
- MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4)
- break;
-#endif
#if MAX_COMPLEX_MMA_UNROLL > 3
case 3:
- MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
+ if (accItr == 1) {
+ MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
+ }
break;
#endif
#if MAX_COMPLEX_MMA_UNROLL > 2
case 2:
- MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
+ if (accItr == 1) {
+ MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
+ }
break;
#endif
#if MAX_COMPLEX_MMA_UNROLL > 1
case 1:
- MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
+ if (accItr <= 2) {
+ MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
+ }
break;
#endif
default:
@@ -709,10 +892,16 @@
if(remaining_rows > 0)
{
- gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+ MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_EXTRA_ROWS1, 0)
}
}
+#define MICRO_COMPLEX_MMA_COLS(n) \
+ for(; col + n*accRows <= cols; col += n*accRows) \
+ { \
+ gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); \
+ }
+
template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
@@ -731,10 +920,11 @@
typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
Index col = 0;
- for(; col + accRows <= cols; col += accRows)
- {
- gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
- }
+#ifdef GEMM_MULTIPLE_COLS
+ MICRO_COMPLEX_MMA_COLS(4);
+ MICRO_COMPLEX_MMA_COLS(2);
+#endif
+ MICRO_COMPLEX_MMA_COLS(1);
if (col != cols)
{
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index ed4edc1..27d9a82 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -86,6 +86,8 @@
typedef packet_type half; \
};
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
@@ -100,6 +102,7 @@
struct is_arithmetic<packet_type> { \
enum { value = true }; \
};
+SYCL_ARITHMETIC(cl::sycl::cl_half8)
SYCL_ARITHMETIC(cl::sycl::cl_float4)
SYCL_ARITHMETIC(cl::sycl::cl_double2)
#undef SYCL_ARITHMETIC
@@ -111,6 +114,7 @@
enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
typedef packet_type half; \
};
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
index bdbb21f..a8adc46 100644
--- a/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -38,6 +38,7 @@
return cl::sycl::log(a); \
}
+SYCL_PLOG(cl::sycl::cl_half8)
SYCL_PLOG(cl::sycl::cl_float4)
SYCL_PLOG(cl::sycl::cl_double2)
#undef SYCL_PLOG
@@ -49,6 +50,7 @@
return cl::sycl::log1p(a); \
}
+SYCL_PLOG1P(cl::sycl::cl_half8)
SYCL_PLOG1P(cl::sycl::cl_float4)
SYCL_PLOG1P(cl::sycl::cl_double2)
#undef SYCL_PLOG1P
@@ -60,6 +62,7 @@
return cl::sycl::log10(a); \
}
+SYCL_PLOG10(cl::sycl::cl_half8)
SYCL_PLOG10(cl::sycl::cl_float4)
SYCL_PLOG10(cl::sycl::cl_double2)
#undef SYCL_PLOG10
@@ -71,6 +74,8 @@
return cl::sycl::exp(a); \
}
+SYCL_PEXP(cl::sycl::cl_half8)
+SYCL_PEXP(cl::sycl::cl_half)
SYCL_PEXP(cl::sycl::cl_float4)
SYCL_PEXP(cl::sycl::cl_float)
SYCL_PEXP(cl::sycl::cl_double2)
@@ -83,6 +88,7 @@
return cl::sycl::expm1(a); \
}
+SYCL_PEXPM1(cl::sycl::cl_half8)
SYCL_PEXPM1(cl::sycl::cl_float4)
SYCL_PEXPM1(cl::sycl::cl_double2)
#undef SYCL_PEXPM1
@@ -94,6 +100,7 @@
return cl::sycl::sqrt(a); \
}
+SYCL_PSQRT(cl::sycl::cl_half8)
SYCL_PSQRT(cl::sycl::cl_float4)
SYCL_PSQRT(cl::sycl::cl_double2)
#undef SYCL_PSQRT
@@ -105,6 +112,7 @@
return cl::sycl::rsqrt(a); \
}
+SYCL_PRSQRT(cl::sycl::cl_half8)
SYCL_PRSQRT(cl::sycl::cl_float4)
SYCL_PRSQRT(cl::sycl::cl_double2)
#undef SYCL_PRSQRT
@@ -117,6 +125,7 @@
return cl::sycl::sin(a); \
}
+SYCL_PSIN(cl::sycl::cl_half8)
SYCL_PSIN(cl::sycl::cl_float4)
SYCL_PSIN(cl::sycl::cl_double2)
#undef SYCL_PSIN
@@ -129,6 +138,7 @@
return cl::sycl::cos(a); \
}
+SYCL_PCOS(cl::sycl::cl_half8)
SYCL_PCOS(cl::sycl::cl_float4)
SYCL_PCOS(cl::sycl::cl_double2)
#undef SYCL_PCOS
@@ -141,6 +151,7 @@
return cl::sycl::tan(a); \
}
+SYCL_PTAN(cl::sycl::cl_half8)
SYCL_PTAN(cl::sycl::cl_float4)
SYCL_PTAN(cl::sycl::cl_double2)
#undef SYCL_PTAN
@@ -153,6 +164,7 @@
return cl::sycl::asin(a); \
}
+SYCL_PASIN(cl::sycl::cl_half8)
SYCL_PASIN(cl::sycl::cl_float4)
SYCL_PASIN(cl::sycl::cl_double2)
#undef SYCL_PASIN
@@ -165,6 +177,7 @@
return cl::sycl::acos(a); \
}
+SYCL_PACOS(cl::sycl::cl_half8)
SYCL_PACOS(cl::sycl::cl_float4)
SYCL_PACOS(cl::sycl::cl_double2)
#undef SYCL_PACOS
@@ -177,6 +190,7 @@
return cl::sycl::atan(a); \
}
+SYCL_PATAN(cl::sycl::cl_half8)
SYCL_PATAN(cl::sycl::cl_float4)
SYCL_PATAN(cl::sycl::cl_double2)
#undef SYCL_PATAN
@@ -189,6 +203,7 @@
return cl::sycl::sinh(a); \
}
+SYCL_PSINH(cl::sycl::cl_half8)
SYCL_PSINH(cl::sycl::cl_float4)
SYCL_PSINH(cl::sycl::cl_double2)
#undef SYCL_PSINH
@@ -201,6 +216,7 @@
return cl::sycl::cosh(a); \
}
+SYCL_PCOSH(cl::sycl::cl_half8)
SYCL_PCOSH(cl::sycl::cl_float4)
SYCL_PCOSH(cl::sycl::cl_double2)
#undef SYCL_PCOSH
@@ -213,6 +229,7 @@
return cl::sycl::tanh(a); \
}
+SYCL_PTANH(cl::sycl::cl_half8)
SYCL_PTANH(cl::sycl::cl_float4)
SYCL_PTANH(cl::sycl::cl_double2)
#undef SYCL_PTANH
@@ -224,6 +241,7 @@
return cl::sycl::ceil(a); \
}
+SYCL_PCEIL(cl::sycl::cl_half)
SYCL_PCEIL(cl::sycl::cl_float4)
SYCL_PCEIL(cl::sycl::cl_double2)
#undef SYCL_PCEIL
@@ -235,6 +253,7 @@
return cl::sycl::round(a); \
}
+SYCL_PROUND(cl::sycl::cl_half8)
SYCL_PROUND(cl::sycl::cl_float4)
SYCL_PROUND(cl::sycl::cl_double2)
#undef SYCL_PROUND
@@ -246,6 +265,7 @@
return cl::sycl::rint(a); \
}
+SYCL_PRINT(cl::sycl::cl_half8)
SYCL_PRINT(cl::sycl::cl_float4)
SYCL_PRINT(cl::sycl::cl_double2)
#undef SYCL_PRINT
@@ -257,6 +277,7 @@
return cl::sycl::floor(a); \
}
+SYCL_FLOOR(cl::sycl::cl_half8)
SYCL_FLOOR(cl::sycl::cl_float4)
SYCL_FLOOR(cl::sycl::cl_double2)
#undef SYCL_FLOOR
@@ -268,6 +289,7 @@
return expr; \
}
+SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
#undef SYCL_PMIN
@@ -279,6 +301,7 @@
return expr; \
}
+SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
#undef SYCL_PMAX
@@ -292,6 +315,7 @@
cl::sycl::rounding_mode::automatic>()); \
}
+SYCL_PLDEXP(cl::sycl::cl_half8)
SYCL_PLDEXP(cl::sycl::cl_float4)
SYCL_PLDEXP(cl::sycl::cl_double2)
#undef SYCL_PLDEXP
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index fbdcdc0..4b0b1c6 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -44,9 +44,34 @@
SYCL_PLOAD(cl::sycl::cl_float4, )
SYCL_PLOAD(cl::sycl::cl_double2, u)
SYCL_PLOAD(cl::sycl::cl_double2, )
-
#undef SYCL_PLOAD
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+ pload<cl::sycl::cl_half8>(
+ const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+ auto ptr = cl::sycl::address_space_cast<
+ cl::sycl::access::address_space::generic_space,
+ cl::sycl::access::decorated::no>(
+ reinterpret_cast<const cl::sycl::cl_half*>(from));
+ cl::sycl::cl_half8 res{};
+ res.load(0, ptr);
+ return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+ploadu<cl::sycl::cl_half8>(
+ const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+ auto ptr = cl::sycl::address_space_cast<
+ cl::sycl::access::address_space::generic_space,
+ cl::sycl::access::decorated::no>(
+ reinterpret_cast<const cl::sycl::cl_half*>(from));
+ cl::sycl::cl_half8 res{};
+ res.load(0, ptr);
+ return res;
+}
+
#define SYCL_PSTORE(scalar, packet_type, alignment) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
@@ -59,9 +84,28 @@
SYCL_PSTORE(float, cl::sycl::cl_float4, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, )
SYCL_PSTORE(double, cl::sycl::cl_double2, u)
-
#undef SYCL_PSTORE
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(
+ Eigen::half* to, const cl::sycl::cl_half8& from) {
+ auto ptr = cl::sycl::address_space_cast<
+ cl::sycl::access::address_space::generic_space,
+ cl::sycl::access::decorated::no>(
+ reinterpret_cast<cl::sycl::cl_half*>(to));
+ from.store(0, ptr);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(
+ Eigen::half* to, const cl::sycl::cl_half8& from) {
+ auto ptr = cl::sycl::address_space_cast<
+ cl::sycl::access::address_space::generic_space,
+ cl::sycl::access::decorated::no>(
+ reinterpret_cast<cl::sycl::cl_half*>(to));
+ from.store(0, ptr);
+}
+
#define SYCL_PSET1(packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@@ -70,6 +114,7 @@
}
// global space
+SYCL_PSET1(cl::sycl::cl_half8)
SYCL_PSET1(cl::sycl::cl_float4)
SYCL_PSET1(cl::sycl::cl_double2)
@@ -87,6 +132,58 @@
};
template <>
+struct get_base_packet<cl::sycl::cl_half8> {
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
+ sycl_multi_pointer from) {
+ return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
+ static_cast<cl::sycl::half>(from[0]),
+ static_cast<cl::sycl::half>(from[1]),
+ static_cast<cl::sycl::half>(from[1]),
+ static_cast<cl::sycl::half>(from[2]),
+ static_cast<cl::sycl::half>(from[2]),
+ static_cast<cl::sycl::half>(from[3]),
+ static_cast<cl::sycl::half>(from[3]));
+ }
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
+ sycl_multi_pointer from, Index stride) {
+ return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
+ static_cast<cl::sycl::half>(from[1 * stride]),
+ static_cast<cl::sycl::half>(from[2 * stride]),
+ static_cast<cl::sycl::half>(from[3 * stride]),
+ static_cast<cl::sycl::half>(from[4 * stride]),
+ static_cast<cl::sycl::half>(from[5 * stride]),
+ static_cast<cl::sycl::half>(from[6 * stride]),
+ static_cast<cl::sycl::half>(from[7 * stride]));
+ }
+
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+ sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
+ auto tmp = stride;
+ to[0] = Eigen::half(from.s0());
+ to[tmp] = Eigen::half(from.s1());
+ to[tmp += stride] = Eigen::half(from.s2());
+ to[tmp += stride] = Eigen::half(from.s3());
+ to[tmp += stride] = Eigen::half(from.s4());
+ to[tmp += stride] = Eigen::half(from.s5());
+ to[tmp += stride] = Eigen::half(from.s6());
+ to[tmp += stride] = Eigen::half(from.s7());
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
+ const cl::sycl::half& a) {
+ return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a), static_cast<cl::sycl::half>(a + 1),
+ static_cast<cl::sycl::half>(a + 2),
+ static_cast<cl::sycl::half>(a + 3),
+ static_cast<cl::sycl::half>(a + 4),
+ static_cast<cl::sycl::half>(a + 5),
+ static_cast<cl::sycl::half>(a + 6),
+ static_cast<cl::sycl::half>(a + 7));
+ }
+};
+
+template <>
struct get_base_packet<cl::sycl::cl_float4> {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(
@@ -152,6 +249,7 @@
return get_base_packet<packet_type>::get_ploaddup(from); \
}
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_half8)
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
@@ -165,9 +263,14 @@
}
SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2)
-
#undef SYCL_PLSET
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
+ const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
+ return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half &) a);
+}
+
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
@@ -176,9 +279,9 @@
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
+SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
-
#undef SYCL_PGATHER_SPECILIZE
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
@@ -189,6 +292,7 @@
get_base_packet<packet_type>::set_pscatter(to, from, stride); \
}
+SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
@@ -201,11 +305,17 @@
return cl::sycl::mad(a, b, c); \
}
+SYCL_PMAD(cl::sycl::cl_half8)
SYCL_PMAD(cl::sycl::cl_float4)
SYCL_PMAD(cl::sycl::cl_double2)
#undef SYCL_PMAD
template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
+ const cl::sycl::cl_half8& a) {
+ return Eigen::half(a.s0());
+}
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) {
return a.x();
@@ -217,6 +327,13 @@
}
template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
+ const cl::sycl::cl_half8& a) {
+ return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5()
+ + a.s6() + a.s7());
+}
+
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) {
return a.x() + a.y() + a.z() + a.w();
@@ -229,6 +346,17 @@
}
template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
+ const cl::sycl::cl_half8& a) {
+ return Eigen::half(cl::sycl::fmax(
+ cl::sycl::fmax(
+ cl::sycl::fmax(a.s0(), a.s1()),
+ cl::sycl::fmax(a.s2(), a.s3())),
+ cl::sycl::fmax(
+ cl::sycl::fmax(a.s4(), a.s5()),
+ cl::sycl::fmax(a.s6(), a.s7()))));
+}
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) {
return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()),
@@ -241,6 +369,17 @@
}
template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
+ const cl::sycl::cl_half8& a) {
+ return Eigen::half(cl::sycl::fmin(
+ cl::sycl::fmin(
+ cl::sycl::fmin(a.s0(), a.s1()),
+ cl::sycl::fmin(a.s2(), a.s3())),
+ cl::sycl::fmin(
+ cl::sycl::fmin(a.s4(), a.s5()),
+ cl::sycl::fmin(a.s6(), a.s7()))));
+}
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) {
return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()),
@@ -253,6 +392,12 @@
}
template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
+ const cl::sycl::cl_half8& a) {
+ return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
+ a.s6() * a.s7());
+}
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) {
return a.x() * a.y() * a.z() * a.w();
@@ -264,6 +409,14 @@
}
template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+ return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
+ cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
+ cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+ cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
+}
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()),
@@ -300,6 +453,9 @@
return sycl_pcmp_##OP<TYPE>(a, b); \
}
+SYCL_PCMP(le, cl::sycl::cl_half8)
+SYCL_PCMP(lt, cl::sycl::cl_half8)
+SYCL_PCMP(eq, cl::sycl::cl_half8)
SYCL_PCMP(le, cl::sycl::cl_float4)
SYCL_PCMP(lt, cl::sycl::cl_float4)
SYCL_PCMP(eq, cl::sycl::cl_float4)
@@ -309,6 +465,121 @@
#undef SYCL_PCMP
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+ PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+ cl::sycl::cl_half tmp = kernel.packet[0].s1();
+ kernel.packet[0].s1() = kernel.packet[1].s0();
+ kernel.packet[1].s0() = tmp;
+
+ tmp = kernel.packet[0].s2();
+ kernel.packet[0].s2() = kernel.packet[2].s0();
+ kernel.packet[2].s0() = tmp;
+
+ tmp = kernel.packet[0].s3();
+ kernel.packet[0].s3() = kernel.packet[3].s0();
+ kernel.packet[3].s0() = tmp;
+
+ tmp = kernel.packet[0].s4();
+ kernel.packet[0].s4() = kernel.packet[4].s0();
+ kernel.packet[4].s0() = tmp;
+
+ tmp = kernel.packet[0].s5();
+ kernel.packet[0].s5() = kernel.packet[5].s0();
+ kernel.packet[5].s0() = tmp;
+
+ tmp = kernel.packet[0].s6();
+ kernel.packet[0].s6() = kernel.packet[6].s0();
+ kernel.packet[6].s0() = tmp;
+
+ tmp = kernel.packet[0].s7();
+ kernel.packet[0].s7() = kernel.packet[7].s0();
+ kernel.packet[7].s0() = tmp;
+
+ tmp = kernel.packet[1].s2();
+ kernel.packet[1].s2() = kernel.packet[2].s1();
+ kernel.packet[2].s1() = tmp;
+
+ tmp = kernel.packet[1].s3();
+ kernel.packet[1].s3() = kernel.packet[3].s1();
+ kernel.packet[3].s1() = tmp;
+
+ tmp = kernel.packet[1].s4();
+ kernel.packet[1].s4() = kernel.packet[4].s1();
+ kernel.packet[4].s1() = tmp;
+
+ tmp = kernel.packet[1].s5();
+ kernel.packet[1].s5() = kernel.packet[5].s1();
+ kernel.packet[5].s1() = tmp;
+
+ tmp = kernel.packet[1].s6();
+ kernel.packet[1].s6() = kernel.packet[6].s1();
+ kernel.packet[6].s1() = tmp;
+
+ tmp = kernel.packet[1].s7();
+ kernel.packet[1].s7() = kernel.packet[7].s1();
+ kernel.packet[7].s1() = tmp;
+
+ tmp = kernel.packet[2].s3();
+ kernel.packet[2].s3() = kernel.packet[3].s2();
+ kernel.packet[3].s2() = tmp;
+
+ tmp = kernel.packet[2].s4();
+ kernel.packet[2].s4() = kernel.packet[4].s2();
+ kernel.packet[4].s2() = tmp;
+
+ tmp = kernel.packet[2].s5();
+ kernel.packet[2].s5() = kernel.packet[5].s2();
+ kernel.packet[5].s2() = tmp;
+
+ tmp = kernel.packet[2].s6();
+ kernel.packet[2].s6() = kernel.packet[6].s2();
+ kernel.packet[6].s2() = tmp;
+
+ tmp = kernel.packet[2].s7();
+ kernel.packet[2].s7() = kernel.packet[7].s2();
+ kernel.packet[7].s2() = tmp;
+
+ tmp = kernel.packet[3].s4();
+ kernel.packet[3].s4() = kernel.packet[4].s3();
+ kernel.packet[4].s3() = tmp;
+
+ tmp = kernel.packet[3].s5();
+ kernel.packet[3].s5() = kernel.packet[5].s3();
+ kernel.packet[5].s3() = tmp;
+
+ tmp = kernel.packet[3].s6();
+ kernel.packet[3].s6() = kernel.packet[6].s3();
+ kernel.packet[6].s3() = tmp;
+
+ tmp = kernel.packet[3].s7();
+ kernel.packet[3].s7() = kernel.packet[7].s3();
+ kernel.packet[7].s3() = tmp;
+
+ tmp = kernel.packet[4].s5();
+ kernel.packet[4].s5() = kernel.packet[5].s4();
+ kernel.packet[5].s4() = tmp;
+
+ tmp = kernel.packet[4].s6();
+ kernel.packet[4].s6() = kernel.packet[6].s4();
+ kernel.packet[6].s4() = tmp;
+
+ tmp = kernel.packet[4].s7();
+ kernel.packet[4].s7() = kernel.packet[7].s4();
+ kernel.packet[7].s4() = tmp;
+
+ tmp = kernel.packet[5].s6();
+ kernel.packet[5].s6() = kernel.packet[6].s5();
+ kernel.packet[6].s5() = tmp;
+
+ tmp = kernel.packet[5].s7();
+ kernel.packet[5].s7() = kernel.packet[7].s5();
+ kernel.packet[7].s5() = tmp;
+
+ tmp = kernel.packet[6].s7();
+ kernel.packet[6].s7() = kernel.packet[7].s6();
+ kernel.packet[7].s6() = tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
float tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
@@ -343,6 +614,19 @@
}
template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
+ const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
+ const cl::sycl::cl_half8& thenPacket,
+ const cl::sycl::cl_half8& elsePacket) {
+ cl::sycl::cl_short8 condition(
+ ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
+ ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
+ ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+ ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+ return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
const cl::sycl::cl_float4& thenPacket,
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index bb382a7..d05e7d1 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -825,7 +825,7 @@
// For regular block expressions, simply forward along the InnerPanel argument,
// which is set when calling row/column expressions.
- static constexpr bool is_inner_panel(bool inner_panel) { return inner_panel; };
+ static constexpr bool is_inner_panel(bool inner_panel) { return inner_panel; }
// Only enable non-const base function if XprType is not const (otherwise we get a duplicate definition).
template<typename T = XprType, typename EnableIf=std::enable_if_t<!std::is_const<T>::value>>
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 8f64863..1cfff1b 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -1432,7 +1432,7 @@
typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
typedef typename TransformType::MatrixType MatrixType;
typedef Transform<typename Other::Scalar,Dim,Projective,Options> ResultType;
- static ResultType run(const Other& other,const TransformType& tr)
+ static EIGEN_DEVICE_FUNC ResultType run(const Other& other,const TransformType& tr)
{ return ResultType(other * tr.matrix()); }
};
@@ -1443,7 +1443,7 @@
typedef Transform<typename Other::Scalar,Dim,AffineCompact,Options> TransformType;
typedef typename TransformType::MatrixType MatrixType;
typedef Transform<typename Other::Scalar,Dim,Projective,Options> ResultType;
- static ResultType run(const Other& other,const TransformType& tr)
+ static EIGEN_DEVICE_FUNC ResultType run(const Other& other,const TransformType& tr)
{
ResultType res;
res.matrix().noalias() = other.template block<HDim,Dim>(0,0) * tr.matrix();
@@ -1459,7 +1459,7 @@
typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
typedef typename TransformType::MatrixType MatrixType;
typedef TransformType ResultType;
- static ResultType run(const Other& other,const TransformType& tr)
+ static EIGEN_DEVICE_FUNC ResultType run(const Other& other,const TransformType& tr)
{
ResultType res;
res.affine().noalias() = other * tr.matrix();
@@ -1475,7 +1475,7 @@
typedef Transform<typename Other::Scalar,Dim,AffineCompact,Options> TransformType;
typedef typename TransformType::MatrixType MatrixType;
typedef TransformType ResultType;
- static ResultType run(const Other& other,const TransformType& tr)
+ static EIGEN_DEVICE_FUNC ResultType run(const Other& other,const TransformType& tr)
{
ResultType res;
res.matrix().noalias() = other.template block<Dim,Dim>(0,0) * tr.matrix();
@@ -1491,7 +1491,7 @@
typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
typedef typename TransformType::MatrixType MatrixType;
typedef TransformType ResultType;
- static ResultType run(const Other& other, const TransformType& tr)
+ static EIGEN_DEVICE_FUNC ResultType run(const Other& other, const TransformType& tr)
{
TransformType res;
if(Mode!=int(AffineCompact))
@@ -1513,7 +1513,7 @@
typedef Transform<Scalar,Dim,LhsMode,LhsOptions> Lhs;
typedef Transform<Scalar,Dim,RhsMode,RhsOptions> Rhs;
typedef Transform<Scalar,Dim,ResultMode,LhsOptions> ResultType;
- static ResultType run(const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs)
{
ResultType res;
res.linear() = lhs.linear() * rhs.linear();
@@ -1529,7 +1529,7 @@
typedef Transform<Scalar,Dim,LhsMode,LhsOptions> Lhs;
typedef Transform<Scalar,Dim,RhsMode,RhsOptions> Rhs;
typedef Transform<Scalar,Dim,Projective> ResultType;
- static ResultType run(const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs)
{
return ResultType( lhs.matrix() * rhs.matrix() );
}
@@ -1541,7 +1541,7 @@
typedef Transform<Scalar,Dim,AffineCompact,LhsOptions> Lhs;
typedef Transform<Scalar,Dim,Projective,RhsOptions> Rhs;
typedef Transform<Scalar,Dim,Projective> ResultType;
- static ResultType run(const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs)
{
ResultType res;
res.matrix().template topRows<Dim>() = lhs.matrix() * rhs.matrix();
@@ -1556,7 +1556,7 @@
typedef Transform<Scalar,Dim,Projective,LhsOptions> Lhs;
typedef Transform<Scalar,Dim,AffineCompact,RhsOptions> Rhs;
typedef Transform<Scalar,Dim,Projective> ResultType;
- static ResultType run(const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs)
{
ResultType res(lhs.matrix().template leftCols<Dim>() * rhs.matrix());
res.matrix().col(Dim) += lhs.matrix().col(Dim);
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index ced7595..f9f7979 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -274,8 +274,8 @@
m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
manageErrorCode(error);
- m_analysisIsOk = true;
- m_factorizationIsOk = true;
+ m_analysisIsOk = m_info == Eigen::Success;
+ m_factorizationIsOk = m_info == Eigen::Success;
m_isInitialized = true;
return derived();
}
@@ -296,7 +296,7 @@
m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
manageErrorCode(error);
- m_analysisIsOk = true;
+ m_analysisIsOk = m_info == Eigen::Success;
m_factorizationIsOk = false;
m_isInitialized = true;
return derived();
@@ -316,7 +316,7 @@
m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
manageErrorCode(error);
- m_factorizationIsOk = true;
+ m_factorizationIsOk = m_info == Eigen::Success;
return derived();
}
diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h
index 5cac825..2e53114 100644
--- a/Eigen/src/SparseCore/SparseTriangularView.h
+++ b/Eigen/src/SparseCore/SparseTriangularView.h
@@ -152,8 +152,8 @@
}
}
-// inline Index row() const { return (ArgType::Flags&RowMajorBit ? Base::outer() : this->index()); }
-// inline Index col() const { return (ArgType::Flags&RowMajorBit ? this->index() : Base::outer()); }
+ inline Index row() const { return (ArgType::Flags&RowMajorBit ? Base::outer() : this->index()); }
+ inline Index col() const { return (ArgType::Flags&RowMajorBit ? this->index() : Base::outer()); }
inline StorageIndex index() const
{
if(HasUnitDiag && m_returnOne) return internal::convert_index<StorageIndex>(Base::outer());
diff --git a/debug/msvc/eigen.natvis b/debug/msvc/eigen.natvis
index 22cf346..da89857 100644
--- a/debug/msvc/eigen.natvis
+++ b/debug/msvc/eigen.natvis
@@ -1,235 +1,235 @@
-<?xml version="1.0" encoding="utf-8"?>
-
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-
- <!-- Fixed x Fixed Matrix -->
- <Type Name="Eigen::Matrix<*,*,*,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
- <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : $T3</Size>
- <ValuePointer>m_storage.m_data.array</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : $T3</Size>
- <ValuePointer>m_storage.m_data.array</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- 2 x 2 Matrix -->
- <Type Name="Eigen::Matrix<*,2,2,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/>
- <DisplayString>[2, 2] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- 3 x 3 Matrix -->
- <Type Name="Eigen::Matrix<*,3,3,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/>
- <DisplayString>[3, 3] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- 4 x 4 Matrix -->
- <Type Name="Eigen::Matrix<*,4,4,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/>
- <DisplayString>[4, 4] (fixed matrix)</DisplayString>
- <Expand>
- <Synthetic Name="[row 0]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 0]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 1]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 2]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 3]" Condition="Flags%2">
- <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
- </Synthetic>
- <Synthetic Name="[row 3]" Condition="!(Flags%2)">
- <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
- </Synthetic>
- </Expand>
- </Type>
-
- <!-- Dynamic x Dynamic Matrix -->
- <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Fixed x Dynamic Matrix -->
- <Type Name="Eigen::Matrix<*,*,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic x Fixed Matrix -->
- <Type Name="Eigen::Matrix<*,-1,*,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
- <Expand>
- <ArrayItems Condition="Flags%2"> <!-- row major layout -->
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : $T2</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
- <Direction>Backward</Direction>
- <Rank>2</Rank>
- <Size>$i==0 ? m_storage.m_rows : $T2</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic Column Vector -->
- <Type Name="Eigen::Matrix<*,1,-1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
- <Expand>
- <Item Name="[size]">m_storage.m_cols</Item>
- <ArrayItems>
- <Size>m_storage.m_cols</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Dynamic Row Vector -->
- <Type Name="Eigen::Matrix<*,-1,1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/>
- <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
- <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
- <Expand>
- <Item Name="[size]">m_storage.m_rows</Item>
- <ArrayItems>
- <Size>m_storage.m_rows</Size>
- <ValuePointer>m_storage.m_data</ValuePointer>
- </ArrayItems>
- </Expand>
- </Type>
-
- <!-- Fixed Vector -->
- <Type Name="Eigen::Matrix<*,1,1,*,*,*>">
- <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/>
- <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,2,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/>
- <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,3,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/>
- <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- <Item Name="[z]">m_storage.m_data.array[2]</Item>
- </Expand>
- </Type>
-
- <Type Name="Eigen::Matrix<*,4,1,*,*,*>">
- <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/>
- <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/>
- <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
- <Expand>
- <Item Name="[x]">m_storage.m_data.array[0]</Item>
- <Item Name="[y]">m_storage.m_data.array[1]</Item>
- <Item Name="[z]">m_storage.m_data.array[2]</Item>
- <Item Name="[w]">m_storage.m_data.array[3]</Item>
- </Expand>
- </Type>
-
-</AutoVisualizer>
+<?xml version="1.0" encoding="utf-8"?>
+
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+ <!-- Fixed x Fixed Matrix -->
+ <Type Name="Eigen::Matrix<*,*,*,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
+ <DisplayString>[{$T2}, {$T3}] (fixed matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : $T3</Size>
+ <ValuePointer>m_storage.m_data.array</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : $T3</Size>
+ <ValuePointer>m_storage.m_data.array</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- 2 x 2 Matrix -->
+ <Type Name="Eigen::Matrix<*,2,2,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,2,2,*,*,*>"/>
+ <DisplayString>[2, 2] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[2]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- 3 x 3 Matrix -->
+ <Type Name="Eigen::Matrix<*,3,3,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,3,3,*,*,*>"/>
+ <DisplayString>[3, 3] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[3]}, {m_storage.m_data.array[6]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[5]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[7]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[6]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[8]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[8]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- 4 x 4 Matrix -->
+ <Type Name="Eigen::Matrix<*,4,4,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,4,4,*,*,*>"/>
+ <DisplayString>[4, 4] (fixed matrix)</DisplayString>
+ <Expand>
+ <Synthetic Name="[row 0]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 0]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[0]}, {m_storage.m_data.array[4]}, {m_storage.m_data.array[8]}, {m_storage.m_data.array[12]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[4]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[7]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 1]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[1]}, {m_storage.m_data.array[5]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[13]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[8]}, {m_storage.m_data.array[9]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[11]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 2]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[2]}, {m_storage.m_data.array[6]}, {m_storage.m_data.array[10]}, {m_storage.m_data.array[14]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 3]" Condition="Flags%2">
+ <DisplayString>({m_storage.m_data.array[12]}, {m_storage.m_data.array[13]}, {m_storage.m_data.array[14]}, {m_storage.m_data.array[15]})</DisplayString>
+ </Synthetic>
+ <Synthetic Name="[row 3]" Condition="!(Flags%2)">
+ <DisplayString>({m_storage.m_data.array[3]}, {m_storage.m_data.array[7]}, {m_storage.m_data.array[11]}, {m_storage.m_data.array[15]})</DisplayString>
+ </Synthetic>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic x Dynamic Matrix -->
+ <Type Name="Eigen::Matrix<*,-1,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {m_storage.m_cols}] (dynamic matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Fixed x Dynamic Matrix -->
+ <Type Name="Eigen::Matrix<*,*,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,*,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{$T2}, {m_storage.m_cols}] (dynamic column matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? $T2 : m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic x Fixed Matrix -->
+ <Type Name="Eigen::Matrix<*,-1,*,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,*,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}, {$T2}] (dynamic row matrix)</DisplayString>
+ <Expand>
+ <ArrayItems Condition="Flags%2"> <!-- row major layout -->
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ <ArrayItems Condition="!(Flags%2)"> <!-- column major layout -->
+ <Direction>Backward</Direction>
+ <Rank>2</Rank>
+ <Size>$i==0 ? m_storage.m_rows : $T2</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic Column Vector -->
+ <Type Name="Eigen::Matrix<*,1,-1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,1,-1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_cols}] (dynamic column vector)</DisplayString>
+ <Expand>
+ <Item Name="[size]">m_storage.m_cols</Item>
+ <ArrayItems>
+ <Size>m_storage.m_cols</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Dynamic Row Vector -->
+ <Type Name="Eigen::Matrix<*,-1,1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,-1,1,*,*,*>"/>
+ <DisplayString Condition="m_storage.m_data == 0">empty</DisplayString>
+ <DisplayString Condition="m_storage.m_data != 0">[{m_storage.m_rows}] (dynamic row vector)</DisplayString>
+ <Expand>
+ <Item Name="[size]">m_storage.m_rows</Item>
+ <ArrayItems>
+ <Size>m_storage.m_rows</Size>
+ <ValuePointer>m_storage.m_data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <!-- Fixed Vector -->
+ <Type Name="Eigen::Matrix<*,1,1,*,*,*>">
+ <AlternativeType Name="Eigen::Array<*,1,1,*,*,*>"/>
+ <DisplayString>[1] ({m_storage.m_data.array[0]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,2,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,2,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,2,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,2,*,*,*>"/>
+ <DisplayString>[2] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,3,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,3,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,3,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,3,*,*,*>"/>
+ <DisplayString>[3] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ <Item Name="[z]">m_storage.m_data.array[2]</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="Eigen::Matrix<*,4,1,*,*,*>">
+ <AlternativeType Name="Eigen::Matrix<*,1,4,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,4,1,*,*,*>"/>
+ <AlternativeType Name="Eigen::Array<*,1,4,*,*,*>"/>
+ <DisplayString>[4] ({m_storage.m_data.array[0]}, {m_storage.m_data.array[1]}, {m_storage.m_data.array[2]}, {m_storage.m_data.array[3]})</DisplayString>
+ <Expand>
+ <Item Name="[x]">m_storage.m_data.array[0]</Item>
+ <Item Name="[y]">m_storage.m_data.array[1]</Item>
+ <Item Name="[z]">m_storage.m_data.array[2]</Item>
+ <Item Name="[w]">m_storage.m_data.array[3]</Item>
+ </Expand>
+ </Type>
+
+</AutoVisualizer>
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 273c10d..35ef580 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -1,295 +1,295 @@
-; ***************************************************************
-; * Eigen Visualizer
-; *
-; * Author: Hauke Heibel <hauke.heibel@gmail.com>
-; *
-; * Support the enhanced debugging of the following Eigen
-; * types (*: any, +:fixed dimension) :
-; *
-; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
-; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
-; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
-; * - Eigen::Matrix<*,-1,-1,*,*,*>
-; * - Eigen::Matrix<*,+,-1,*,*,*>
-; * - Eigen::Matrix<*,-1,+,*,*,*>
-; * - Eigen::Matrix<*,+,+,*,*,*>
-; *
-; * Matrices are displayed properly independently of the memory
-; * alignment (RowMajor vs. ColMajor).
-; *
-; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
-; * that your original autoexp.dat file is copied to a safe
-; * place before proceeding with its modification.
-; ***************************************************************
-
-[Visualizer]
-
-; Fixed size 4-vectors
-Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1],
- z : ($c.m_storage.m_data.array)[2],
- w : ($c.m_storage.m_data.array)[3]
- )
- )
-
- preview
- (
- #(
- "[",
- 4,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 4),
- ")"
- )
- )
-}
-
-; Fixed size 3-vectors
-Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1],
- z : ($c.m_storage.m_data.array)[2]
- )
- )
-
- preview
- (
- #(
- "[",
- 3,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 3),
- ")"
- )
- )
-}
-
-; Fixed size 2-vectors
-Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0],
- y : ($c.m_storage.m_data.array)[1]
- )
- )
-
- preview
- (
- #(
- "[",
- 2,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 2),
- ")"
- )
- )
-}
-
-; Fixed size 1-vectors
-Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- x : ($c.m_storage.m_data.array)[0]
- )
- )
-
- preview
- (
- #(
- "[",
- 1,
- "](",
- #array(expr: $e.m_storage.m_data.array[$i], size: 1),
- ")"
- )
- )
-}
-
-; Dynamic matrices (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,-1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.m_storage.m_rows,
- cols: $c.m_storage.m_cols,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.m_storage.m_rows,
- ",",
- $c.m_storage.m_cols,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.m_storage.m_rows*$c.m_storage.m_cols
- ),
- ")"
- )
- )
-}
-
-; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,-1,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.RowsAtCompileTime,
- cols: $c.m_storage.m_cols,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.RowsAtCompileTime,
- ",",
- $c.m_storage.m_cols,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.RowsAtCompileTime*$c.m_storage.m_cols
- ),
- ")"
- )
- )
-}
-
-; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,-1,*,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.m_storage.m_rows,
- cols: $c.ColsAtCompileTime,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data)[$i],
- size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.m_storage.m_rows,
- ",",
- $c.ColsAtCompileTime,
- "](",
- #array(
- expr : [($c.m_storage.m_data)[$i],g],
- size : $c.m_storage.m_rows*$c.ColsAtCompileTime
- ),
- ")"
- )
- )
-}
-
-; Fixed size matrix (ColMajor and RowMajor support)
-Eigen::Matrix<*,*,*,*,*,*>{
- children
- (
- #(
- [internals]: [$c,!],
- rows: $c.RowsAtCompileTime,
- cols: $c.ColsAtCompileTime,
- ; Check for RowMajorBit
- #if ($c.Flags & 0x1) (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
- )
- ) #else (
- #array(
- rank: 2,
- base: 0,
- expr: ($c.m_storage.m_data.array)[$i],
- size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
- )
- )
- )
- )
-
- preview
- (
- #(
- "[",
- $c.RowsAtCompileTime,
- ",",
- $c.ColsAtCompileTime,
- "](",
- #array(
- expr : [($c.m_storage.m_data.array)[$i],g],
- size : $c.RowsAtCompileTime*$c.ColsAtCompileTime
- ),
- ")"
- )
- )
-}
+; ***************************************************************
+; * Eigen Visualizer
+; *
+; * Author: Hauke Heibel <hauke.heibel@gmail.com>
+; *
+; * Support the enhanced debugging of the following Eigen
+; * types (*: any, +:fixed dimension) :
+; *
+; * - Eigen::Matrix<*,4,1,*,*,*> and Eigen::Matrix<*,1,4,*,*,*>
+; * - Eigen::Matrix<*,3,1,*,*,*> and Eigen::Matrix<*,1,3,*,*,*>
+; * - Eigen::Matrix<*,2,1,*,*,*> and Eigen::Matrix<*,1,2,*,*,*>
+; * - Eigen::Matrix<*,-1,-1,*,*,*>
+; * - Eigen::Matrix<*,+,-1,*,*,*>
+; * - Eigen::Matrix<*,-1,+,*,*,*>
+; * - Eigen::Matrix<*,+,+,*,*,*>
+; *
+; * Matrices are displayed properly independently of the memory
+; * alignment (RowMajor vs. ColMajor).
+; *
+; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
+; * that your original autoexp.dat file is copied to a safe
+; * place before proceeding with its modification.
+; ***************************************************************
+
+[Visualizer]
+
+; Fixed size 4-vectors
+Eigen::Matrix<*,4,1,*,*,*>|Eigen::Matrix<*,1,4,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1],
+ z : ($c.m_storage.m_data.array)[2],
+ w : ($c.m_storage.m_data.array)[3]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 4,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 4),
+ ")"
+ )
+ )
+}
+
+; Fixed size 3-vectors
+Eigen::Matrix<*,3,1,*,*,*>|Eigen::Matrix<*,1,3,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1],
+ z : ($c.m_storage.m_data.array)[2]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 3,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 3),
+ ")"
+ )
+ )
+}
+
+; Fixed size 2-vectors
+Eigen::Matrix<*,2,1,*,*,*>|Eigen::Matrix<*,1,2,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0],
+ y : ($c.m_storage.m_data.array)[1]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 2,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 2),
+ ")"
+ )
+ )
+}
+
+; Fixed size 1-vectors
+Eigen::Matrix<*,1,1,*,*,*>|Eigen::Matrix<*,1,1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ x : ($c.m_storage.m_data.array)[0]
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ 1,
+ "](",
+ #array(expr: $e.m_storage.m_data.array[$i], size: 1),
+ ")"
+ )
+ )
+}
+
+; Dynamic matrices (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,-1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.m_storage.m_rows,
+ cols: $c.m_storage.m_cols,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.m_storage.m_cols + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.m_storage.m_cols
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.m_storage.m_rows,
+ ",",
+ $c.m_storage.m_cols,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.m_storage.m_rows*$c.m_storage.m_cols
+ ),
+ ")"
+ )
+ )
+}
+
+; Fixed rows, dynamic columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,-1,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.RowsAtCompileTime,
+ cols: $c.m_storage.m_cols,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.RowsAtCompileTime)*$c.m_storage.m_cols + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.m_storage.m_cols
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.RowsAtCompileTime,
+ ",",
+ $c.m_storage.m_cols,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.RowsAtCompileTime*$c.m_storage.m_cols
+ ),
+ ")"
+ )
+ )
+}
+
+; Dynamic rows, fixed columns matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,-1,*,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.m_storage.m_rows,
+ cols: $c.ColsAtCompileTime,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[($i % $c.m_storage.m_rows)*$c.ColsAtCompileTime + (($i- $i % $c.m_storage.m_rows)/$c.m_storage.m_rows)],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data)[$i],
+ size: ($r==1)*$c.m_storage.m_rows+($r==0)*$c.ColsAtCompileTime
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.m_storage.m_rows,
+ ",",
+ $c.ColsAtCompileTime,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data)[$i],g],
+ size : $c.m_storage.m_rows*$c.ColsAtCompileTime
+ ),
+ ")"
+ )
+ )
+}
+
+; Fixed size matrix (ColMajor and RowMajor support)
+Eigen::Matrix<*,*,*,*,*,*>{
+ children
+ (
+ #(
+ [internals]: [$c,!],
+ rows: $c.RowsAtCompileTime,
+ cols: $c.ColsAtCompileTime,
+ ; Check for RowMajorBit
+ #if ($c.Flags & 0x1) (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data.array)[($i % $c.RowsAtCompileTime)*$c.ColsAtCompileTime + (($i- $i % $c.RowsAtCompileTime)/$c.RowsAtCompileTime)],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+ )
+ ) #else (
+ #array(
+ rank: 2,
+ base: 0,
+ expr: ($c.m_storage.m_data.array)[$i],
+ size: ($r==1)*$c.RowsAtCompileTime+($r==0)*$c.ColsAtCompileTime
+ )
+ )
+ )
+ )
+
+ preview
+ (
+ #(
+ "[",
+ $c.RowsAtCompileTime,
+ ",",
+ $c.ColsAtCompileTime,
+ "](",
+ #array(
+ expr : [($c.m_storage.m_data.array)[$i],g],
+ size : $c.RowsAtCompileTime*$c.ColsAtCompileTime
+ ),
+ ")"
+ )
+ )
+}
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 9e102b7..298619e 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -746,6 +746,11 @@
// check sparse-triangular to dense
refMat3 = m2.template triangularView<StrictlyUpper>();
VERIFY_IS_APPROX(refMat3, DenseMatrix(refMat2.template triangularView<StrictlyUpper>()));
+
+ // check sparse triangular view iteration-based evaluation
+ m2.setZero();
+ VERIFY_IS_CWISE_EQUAL(m2.template triangularView<UnitLower>().toDense(), DenseMatrix::Identity(rows, cols));
+ VERIFY_IS_CWISE_EQUAL(m2.template triangularView<UnitUpper>().toDense(), DenseMatrix::Identity(rows, cols));
}
// test selfadjointView
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index 9cdf2c0..d487333 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -1 +1 @@
-#include "../../../Eigen/ThreadPool"
\ No newline at end of file
+#include "../../../Eigen/ThreadPool" // IWYU pragma: export
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index cb74031..480e3ac 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -1118,7 +1118,7 @@
: globalContractDimOffset + privateOffsetC)
: OutScalar(0);
- outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+ outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
privateOffsetNC += Properties::LocalThreadSizeNC;
}
privateOffsetC += Properties::LocalThreadSizeC;
@@ -1263,7 +1263,7 @@
StorageIndex localid = itemID.get_local_id(0);
OutScalar accumulator = OutScalar(0);
for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
- accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+ accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
}
auto out_scratch_ptr = scratch_ptr + localid;
*out_scratch_ptr = accumulator;
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 1d40ae5..d41baf2 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -151,6 +151,7 @@
ei_add_test(cxx11_tensor_argmax_sycl)
ei_add_test(cxx11_tensor_custom_op_sycl)
ei_add_test(cxx11_tensor_scan_sycl)
+ ei_add_test(cxx11_tensor_of_float16_sycl)
set(EIGEN_SYCL OFF)
endif()
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
index 41ea3cf..8f4e095 100644
--- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -32,9 +32,9 @@
Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
in.setRandom();
- in *= in.constant(100.0);
- in(0, 0, 0) = -1000.0;
- in(1, 1, 1) = 1000.0;
+ in *= in.constant(static_cast<DataType>(100.0));
+ in(0, 0, 0) = static_cast<DataType>(-1000.0);
+ in(1, 1, 1) = static_cast<DataType>(1000.0);
std::size_t in_bytes = in.size() * sizeof(DataType);
std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
@@ -93,7 +93,7 @@
ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
// = 10.0
- tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+ tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? -1.0 : 10.0);
}
}
}
@@ -132,7 +132,7 @@
ix[2] = k;
ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
- tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+ tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0);
}
}
}
@@ -180,7 +180,7 @@
ix[2] = k;
ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
- tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+ tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? 1.0 : -10.0);
}
}
}
@@ -219,7 +219,7 @@
ix[2] = k;
ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
- tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+ tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0);
}
}
}
@@ -252,6 +252,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_argmax_test_per_device<half>(device));
CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 20f84b8..6ca5ac7 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -139,6 +139,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_broadcast_test_per_device<half>(device));
CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index be7afd2..c8668d8 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -28,26 +28,135 @@
// Functions used to compare the TensorMap implementation on the device with
// the equivalent on the host
namespace SYCL {
-template <typename T> T abs(T x) {
+
+template <typename T> T abs(T x) {
return cl::sycl::abs(x);
}
+template <> Eigen::half abs(Eigen::half x) {
+ return Eigen::half(cl::sycl::fabs(static_cast<cl::sycl::half>(x)));
+}
-template <> float abs(float x) {
+template <> float abs(float x) {
return cl::sycl::fabs(x);
}
-template <> double abs(double x) {
+template <> double abs(double x) {
return cl::sycl::fabs(x);
}
template <typename T> T square(T x) { return x * x; }
template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return T(1) / x; }
-template <typename T> T cwiseMax(T x, T y) {
-return cl::sycl::max(x, y);
+template <typename T> T cwiseMax(T x, T y) {
+ return cl::sycl::max(x, y);
}
-template <typename T> T cwiseMin(T x, T y) {
- return cl::sycl::min(x, y);
+template <> Eigen::half cwiseMax(Eigen::half x, Eigen::half y) {
+return Eigen::half(cl::sycl::max(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T cwiseMin(T x, T y) {
+ return cl::sycl::min(x, y);
+}
+template <> Eigen::half cwiseMin(Eigen::half x, Eigen::half y) {
+ return Eigen::half(cl::sycl::min(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T sqrt(T x) {
+ return cl::sycl::sqrt(x);
+}
+template <> Eigen::half sqrt(Eigen::half x) {
+ return Eigen::half(cl::sycl::sqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T rsqrt(T x) {
+ return cl::sycl::rsqrt(x);
+}
+template <> Eigen::half rsqrt(Eigen::half x) {
+ return Eigen::half(cl::sycl::rsqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T tanh(T x) {
+ return cl::sycl::tanh(x);
+}
+template <> Eigen::half tanh(Eigen::half x) {
+ return Eigen::half(cl::sycl::tanh(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T exp(T x) {
+ return cl::sycl::exp(x);
+}
+template <> Eigen::half exp(Eigen::half x) {
+ return Eigen::half(cl::sycl::exp(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T expm1(T x) {
+ return cl::sycl::expm1(x);
+}
+template <> Eigen::half expm1(Eigen::half x) {
+ return Eigen::half(cl::sycl::expm1(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log(T x) {
+ return cl::sycl::log(x);
+}
+template <> Eigen::half log(Eigen::half x) {
+ return Eigen::half(cl::sycl::log(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T ceil(T x) {
+ return cl::sycl::ceil(x);
+}
+template <> Eigen::half ceil(Eigen::half x) {
+ return Eigen::half(cl::sycl::ceil(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T floor(T x) {
+ return cl::sycl::floor(x);
+}
+template <> Eigen::half floor(Eigen::half x) {
+ return Eigen::half(cl::sycl::floor(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T round(T x) {
+ return cl::sycl::round(x);
+}
+template <> Eigen::half round(Eigen::half x) {
+ return Eigen::half(cl::sycl::round(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log1p(T x) {
+ return cl::sycl::log1p(x);
+}
+template <> Eigen::half log1p(Eigen::half x) {
+ return Eigen::half(cl::sycl::log1p(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T sign(T x) {
+ return cl::sycl::sign(x);
+}
+template <> Eigen::half sign(Eigen::half x) {
+ return Eigen::half(cl::sycl::sign(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isnan(T x) {
+ return cl::sycl::isnan(x);
+}
+template <> Eigen::half isnan(Eigen::half x) {
+ return Eigen::half(cl::sycl::isnan(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isfinite(T x) {
+ return cl::sycl::isfinite(x);
+}
+template <> Eigen::half isfinite(Eigen::half x) {
+ return Eigen::half(cl::sycl::isfinite(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isinf(T x) {
+ return cl::sycl::isinf(x);
+}
+template <> Eigen::half isinf(Eigen::half x) {
+ return Eigen::half(cl::sycl::isinf(static_cast<cl::sycl::half>(x)));
}
}
@@ -157,15 +266,14 @@
#define DECLARE_UNARY_STRUCT(FUNC) \
struct op_##FUNC { \
template <typename T> \
- auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \
- return cl::sycl::FUNC(x); \
+ auto operator()(const T& x) -> decltype(SYCL::FUNC(x)) { \
+ return SYCL::FUNC(x); \
} \
template <typename T> \
auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
return x.FUNC(); \
} \
- };
-
+};
DECLARE_UNARY_STRUCT(sqrt)
DECLARE_UNARY_STRUCT(rsqrt)
@@ -390,8 +498,10 @@
for (const auto& device :Eigen::get_sycl_supported_devices()) {
QueueInterface queueInterface(device);
Eigen::SyclDevice sycl_device(&queueInterface);
- CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
- CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
- CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+ CALL_SUBTEST_1(test_builtin_unary_sycl<half>(sycl_device));
+ CALL_SUBTEST_2(test_floating_builtin_binary_sycl<half>(sycl_device));
+ CALL_SUBTEST_3(test_builtin_unary_sycl<float>(sycl_device));
+ CALL_SUBTEST_4(test_floating_builtin_binary_sycl<float>(sycl_device));
+ CALL_SUBTEST_5(test_integer_builtin_binary_sycl<int>(sycl_device));
}
}
diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
index 1e70931..b018da8 100644
--- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -619,5 +619,6 @@
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+ CALL_SUBTEST(sycl_chipping_test_per_device<half>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
index 765991b..18dc95b 100644
--- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -175,6 +175,7 @@
}
EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(tensorConcat_perDevice<half>(device));
CALL_SUBTEST(tensorConcat_perDevice<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 2f75ef8..66a13e8 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -82,6 +82,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_device_test_per_device<half>(device));
CALL_SUBTEST(sycl_device_test_per_device<float>(device));
CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
}
diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
index db1c020..083c004 100644
--- a/unsupported/test/cxx11_tensor_image_op_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
@@ -95,6 +95,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
index 521ae0c..75c2c0e 100644
--- a/unsupported/test/cxx11_tensor_inflation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -131,6 +131,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_inflation_test_per_device<half>(device));
CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
index 9546b91..f556d84 100644
--- a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -121,6 +121,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<half>(device));
CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp
index 029653e..84d638e 100644
--- a/unsupported/test/cxx11_tensor_math_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_math_sycl.cpp
@@ -100,6 +100,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index bf001b4..a1545d4 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -381,6 +381,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_morphing_test_per_device<half>(device));
CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_of_float16_sycl.cpp b/unsupported/test/cxx11_tensor_of_float16_sycl.cpp
new file mode 100644
index 0000000..1624acc
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_sycl.cpp
@@ -0,0 +1,406 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023
+// Alejandro Acosta Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+#define EIGEN_SYCL_HALF_SUPPORT
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::SyclDevice;
+
+void test_gpu_numext(const Eigen::SyclDevice &sycl_device) {
+ int num_elem = 101;
+
+ float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+ bool* d_res_half = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
+ bool* d_res_float = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
+
+ Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float(d_float, num_elem);
+ Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
+ Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
+
+ gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(sycl_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+ gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+ Tensor<bool, 1> half_prec(num_elem);
+ Tensor<bool, 1> full_prec(num_elem);
+
+ sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half,num_elem * sizeof(bool));
+ sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float,num_elem * sizeof(bool));
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking numext " << i << std::endl;
+ VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+ }
+}
+
+void test_gpu_conversion(const Eigen::SyclDevice &sycl_device) {
+ int num_elem = 101;
+
+ float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+ Eigen::half* d_half = static_cast<Eigen::half*>(sycl_device.allocate(num_elem * sizeof(Eigen::half)));
+ float* d_conv = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+ d_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+ d_conv, num_elem);
+
+ gpu_float.device(sycl_device) = gpu_float.random();
+ gpu_half.device(sycl_device) = gpu_float.cast<Eigen::half>();
+ gpu_conv.device(sycl_device) = gpu_half.cast<float>();
+
+ Tensor<float, 1> initial(num_elem);
+ Tensor<float, 1> final(num_elem);
+ sycl_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+ sycl_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+
+ for (int i = 0; i < num_elem; ++i) {
+ VERIFY_IS_APPROX(initial(i), final(i));
+ }
+}
+
+void test_gpu_unary(const Eigen::SyclDevice &sycl_device) {
+ int num_elem = 101;
+
+ float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+ float* d_res_half = (float*)sycl_device.allocate(num_elem * sizeof(float));
+ float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(sycl_device) = gpu_float.abs();
+ gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+
+ Tensor<float, 1> half_prec(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+ sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+ sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ sycl_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking unary " << i << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+ }
+}
+
+void test_gpu_elementwise(const Eigen::SyclDevice &sycl_device) {
+ int num_elem = 101;
+
+ float* d_float1 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+ float* d_float2 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+ float* d_res_half = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+ float* d_res_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+
+ Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+ Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+ Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
+ Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
+
+ gpu_float1.device(sycl_device) = gpu_float1.random();
+ gpu_float2.device(sycl_device) = gpu_float2.random();
+ gpu_res_float.device(sycl_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+ gpu_res_half.device(sycl_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+
+ Tensor<float, 1> half_prec(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+
+ sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half,num_elem * sizeof(float));
+ sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float,num_elem * sizeof(float));
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+ VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
+ }
+}
+
+void test_gpu_trancendental(const Eigen::SyclDevice &sycl_device) {
+ int num_elem = 101;
+
+ float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+ float* d_float3 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res1_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res1_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res2_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res2_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res3_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res3_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
+
+ gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+ gpu_float2.device(sycl_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+ gpu_float3.device(sycl_device) = gpu_float3.random();
+ gpu_res1_float.device(sycl_device) = gpu_float1.exp().cast<Eigen::half>();
+ gpu_res2_float.device(sycl_device) = gpu_float2.log().cast<Eigen::half>();
+ gpu_res3_float.device(sycl_device) = gpu_float3.log1p().cast<Eigen::half>();
+ gpu_res4_float.device(sycl_device) = gpu_float3.expm1().cast<Eigen::half>();
+
+ gpu_res1_half.device(sycl_device) = gpu_float1.cast<Eigen::half>();
+ gpu_res1_half.device(sycl_device) = gpu_res1_half.exp();
+
+ gpu_res2_half.device(sycl_device) = gpu_float2.cast<Eigen::half>();
+ gpu_res2_half.device(sycl_device) = gpu_res2_half.log();
+
+ gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(sycl_device) = gpu_res3_half.log1p();
+
+ gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(sycl_device) = gpu_res3_half.expm1();
+
+ Tensor<float, 1> input1(num_elem);
+ Tensor<Eigen::half, 1> half_prec1(num_elem);
+ Tensor<Eigen::half, 1> full_prec1(num_elem);
+ Tensor<float, 1> input2(num_elem);
+ Tensor<Eigen::half, 1> half_prec2(num_elem);
+ Tensor<Eigen::half, 1> full_prec2(num_elem);
+ Tensor<float, 1> input3(num_elem);
+ Tensor<Eigen::half, 1> half_prec3(num_elem);
+ Tensor<Eigen::half, 1> full_prec3(num_elem);
+ sycl_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
+ sycl_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+ sycl_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
+ sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+ sycl_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+ sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+ sycl_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+ sycl_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+ sycl_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
+ sycl_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+ }
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+ if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
+ VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+ else
+ VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+ }
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+ }
+}
+
+// Verifies that a tensor contraction computed in half precision on the SYCL
+// device agrees (within ~1e-2) with the same contraction computed in float
+// and then cast to half.
+void test_gpu_contractions(const Eigen::SyclDevice &sycl_device) {
+  int rows = 23;
+  int cols = 23;
+  int num_elem = rows*cols;
+
+  // NOTE(review): these device buffers are never released; consider calling
+  // sycl_device.deallocate(...) before returning, as other SYCL tests do.
+  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+      d_res_half, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+      d_res_float, rows, cols);
+
+  // Inputs in [-0.5, 0.5) keep the contraction results small enough to be
+  // representable in half precision.
+  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(sycl_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+
+  typedef typename Tensor<float, 2>::DimensionPair DimPair;
+  // NOTE(review): dims is default-constructed, so the contraction pairs
+  // dimension 0 of the first operand with dimension 0 of the second; the CUDA
+  // counterpart of this test uses DimPair(1, 0) -- confirm this is intended.
+  Eigen::array<DimPair, 1> dims;
+  // Reference path: contract in float, then cast the result to half.
+  gpu_res_float.device(sycl_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+  // Path under test: cast the operands to half and contract in half.
+  gpu_res_half.device(sycl_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+
+  Tensor<Eigen::half, 2> half_prec(rows, cols);
+  Tensor<Eigen::half, 2> full_prec(rows, cols);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+      // Only escalate to VERIFY when the absolute error exceeds the half
+      // precision tolerance of 1e-2.
+      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+      }
+    }
+  }
+}
+
+// Reduces a size1 x size2 tensor along dimension `redux` (0 or 1) and checks
+// that summing in half precision matches summing in float then casting to
+// half.
+void test_gpu_reductions(const Eigen::SyclDevice &sycl_device, int size1, int size2, int redux) {
+  std::cout << "Reducing " << size1 << " by " << size2
+            << " tensor along dim " << redux << std::endl;
+
+  int num_elem = size1*size2;
+  // Reducing over dim 1 leaves size1 results; reducing over dim 0 leaves
+  // size2 results.
+  int result_size = (redux == 1 ? size1 : size2);
+
+  // NOTE(review): these device buffers are never deallocated -- verify this
+  // leak is acceptable for the test harness.
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, result_size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, result_size);
+
+  // Inputs in [0, 2): non-negative so the partial sums stay well within half
+  // range for the sizes used by the callers.
+  gpu_float.device(sycl_device) = gpu_float.random() * 2.0f;
+
+  Eigen::array<int, 1> redux_dim = {redux};
+  // Reference: sum in float, cast to half. Under test: cast then sum in half.
+  gpu_res_float.device(sycl_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
+
+  Tensor<Eigen::half, 1> half_prec(result_size);
+  Tensor<Eigen::half, 1> full_prec(result_size);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < result_size; ++i) {
+    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+}
+
+// Driver: exercises the partial-reduction test along both dimensions, with a
+// square size and with both rectangular orientations.
+void test_gpu_reductions(const Eigen::SyclDevice &sycl_device) {
+  test_gpu_reductions(sycl_device, 13, 13, 0);
+  test_gpu_reductions(sycl_device, 13, 13, 1);
+
+  test_gpu_reductions(sycl_device, 35, 36, 0);
+  test_gpu_reductions(sycl_device, 35, 36, 1);
+
+  test_gpu_reductions(sycl_device, 36, 35, 0);
+  test_gpu_reductions(sycl_device, 36, 35, 1);
+}
+
+// Checks full (scalar-result) reductions -- sum and maximum -- computed in
+// half precision against the float computation cast to half.
+void test_gpu_full_reductions(const Eigen::SyclDevice &sycl_device) {
+  int size = 13;
+  int num_elem = size*size;
+
+  // NOTE(review): device buffers are never deallocated -- confirm acceptable.
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size, size);
+  // Rank-0 tensors hold the scalar reduction results.
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+      d_res_half);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+      d_res_float);
+
+  gpu_float.device(sycl_device) = gpu_float.random();
+
+  // Sum: reference reduces in float then casts; test path reduces in half.
+  gpu_res_float.device(sycl_device) = gpu_float.sum().cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum();
+
+  Tensor<Eigen::half, 0> half_prec;
+  Tensor<Eigen::half, 0> full_prec;
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  // Maximum: same comparison, reusing the same device buffers.
+  gpu_res_float.device(sycl_device) = gpu_float.maximum().cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().maximum();
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+}
+
+// Checks that forcing evaluation (.eval()) of a half-precision expression --
+// with and without an intervening no-op broadcast -- yields the same result
+// as the plain float computation.
+void test_gpu_forced_evals(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  // NOTE(review): device buffers are never deallocated -- confirm acceptable.
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+      d_res_half1, num_elem);
+  // presumably Unaligned here is deliberate, to exercise the unaligned
+  // evaluation path for the broadcast+eval expression -- confirm.
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+      d_res_half2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  // Broadcast by a factor of 1: shape-preserving, but still inserts a
+  // broadcast node in front of the forced evaluation.
+  Eigen::array<int, 1> no_bcast;
+  no_bcast[0] = 1;
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.abs();
+  // Forced eval of the half-precision abs, with and without the broadcast.
+  gpu_res_half1.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half2.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+
+  Tensor<float, 1> half_prec1(num_elem);
+  Tensor<float, 1> half_prec2(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+  }
+}
+
+// Test entry point: runs every fp16 sub-test on each supported SYCL device.
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_sycl)
+{
+  for (const auto& s : Eigen::get_sycl_supported_devices()) {
+    QueueInterface queueInterface(s);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+
+    // Subtests are numbered so the harness can shard them.
+    CALL_SUBTEST_1(test_gpu_numext(sycl_device));
+    CALL_SUBTEST_1(test_gpu_conversion(sycl_device));
+    CALL_SUBTEST_1(test_gpu_unary(sycl_device));
+    CALL_SUBTEST_1(test_gpu_elementwise(sycl_device));
+    CALL_SUBTEST_1(test_gpu_trancendental(sycl_device));
+    CALL_SUBTEST_2(test_gpu_contractions(sycl_device));
+    CALL_SUBTEST_3(test_gpu_reductions(sycl_device));
+    CALL_SUBTEST_4(test_gpu_full_reductions(sycl_device));
+    CALL_SUBTEST_5(test_gpu_forced_evals(sycl_device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
index 727a9ff..9b16bb1 100644
--- a/unsupported/test/cxx11_tensor_padding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -152,6 +152,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_padding_test_per_device<half>(device));
CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
index 7f92bec..1216ad2 100644
--- a/unsupported/test/cxx11_tensor_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -244,6 +244,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_tensor_patch_test_per_device<half>(device));
CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
index 14a7c48..dcdfdd3 100644
--- a/unsupported/test/cxx11_tensor_random_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -78,6 +78,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_random_test_per_device<half>(device));
CALL_SUBTEST(sycl_random_test_per_device<float>(device));
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST(sycl_random_test_per_device<double>(device));
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
index dd30c23..5ed007f 100644
--- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -248,6 +248,7 @@
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
#endif
- CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+ CALL_SUBTEST_5(sycl_reverse_test_per_device<half>(device));
+ CALL_SUBTEST_6(sycl_reverse_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
index ca4e8b5..70c4116 100644
--- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -112,6 +112,7 @@
}
EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_shuffling_test_per_device<half>(device));
CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
}
}
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
index 8d99a48..dc86dfe 100644
--- a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -217,6 +217,7 @@
EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<half>(device));
CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
}
}