Update Eigen to commit 81044ec13df7608d0d9d86aff2ef9805fc69bed1

CHANGELOG
=========
81044ec13 - Provide macro to explicitly disable alloca
bcce88c99 - Faster emulated half comparisons
ac6955ebc - Remove MSVC warnings in FindCoeff.h
67a898a07 - Fix unprotected SIZE in macro.
cdf6a1f5e - Add OpenBLAS sbgemm.
d228bcdf8 - Fix neon compilation bug
994f3d107 - Fix neon packet math tests, add missing neon intrinsics
cda19a625 - Make `Eigen::Map<const Vector>::operator[]` return correct type

PiperOrigin-RevId: 774907248
Change-Id: Ic572084cfda6bbbe9bb0f5ec94da1ea07671e3e4
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index cff104c..377df57 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -45,10 +45,16 @@
   // - This is the return type of the coeff() method.
   // - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references
   // to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value).
+  // - The DirectAccessBit means exactly that the underlying data of coefficients can be directly accessed as a plain
+  // strided array, which means exactly that the underlying data of coefficients does exist in memory, which means
+  // exactly that the coefficients are const-referenceable, which means exactly that we can have coeff() return a const
+  // reference. For example, Map<const Matrix> has DirectAccessBit but not LvalueBit, so Map<const Matrix>::coeff()
+  // returns a const Scalar& that exists in memory, while coeffRef() is not allowed since it could not provide an
+  // lvalue. Notice that DirectAccessBit and LvalueBit are independent of each other.
   // - The is_arithmetic check is required since "const int", "const double", etc. will cause warnings on some systems
   // while the declaration of "const T", where T is a non arithmetic type does not. Always returning "const Scalar&" is
   // not possible, since the underlying expressions might not offer a valid address the reference could be referring to.
-  typedef std::conditional_t<bool(internal::traits<Derived>::Flags& LvalueBit), const Scalar&,
+  typedef std::conditional_t<bool(internal::traits<Derived>::Flags&(LvalueBit | DirectAccessBit)), const Scalar&,
                              std::conditional_t<internal::is_arithmetic<Scalar>::value, Scalar, const Scalar>>
       CoeffReturnType;
 
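A minimal sketch of what the widened condition enables (illustrative, not part of the patch): Map<const Vector> carries DirectAccessBit but not LvalueBit, so its coeff()/operator[] can now return a const reference into the mapped storage instead of a value, which is what the changelog entry "Make `Eigen::Map<const Vector>::operator[]` return correct type" refers to.

    #include <Eigen/Dense>
    #include <type_traits>

    int main() {
      const float data[3] = {1.f, 2.f, 3.f};
      Eigen::Map<const Eigen::Vector3f> v(data);
      // With DirectAccessBit now honored, CoeffReturnType is `const float&`...
      static_assert(std::is_same<decltype(v[1]), const float&>::value,
                    "coeff() returns a const reference");
      // ...and that reference aliases the mapped storage directly.
      const float& x = v[1];
      return (&x == &data[1]) ? 0 : 1;
    }
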
diff --git a/Eigen/src/Core/FindCoeff.h b/Eigen/src/Core/FindCoeff.h
index 4569256..0102e8a 100644
--- a/Eigen/src/Core/FindCoeff.h
+++ b/Eigen/src/Core/FindCoeff.h
@@ -277,7 +277,7 @@
   using Scalar = typename Derived::Scalar;
   using Packet = typename packet_traits<Scalar>::type;
   static constexpr int Flags = Base::Flags;
-  static constexpr bool IsRowMajor = Flags & RowMajorBit;
+  static constexpr bool IsRowMajor = bool(Flags & RowMajorBit);
   EIGEN_DEVICE_FUNC inline find_coeff_evaluator(const Derived& xpr) : Base(xpr), m_xpr(xpr) {}
 
   EIGEN_DEVICE_FUNC inline Scalar coeffByOuterInner(Index outer, Index inner) const {
@@ -313,7 +313,7 @@
   using Packet = typename Evaluator::Packet;
 
   static constexpr int PacketSize = unpacket_traits<Packet>::size;
-  static constexpr bool Linearize = Flags & LinearAccessBit;
+  static constexpr bool Linearize = bool(Flags & LinearAccessBit);
   static constexpr bool DontVectorize =
       enum_lt_not_dynamic(Linearize ? MaxSizeAtCompileTime : MaxInnerSizeAtCompileTime, PacketSize);
   static constexpr bool Vectorize =
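
For context, a hypothetical reduced case (names are illustrative, not from the patch) of the MSVC diagnostic the explicit bool(...) casts above address: a bitmask AND yields an int, and MSVC warns on the implicit int-to-bool conversion.

    constexpr int SomeFlags = 0x3;
    constexpr int SomeBit = 0x1;
    constexpr bool Warns = SomeFlags & SomeBit;        // MSVC: implicit 'int' to 'bool' conversion
    constexpr bool Clean = bool(SomeFlags & SomeBit);  // explicit cast, no warning
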
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 1b1d326..eb5da53 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -2250,23 +2250,63 @@
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet8h pisinf<Packet8h>(const Packet8h& a) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  return _mm_cmpeq_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pisnan<Packet8h>(const Packet8h& a) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  return _mm_cmpgt_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
+}
+
+// convert the sign-magnitude representation to two's complement
+EIGEN_STRONG_INLINE __m128i pmaptosigned(const __m128i& a) {
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  // if 'a' has the sign bit set, clear the sign bit and negate the result as if it were an integer
+  return _mm_sign_epi16(_mm_and_si128(a, _mm_set1_epi16(kAbsMask)), a);
+}
+
+// return true if neither `a` nor `b` is NaN
+EIGEN_STRONG_INLINE Packet8h pisordered(const Packet8h& a, const Packet8h& b) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  __m128i abs_a = _mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask));
+  __m128i abs_b = _mm_and_si128(b.m_val, _mm_set1_epi16(kAbsMask));
+  // check if both `abs_a <= kInf` and `abs_b <= kInf` by checking if max(abs_a, abs_b) <= kInf
+  // SSE has no `less than or equal` instruction for integers, but comparing against kInf + 1 accomplishes the same goal
+  return _mm_cmplt_epi16(_mm_max_epu16(abs_a, abs_b), _mm_set1_epi16(kInf + 1));
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
-  return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isEqual = _mm_cmpeq_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_and_si128(isOrdered, isEqual);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
-  return Pack16To8(pcmp_le(half2float(a), half2float(b)));
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isGreater = _mm_cmpgt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_andnot_si128(isGreater, isOrdered);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
-  return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_and_si128(isOrdered, isLess);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
-  return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
+  __m128i isUnordered = por(pisnan(a), pisnan(b));
+  __m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_or_si128(isUnordered, isLess);
 }
 
 template <>
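
The kernels above compare fp16 lanes by their bit patterns: pmaptosigned turns the sign-magnitude encoding into two's complement with _mm_sign_epi16, and pisordered/pisnan mask out lanes involving NaN. A scalar reference model of the intended lane-wise semantics (a sketch for illustration, not part of the patch):

    #include <cstdint>

    // sign-magnitude -> two's complement (scalar analogue of pmaptosigned)
    int16_t map_to_signed(uint16_t a) { return (a >> 15) ? -(a & 0x7fff) : int16_t(a); }

    // neither operand is NaN (scalar analogue of pisordered)
    bool is_ordered(uint16_t a, uint16_t b) {
      return ((a & 0x7fff) <= 0x7c00) && ((b & 0x7fff) <= 0x7c00);
    }

    bool ref_lt(uint16_t a, uint16_t b) {  // lane-wise meaning of pcmp_lt
      return is_ordered(a, b) && (map_to_signed(a) < map_to_signed(b));
    }

    bool ref_lt_or_nan(uint16_t a, uint16_t b) {  // lane-wise meaning of pcmp_lt_or_nan
      return !is_ordered(a, b) || (map_to_signed(a) < map_to_signed(b));
    }
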
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index ba70d5f..c073fe8 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -497,16 +497,56 @@
   a = half(float(a) / float(b));
   return a;
 }
+
+// Non-negative floating point numbers have a monotonic mapping to non-negative integers.
+// This property allows floating point numbers to be reinterpreted as integers for comparisons, which is useful if there
+// is no native floating point comparison operator. Floating point signedness is handled by the sign-magnitude
+// representation, whereas integers typically use two's complement. Converting the bit pattern from sign-magnitude to
+// two's complement allows the transformed bit patterns to be compared as signed integers. All edge cases (+/-0 and
+// +/-infinity) are handled automatically, except NaN.
+//
+// fp16 uses 1 sign bit, 5 exponent bits, and 10 mantissa bits. A bit pattern encodes NaN when all five exponent
+// bits are set and at least one mantissa bit is set. The sign bit is irrelevant for determining NaN. To check for
+// NaN, clear the sign bit and check if the integral representation is greater than 0b0111110000000000 (0x7c00). To
+// test for non-NaN, clear the sign bit and check if the integral representation is less than or equal to 0x7c00.
+
+// convert sign-magnitude representation to two's complement
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int16_t mapToSigned(uint16_t a) {
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  // If the sign bit is set, clear the sign bit and return the (integer) negation. Otherwise, return the input.
+  return (a >> 15) ? -(a & kAbsMask) : a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool isOrdered(const half& a, const half& b) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  return numext::maxi(a.x & kAbsMask, b.x & kAbsMask) <= kInf;
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) {
-  return numext::equal_strict(float(a), float(b));
+  bool result = mapToSigned(a.x) == mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) {
-  return numext::not_equal_strict(float(a), float(b));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !(a == b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) < mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return float(a) < float(b); }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return float(a) <= float(b); }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return float(a) > float(b); }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return float(a) >= float(b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) <= mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) > mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) >= mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
 
 #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 #pragma pop_macro("EIGEN_DEVICE_FUNC")
@@ -706,7 +746,11 @@
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
-  return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) < 0x7c00;
+#else
+  return (a.x & 0x7fff) < 0x7c00;
+#endif
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
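
The scalar operators above use the same bit-pattern trick, and every comparison involving NaN must come out false except operator!=. A quick illustrative self-check, assuming only the public Eigen::half API (not taken from the patch):

    #include <Eigen/Core>
    #include <cassert>

    int main() {
      using Eigen::half;
      const half nan = Eigen::NumTraits<half>::quiet_NaN();
      const half inf = Eigen::NumTraits<half>::infinity();
      const half one(1.0f);
      assert(!(nan == nan) && (nan != nan));
      assert(!(one < nan) && !(one <= nan) && !(one > nan) && !(one >= nan));
      assert(one < inf && one > -inf);
      assert(half(-0.0f) == half(0.0f));  // both signed zeros map to integer 0
      assert((Eigen::numext::isfinite)(one) && !(Eigen::numext::isfinite)(inf));
      return 0;
    }
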
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 6d7f038..9364cff 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -1287,6 +1287,14 @@
 EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
   return vfma_f32(c, a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vfmsq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vfms_f32(c, a, b);
+}
 #else
 template <>
 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
@@ -1296,7 +1304,31 @@
 EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
   return vmla_f32(c, a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vmlsq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vmls_f32(c, a, b);
+}
 #endif
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return pnegate(pmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return pnegate(pmadd(a, b, c));
+}
 
 // No FMA instruction for int, so use MLA unconditionally.
 template <>
@@ -5242,13 +5274,28 @@
 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
   return vfmaq_f64(c, a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vfmsq_f64(c, a, b);
+}
 #else
 template <>
 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
   return vmlaq_f64(c, a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vmlsq_f64(c, a, b);
+}
 #endif
-
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return pnegate(pmadd(a, b, c));
+}
 template <>
 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
   return vminq_f64(a, b);
@@ -5657,18 +5704,33 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
-  return vfmaq_f16(pnegate(c), a, b);
+EIGEN_STRONG_INLINE Packet8hf pnmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return vfmsq_f16(c, a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
-  return vfma_f16(c, pnegate(a), b);
+  return vfms_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return pnegate(pmadd(a, b, c));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
-  return vfma_f16(pnegate(c), pnegate(a), b);
+  return pnegate(pmadd(a, b, c));
 }
 
 template <>
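
Under Eigen's conventions pmadd(a,b,c) = a*b + c, pmsub(a,b,c) = a*b - c, pnmadd(a,b,c) = -(a*b) + c, and pnmsub(a,b,c) = -(a*b) - c, so the specializations above only need the fused c - a*b (vfms/vfmsq, or vmls/vmlsq without FMA) plus negation: pmsub = pnegate(pnmadd) and pnmsub = pnegate(pmadd). A scalar sanity check of the identities (illustrative only):

    #include <cassert>

    int main() {
      const float a = 2.f, b = 3.f, c = 5.f;
      const float pmadd = a * b + c;   // fused multiply-add
      const float pnmadd = c - a * b;  // what vfmsq_f32(c, a, b) computes
      assert(a * b - c == -pnmadd);    // pmsub  == pnegate(pnmadd)
      assert(-(a * b) - c == -pmadd);  // pnmsub == pnegate(pmadd)
      return 0;
    }
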
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
index 56743da..913beb6 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -55,7 +55,7 @@
                                        ConjugateRhs, ColMajor, 1> {                                                 \
     typedef gebp_traits<EIGTYPE, EIGTYPE> Traits;                                                                   \
                                                                                                                     \
-    static void run(Index rows, Index cols, Index depth, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs, \
+    static void run(Index rows, Index cols, Index depth, const EIGTYPE* lhs_, Index lhsStride, const EIGTYPE* rhs_, \
                     Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha,                   \
                     level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, GemmParallelInfo<Index>* /*info = 0*/) {       \
       using std::conj;                                                                                              \
@@ -84,20 +84,20 @@
                                                                                                                     \
       /* Set a, b, c */                                                                                             \
       if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) {                                                        \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs, m, k, OuterStride<>(lhsStride));                 \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride));                 \
         a_tmp = lhs.conjugate();                                                                                    \
         a = a_tmp.data();                                                                                           \
         lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                        \
       } else                                                                                                        \
-        a = _lhs;                                                                                                   \
+        a = lhs_;                                                                                                   \
                                                                                                                     \
       if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) {                                                        \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs, k, n, OuterStride<>(rhsStride));                 \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride));                 \
         b_tmp = rhs.conjugate();                                                                                    \
         b = b_tmp.data();                                                                                           \
         ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                        \
       } else                                                                                                        \
-        b = _rhs;                                                                                                   \
+        b = rhs_;                                                                                                   \
                                                                                                                     \
       BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda,   \
                (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc);           \
@@ -116,6 +116,88 @@
 GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
 #endif
 
+// If OpenBLAS was built with BUILD_BFLOAT16=1,
+// use its sbgemm kernel for bfloat16.
+#if EIGEN_USE_OPENBLAS_BFLOAT16
+
+extern "C" {
+// OpenBLAS prototype.
+void sbgemm_(const char* trans_a, const char* trans_b, const int* M, const int* N, const int* K, const float* alpha,
+             const Eigen::bfloat16* A, const int* lda, const Eigen::bfloat16* B, const int* ldb, const float* beta,
+             float* C, const int* ldc);
+}  // extern "C"
+
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index, Eigen::bfloat16, LhsStorageOrder, ConjugateLhs, Eigen::bfloat16,
+                                     RhsStorageOrder, ConjugateRhs, ColMajor, 1> {
+  typedef gebp_traits<Eigen::bfloat16, Eigen::bfloat16> Traits;
+
+  static void run(Index rows, Index cols, Index depth, const Eigen::bfloat16* lhs_, Index lhsStride,
+                  const Eigen::bfloat16* rhs_, Index rhsStride, Eigen::bfloat16* res, Index resIncr, Index resStride,
+                  Eigen::bfloat16 alpha, level3_blocking<Eigen::bfloat16, Eigen::bfloat16>& /*blocking*/,
+                  GemmParallelInfo<Index>* /*info = 0*/) {
+    using std::conj;
+    if (rows == 0 || cols == 0 || depth == 0) return;
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr);
+    eigen_assert(resIncr == 1);
+    char transa, transb;
+    BlasIndex m, n, k, lda, ldb, ldc;
+    const Eigen::bfloat16 *a, *b;
+
+    float falpha = static_cast<float>(alpha);
+    float fbeta = float(1.0);
+
+    using MatrixXbf16 = Matrix<Eigen::bfloat16, Dynamic, Dynamic>;
+    MatrixXbf16 a_tmp, b_tmp;
+    MatrixXf r_tmp;
+
+    /* Set transpose options */
+    transa = (LhsStorageOrder == RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N';
+    transb = (RhsStorageOrder == RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N';
+
+    /* Set m, n, k */
+    m = convert_index<BlasIndex>(rows);
+    n = convert_index<BlasIndex>(cols);
+    k = convert_index<BlasIndex>(depth);
+
+    /* Set lda, ldb, ldc */
+    lda = convert_index<BlasIndex>(lhsStride);
+    ldb = convert_index<BlasIndex>(rhsStride);
+    ldc = convert_index<BlasIndex>(m);
+
+    /* Set a, b, c */
+    if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) {
+      Map<const MatrixXbf16, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride));
+      a_tmp = lhs.conjugate();
+      a = a_tmp.data();
+      lda = convert_index<BlasIndex>(a_tmp.outerStride());
+    } else {
+      a = lhs_;
+    }
+
+    if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) {
+      Map<const MatrixXbf16, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride));
+      b_tmp = rhs.conjugate();
+      b = b_tmp.data();
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride());
+    } else {
+      b = rhs_;
+    }
+
+    // Accumulate into a float temporary; seed it with the current output so that fbeta == 1 accumulates correctly.
+    Map<MatrixXbf16, 0, OuterStride<> > result(res, m, n, OuterStride<>(resStride));
+    r_tmp = result.cast<float>();
+
+    sbgemm_(&transa, &transb, &m, &n, &k, (const float*)&numext::real_ref(falpha), a, &lda, b, &ldb,
+            (const float*)&numext::real_ref(fbeta), r_tmp.data(), &ldc);
+
+    // Cast the accumulated result back to the bfloat16 output.
+    result = r_tmp.cast<Eigen::bfloat16>();
+  }
+};
+
+#endif  // EIGEN_USE_OPENBLAS_BFLOAT16
+
 }  // namespace internal
 
 }  // end namespace Eigen
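
A hedged usage sketch (not from the patch): with an OpenBLAS built with BUILD_BFLOAT16=1, compiling with -DEIGEN_USE_BLAS -DEIGEN_USE_OPENBLAS_BFLOAT16=1 and linking against OpenBLAS should route the product below through the sbgemm_ specialization, with float accumulation and a bfloat16 cast on output.

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      using MatrixXbf16 = Eigen::Matrix<Eigen::bfloat16, Eigen::Dynamic, Eigen::Dynamic>;
      const MatrixXbf16 A = MatrixXbf16::Random(64, 64);
      const MatrixXbf16 B = MatrixXbf16::Random(64, 64);
      const MatrixXbf16 C = A * B;  // dispatched to the general_matrix_matrix_product above
      std::cout << C(0, 0) << "\n";
      return 0;
    }
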
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 89b2fff..44056b3 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -762,7 +762,7 @@
 * This is accomplished through alloca if the latter is supported and if the required number of bytes
  * is below EIGEN_STACK_ALLOCATION_LIMIT.
  */
-#ifdef EIGEN_ALLOCA
+#if defined(EIGEN_ALLOCA) && !defined(EIGEN_NO_ALLOCA)
 
 #if EIGEN_DEFAULT_ALIGN_BYTES > 0
 // We always manually re-align the result of EIGEN_ALLOCA.
@@ -785,14 +785,14 @@
 #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
 #endif
 
-#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                     \
-  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                             \
-  TYPE* NAME = (BUFFER) != 0 ? (BUFFER)                                                                             \
-                             : reinterpret_cast<TYPE*>((sizeof(TYPE) * SIZE <= EIGEN_STACK_ALLOCATION_LIMIT)        \
-                                                           ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * SIZE)              \
-                                                           : Eigen::internal::aligned_malloc(sizeof(TYPE) * SIZE)); \
-  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                    \
-      (BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * SIZE > EIGEN_STACK_ALLOCATION_LIMIT)
+#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                       \
+  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                               \
+  TYPE* NAME = (BUFFER) != 0 ? (BUFFER)                                                                               \
+                             : reinterpret_cast<TYPE*>((sizeof(TYPE) * (SIZE) <= EIGEN_STACK_ALLOCATION_LIMIT)        \
+                                                           ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * (SIZE))              \
+                                                           : Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
+  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                      \
+      (BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * (SIZE) > EIGEN_STACK_ALLOCATION_LIMIT)
 
 #define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME)                                        \
   Eigen::internal::local_nested_eval_wrapper<XPR_T, N> EIGEN_CAT(NAME, _wrapper)(                \
@@ -805,10 +805,11 @@
 
 #else
 
-#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                        \
-  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                                \
-  TYPE* NAME = (BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * SIZE)); \
-  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                       \
+#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                 \
+  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                         \
+  TYPE* NAME =                                                                                                  \
+      (BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
+  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                \
       (BUFFER) == 0 ? NAME : 0, SIZE, true)
 
 #define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME) \
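
Two things changed here. First, EIGEN_NO_ALLOCA gives users an explicit opt-out: defining it forces the heap path even when alloca is available. Second, every use of SIZE inside the macro body is now parenthesized, because the argument may be an arbitrary expression. A toy macro (hypothetical, for illustration) shows the precedence bug the parentheses prevent:

    #include <cstdio>

    #define BYTES_UNPROTECTED(TYPE, SIZE) (sizeof(TYPE) * SIZE)
    #define BYTES_PROTECTED(TYPE, SIZE) (sizeof(TYPE) * (SIZE))

    int main() {
      const int n = 3;
      // Expands to sizeof(double) * n + 1 == 25, not the intended 4 * 8 == 32.
      std::printf("%zu vs %zu\n", BYTES_UNPROTECTED(double, n + 1), BYTES_PROTECTED(double, n + 1));
      return 0;
    }
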
diff --git a/test/half_float.cpp b/test/half_float.cpp
index 90ac825..aabe896 100644
--- a/test/half_float.cpp
+++ b/test/half_float.cpp
@@ -72,17 +72,16 @@
   // NaNs and infinities.
   VERIFY(!(numext::isinf)(float(half(65504.0f))));  // Largest finite number.
   VERIFY(!(numext::isnan)(float(half(0.0f))));
+  VERIFY((numext::isfinite)(float(half(65504.0f))));
+  VERIFY((numext::isfinite)(float(half(0.0f))));
   VERIFY((numext::isinf)(float(half(__half_raw(0xfc00)))));
   VERIFY((numext::isnan)(float(half(__half_raw(0xfc01)))));
   VERIFY((numext::isinf)(float(half(__half_raw(0x7c00)))));
   VERIFY((numext::isnan)(float(half(__half_raw(0x7c01)))));
 
-#if !EIGEN_COMP_MSVC
-  // Visual Studio errors out on divisions by 0
-  VERIFY((numext::isnan)(float(half(0.0 / 0.0))));
-  VERIFY((numext::isinf)(float(half(1.0 / 0.0))));
-  VERIFY((numext::isinf)(float(half(-1.0 / 0.0))));
-#endif
+  VERIFY((numext::isnan)(float(NumTraits<half>::quiet_NaN())));
+  VERIFY((numext::isinf)(float(NumTraits<half>::infinity())));
+  VERIFY((numext::isinf)(float(-NumTraits<half>::infinity())));
 
   // Exactly the same checks as above, just directly on the half representation.
   VERIFY(!(numext::isinf)(half(__half_raw(0x7bff))));
@@ -92,12 +91,9 @@
   VERIFY((numext::isinf)(half(__half_raw(0x7c00))));
   VERIFY((numext::isnan)(half(__half_raw(0x7c01))));
 
-#if !EIGEN_COMP_MSVC
-  // Visual Studio errors out on divisions by 0
-  VERIFY((numext::isnan)(half(0.0 / 0.0)));
-  VERIFY((numext::isinf)(half(1.0 / 0.0)));
-  VERIFY((numext::isinf)(half(-1.0 / 0.0)));
-#endif
+  VERIFY((numext::isnan)(NumTraits<half>::quiet_NaN()));
+  VERIFY((numext::isinf)(NumTraits<half>::infinity()));
+  VERIFY((numext::isinf)(-NumTraits<half>::infinity()));
 
   // Conversion to bool
   VERIFY(!static_cast<bool>(half(0.0)));
@@ -204,19 +200,25 @@
   VERIFY(half(1.0f) != half(2.0f));
 
   // Comparisons with NaNs and infinities.
-#if !EIGEN_COMP_MSVC
-  // Visual Studio errors out on divisions by 0
-  VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0)));
-  VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0));
+  VERIFY(!(NumTraits<half>::quiet_NaN() == NumTraits<half>::quiet_NaN()));
+  VERIFY(NumTraits<half>::quiet_NaN() != NumTraits<half>::quiet_NaN());
 
-  VERIFY(!(half(1.0) == half(0.0 / 0.0)));
-  VERIFY(!(half(1.0) < half(0.0 / 0.0)));
-  VERIFY(!(half(1.0) > half(0.0 / 0.0)));
-  VERIFY(half(1.0) != half(0.0 / 0.0));
+  VERIFY(!(internal::random<half>() == NumTraits<half>::quiet_NaN()));
+  VERIFY(!(internal::random<half>() < NumTraits<half>::quiet_NaN()));
+  VERIFY(!(internal::random<half>() > NumTraits<half>::quiet_NaN()));
+  VERIFY(!(internal::random<half>() <= NumTraits<half>::quiet_NaN()));
+  VERIFY(!(internal::random<half>() >= NumTraits<half>::quiet_NaN()));
+  VERIFY(internal::random<half>() != NumTraits<half>::quiet_NaN());
 
-  VERIFY(half(1.0) < half(1.0 / 0.0));
-  VERIFY(half(1.0) > half(-1.0 / 0.0));
-#endif
+  VERIFY(!(NumTraits<half>::quiet_NaN() == internal::random<half>()));
+  VERIFY(!(NumTraits<half>::quiet_NaN() < internal::random<half>()));
+  VERIFY(!(NumTraits<half>::quiet_NaN() > internal::random<half>()));
+  VERIFY(!(NumTraits<half>::quiet_NaN() <= internal::random<half>()));
+  VERIFY(!(NumTraits<half>::quiet_NaN() >= internal::random<half>()));
+  VERIFY(NumTraits<half>::quiet_NaN() != internal::random<half>());
+
+  VERIFY(internal::random<half>() < NumTraits<half>::infinity());
+  VERIFY(internal::random<half>() > -NumTraits<half>::infinity());
 }
 
 void test_basic_functions() {