Update Eigen to commit:f19a6803c87cb8486c3147cd180edb02c9ff9a68

CHANGELOG
=========
f19a6803c - Refactor special case handling in pow(x,y) and revert to repeated squaring for <float,int>
5064cb7d5 - Add test for using pcast on scalars.
1ea61a5d2 - Improve pow(x,y): 25% speedup, increase accuracy for integer exponents.
8ad4344ca - optimize setConstant, setZero

PiperOrigin-RevId: 700731159
Change-Id: I762ac07c755af7d2e5d030e3a09291fedc604d1a
diff --git a/Eigen/Core b/Eigen/Core
index e6dbe3a..f87f3d2 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -318,6 +318,7 @@
 #include "src/Core/PlainObjectBase.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Array.h"
+#include "src/Core/Fill.h"
 #include "src/Core/CwiseTernaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index f40b2f4..e70d555 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -737,20 +737,6 @@
   dense_assignment_loop<Kernel>::run(kernel);
 }
 
-// Specialization for filling the destination with a constant value.
-#if !EIGEN_COMP_MSVC
-#ifndef EIGEN_GPU_COMPILE_PHASE
-template <typename DstXprType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(
-    DstXprType& dst,
-    const Eigen::CwiseNullaryOp<Eigen::internal::scalar_constant_op<typename DstXprType::Scalar>, DstXprType>& src,
-    const internal::assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>& func) {
-  resize_if_allowed(dst, src, func);
-  std::fill_n(dst.data(), dst.size(), src.functor()());
-}
-#endif
-#endif
-
 template <typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) {
   call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>());
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 39c33cf..86ddd5e 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -343,7 +343,8 @@
  */
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val) {
-  return derived() = Constant(rows(), cols(), val);
+  internal::eigen_fill_impl<Derived>::run(derived(), val);
+  return derived();
 }
 
 /** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val.
@@ -547,7 +548,8 @@
  */
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero() {
-  return setConstant(Scalar(0));
+  internal::eigen_zero_impl<Derived>::run(derived());
+  return derived();
 }
 
 /** Resizes to the given \a size, and sets all coefficients in this expression to zero.
@@ -562,7 +564,7 @@
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero(Index newSize) {
   resize(newSize);
-  return setConstant(Scalar(0));
+  return setZero();
 }
 
 /** Resizes to the given size, and sets all coefficients in this expression to zero.
@@ -578,7 +580,7 @@
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero(Index rows, Index cols) {
   resize(rows, cols);
-  return setConstant(Scalar(0));
+  return setZero();
 }
 
 /** Resizes to the given size, changing only the number of columns, and sets all
diff --git a/Eigen/src/Core/Fill.h b/Eigen/src/Core/Fill.h
new file mode 100644
index 0000000..048b51d
--- /dev/null
+++ b/Eigen/src/Core/Fill.h
@@ -0,0 +1,94 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Charles Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FILL_H
+#define EIGEN_FILL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Xpr>
+struct eigen_fill_helper : std::false_type {};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct eigen_fill_helper<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::true_type {};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct eigen_fill_helper<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::true_type {};
+
+template <typename Xpr, int BlockRows, int BlockCols>
+struct eigen_fill_helper<Block<Xpr, BlockRows, BlockCols, /*InnerPanel*/ true>> : eigen_fill_helper<Xpr> {};
+
+template <typename Xpr, int BlockRows, int BlockCols>
+struct eigen_fill_helper<Block<Xpr, BlockRows, BlockCols, /*InnerPanel*/ false>>
+    : std::integral_constant<bool, eigen_fill_helper<Xpr>::value &&
+                                       (Xpr::IsRowMajor ? (BlockRows == 1) : (BlockCols == 1))> {};
+
+template <typename Xpr, int Options, typename StrideType>
+struct eigen_fill_helper<Map<Xpr, Options, StrideType>>
+    : std::integral_constant<bool, eigen_fill_helper<Xpr>::value &&
+                                       (evaluator<Map<Xpr, Options, StrideType>>::Flags & LinearAccessBit)> {};
+
+template <typename Xpr, bool use_fill = eigen_fill_helper<Xpr>::value>
+struct eigen_fill_impl {
+  using Scalar = typename Xpr::Scalar;
+  using Func = scalar_constant_op<Scalar>;
+  using PlainObject = typename Xpr::PlainObject;
+  using Constant = CwiseNullaryOp<Func, PlainObject>;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const Scalar& val) {
+    dst = Constant(dst.rows(), dst.cols(), Func(val));
+  }
+};
+
+#if !EIGEN_COMP_MSVC
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template <typename Xpr>
+struct eigen_fill_impl<Xpr, /*use_fill*/ true> {
+  using Scalar = typename Xpr::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const Scalar& val) {
+    EIGEN_USING_STD(fill_n);
+    fill_n(dst.data(), dst.size(), val);
+  }
+};
+#endif
+#endif
+
+template <typename Xpr>
+struct eigen_memset_helper {
+  static constexpr bool value = std::is_trivial<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value;
+};
+
+template <typename Xpr, bool use_memset = eigen_memset_helper<Xpr>::value>
+struct eigen_zero_impl {
+  using Scalar = typename Xpr::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst) { eigen_fill_impl<Xpr, false>::run(dst, Scalar(0)); }
+};
+
+template <typename Xpr>
+struct eigen_zero_impl<Xpr, /*use_memset*/ true> {
+  using Scalar = typename Xpr::Scalar;
+  static constexpr size_t max_bytes = (std::numeric_limits<std::ptrdiff_t>::max)();
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst) {
+    const size_t num_bytes = dst.size() * sizeof(Scalar);
+#ifndef EIGEN_NO_DEBUG
+    if (num_bytes > max_bytes) throw_std_bad_alloc();
+#endif
+    EIGEN_USING_STD(memset);
+    memset(dst.data(), 0, num_bytes);
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_FILL_H
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index c7a9846..26a4634 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -194,7 +194,7 @@
   static constexpr int Size = unpacket_traits<Packet>::size;
   using DefaultPacket = typename packet_traits<Scalar>::type;
   static constexpr int DefaultSize = unpacket_traits<DefaultPacket>::size;
-  static constexpr bool value = Size < DefaultSize;
+  static constexpr bool value = Size != 1 && Size < DefaultSize;
 };
 
 template <typename Src, typename Tgt>
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 652892e..174eb57 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -1837,81 +1837,53 @@
 };
 
 // This specialization uses a more accurate algorithm to compute log2(x) for
-// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10.
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.56508e-10.
 // This additional accuracy is needed to counter the error-magnification
 // inherent in multiplying by a potentially large exponent in pow(x,y).
-// The minimax polynomial used was calculated using the Sollya tool.
-// See sollya.org.
+// The minimax polynomial used was calculated using the Rminimax tool,
+// see https://gitlab.inria.fr/sfilip/rminimax.
+// Command line:
+//   $ ratapprox --function="log2(1+x)/x"  --dom='[-0.2929,0.41422]'
+//   --type=[10,0]
+//       --numF="[D,D,SG]" --denF="[SG]" --log --dispCoeff="dec"
+//
+// The resulting implementation of pow(x,y) is accurate to 3 ulps.
 template <>
 struct accurate_log2<float> {
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
-    // The function log(1+x)/x is approximated in the interval
-    // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
-    //  Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
-    // where the degree 6 polynomial P(x) is evaluated in single precision,
-    // while the remaining 4 terms of Q(x), as well as the final multiplication by x
-    // to reconstruct log(1+x) are evaluated in extra precision using
-    // double word arithmetic. C0 through C3 are extra precise constants
-    // stored as double words.
-    //
-    // The polynomial coefficients were calculated using Sollya commands:
-    // > n = 10;
-    // > f = log2(1+x)/x;
-    // > interval = [sqrt(0.5)-1;sqrt(2)-1];
-    // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
+    // Split the two lowest order constant coefficient into double-word representation.
+    constexpr double kC0 = 1.442695041742110273474963832995854318141937255859375e+00;
+    constexpr float kC0_hi = static_cast<float>(kC0);
+    constexpr float kC0_lo = static_cast<float>(kC0 - static_cast<double>(kC0_hi));
+    const Packet c0_hi = pset1<Packet>(kC0_hi);
+    const Packet c0_lo = pset1<Packet>(kC0_lo);
 
-    const Packet p6 = pset1<Packet>(9.703654795885e-2f);
-    const Packet p5 = pset1<Packet>(-0.1690667718648f);
-    const Packet p4 = pset1<Packet>(0.1720575392246f);
-    const Packet p3 = pset1<Packet>(-0.1789081543684f);
-    const Packet p2 = pset1<Packet>(0.2050433009862f);
-    const Packet p1 = pset1<Packet>(-0.2404672354459f);
-    const Packet p0 = pset1<Packet>(0.2885761857032f);
+    constexpr double kC1 = -7.2134751588268664068692714863573201000690460205078125e-01;
+    constexpr float kC1_hi = static_cast<float>(kC1);
+    constexpr float kC1_lo = static_cast<float>(kC1 - static_cast<double>(kC1_hi));
+    const Packet c1_hi = pset1<Packet>(kC1_hi);
+    const Packet c1_lo = pset1<Packet>(kC1_lo);
 
-    const Packet C3_hi = pset1<Packet>(-0.360674142838f);
-    const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
-    const Packet C2_hi = pset1<Packet>(0.480897903442f);
-    const Packet C2_lo = pset1<Packet>(-1.44861207474e-08f);
-    const Packet C1_hi = pset1<Packet>(-0.721347510815f);
-    const Packet C1_lo = pset1<Packet>(-4.84483164698e-09f);
-    const Packet C0_hi = pset1<Packet>(1.44269502163f);
-    const Packet C0_lo = pset1<Packet>(2.01711713999e-08f);
+    constexpr float c[] = {
+        9.7010828554630279541015625e-02,  -1.6896486282348632812500000e-01, 1.7200836539268493652343750e-01,
+        -1.7892272770404815673828125e-01, 2.0505344867706298828125000e-01,  -2.4046677350997924804687500e-01,
+        2.8857553005218505859375000e-01,  -3.6067414283752441406250000e-01, 4.8089790344238281250000000e-01};
+
+    // Evaluate the higher order terms in the polynomial using
+    // standard arithmetic.
     const Packet one = pset1<Packet>(1.0f);
-
     const Packet x = psub(z, one);
-    // Evaluate P(x) in working precision.
-    // We evaluate it in multiple parts to improve instruction level
-    // parallelism.
-    Packet x2 = pmul(x, x);
-    Packet p_even = pmadd(p6, x2, p4);
-    p_even = pmadd(p_even, x2, p2);
-    p_even = pmadd(p_even, x2, p0);
-    Packet p_odd = pmadd(p5, x2, p3);
-    p_odd = pmadd(p_odd, x2, p1);
-    Packet p = pmadd(p_odd, x, p_even);
-
-    // Now evaluate the low-order tems of Q(x) in double word precision.
-    // In the following, due to the alternating signs and the fact that
-    // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use
-    // fast_twosum instead of the slower twosum.
-    Packet q_hi, q_lo;
-    Packet t_hi, t_lo;
-    // C3 + x * p(x)
-    twoprod(p, x, t_hi, t_lo);
-    fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo);
-    // C2 + x * p(x)
-    twoprod(q_hi, q_lo, x, t_hi, t_lo);
-    fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo);
-    // C1 + x * p(x)
-    twoprod(q_hi, q_lo, x, t_hi, t_lo);
-    fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo);
-    // C0 + x * p(x)
-    twoprod(q_hi, q_lo, x, t_hi, t_lo);
-    fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo);
-
-    // log(z) ~= x * Q(x)
-    twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo);
+    Packet p = ppolevl<Packet, 8>::run(x, c);
+    // Evaluate the final two step in Horner's rule using double-word
+    // arithmetic.
+    Packet p_hi, p_lo;
+    twoprod(x, p, p_hi, p_lo);
+    fast_twosum(c1_hi, c1_lo, p_hi, p_lo, p_hi, p_lo);
+    twoprod(p_hi, p_lo, x, p_hi, p_lo);
+    fast_twosum(c0_hi, c0_lo, p_hi, p_lo, p_hi, p_lo);
+    // Multiply by x to recover log2(z).
+    twoprod(p_hi, p_lo, x, log2_x_hi, log2_x_lo);
   }
 };
 
@@ -2006,16 +1978,6 @@
   }
 };
 
-// This function accurately computes exp2(x) for x in [-0.5:0.5], which is
-// needed in pow(x,y).
-template <typename Scalar>
-struct fast_accurate_exp2 {
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
-    return generic_exp2(x);
-  }
-};
-
 // This function implements the non-trivial case of pow(x,y) where x is
 // positive and y is (possibly) non-integer.
 // Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
@@ -2081,69 +2043,91 @@
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) {
   typedef typename unpacket_traits<Packet>::type Scalar;
 
-  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
-  const Packet cst_neg_inf = pset1<Packet>(-NumTraits<Scalar>::infinity());
+  const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
   const Packet cst_zero = pset1<Packet>(Scalar(0));
   const Packet cst_one = pset1<Packet>(Scalar(1));
   const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
 
-  const Packet abs_x = pabs(x);
+  const Packet x_abs = pabs(x);
+  Packet pow = generic_pow_impl(x_abs, y);
+
+  // In the following we enforce the special case handling prescribed in
+  // https://en.cppreference.com/w/cpp/numeric/math/pow.
+
   // Predicates for sign and magnitude of x.
-  const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero);
+  const Packet x_is_negative = pcmp_lt(x, cst_zero);
+  const Packet x_is_zero = pcmp_eq(x, cst_zero);
+  const Packet x_is_one = pcmp_eq(x, cst_one);
   const Packet x_has_signbit = psignbit(x);
-  const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
-  const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero);
-  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
-  const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one);
-  const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
-  const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
-  const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg);
-  const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg);
-  const Packet x_is_nan = pisnan(x);
+  const Packet x_abs_gt_one = pcmp_lt(cst_one, x_abs);
+  const Packet x_abs_is_inf = pcmp_eq(x_abs, cst_inf);
 
   // Predicates for sign and magnitude of y.
-  const Packet abs_y = pabs(y);
+  const Packet y_abs = pabs(y);
+  const Packet y_abs_is_inf = pcmp_eq(y_abs, cst_inf);
+  const Packet y_is_negative = pcmp_lt(y, cst_zero);
+  const Packet y_is_zero = pcmp_eq(y, cst_zero);
   const Packet y_is_one = pcmp_eq(y, cst_one);
-  const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero);
-  const Packet y_is_neg = pcmp_lt(y, cst_zero);
-  const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg));
-  const Packet y_is_nan = pisnan(y);
-  const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf);
-  EIGEN_CONSTEXPR Scalar huge_exponent =
-      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
-  const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
-
-  // Predicates for whether y is integer and/or even.
-  const Packet y_is_int = pcmp_eq(pfloor(y), y);
+  // Predicates for whether y is integer and odd/even.
+  const Packet y_is_int = pandnot(pcmp_eq(pfloor(y), y), y_abs_is_inf);
   const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
   const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
+  const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
+  // Smallest exponent for which (1 + epsilon) overflows to infinity.
+  EIGEN_CONSTEXPR Scalar huge_exponent =
+      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
+  const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
 
-  // Predicates encoding special cases for the value of pow(x,y)
-  const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf);
-  const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
-  const Packet pow_is_one =
-      por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
-  const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)),
-                                     pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)),
-                                 pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg));
-  const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
-                                    pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)),
-                                pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos));
-  const Packet pow_is_neg_zero = pand(pandnot(y_is_int, y_is_even),
-                                      por(pand(y_is_neg, pand(abs_x_is_inf, x_is_neg)), pand(y_is_pos, x_is_neg_zero)));
-  const Packet inf_val =
-      pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even),
-              cst_neg_inf, cst_pos_inf);
-  // General computation of pow(x,y) for positive x or negative x and integer y.
-  const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
-  const Packet pow_abs = generic_pow_impl(abs_x, y);
-  return pselect(y_is_one, x,
-                 pselect(pow_is_one, cst_one,
-                         pselect(pow_is_nan, cst_nan,
-                                 pselect(pow_is_inf, inf_val,
-                                         pselect(pow_is_neg_zero, pnegate(cst_zero),
-                                                 pselect(pow_is_zero, cst_zero,
-                                                         pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))));
+  // *  pow(base, exp) returns NaN if base is finite and negative
+  //    and exp is finite and non-integer.
+  pow = pselect(pandnot(x_is_negative, y_is_int), cst_nan, pow);
+
+  // * pow(±0, exp), where exp is negative, finite, and is an even integer or
+  // a non-integer, returns +∞
+  // * pow(±0, exp), where exp is positive non-integer or a positive even
+  // integer, returns +0
+  // * pow(+0, exp), where exp is a negative odd integer, returns +∞
+  // * pow(-0, exp), where exp is a negative odd integer, returns -∞
+  // * pow(+0, exp), where exp is a positive odd integer, returns +0
+  // * pow(-0, exp), where exp is a positive odd integer, returns -0
+  // Sign is flipped by the rule below.
+  pow = pselect(x_is_zero, pselect(y_is_negative, cst_inf, cst_zero), pow);
+
+  // pow(base, exp) returns -pow(abs(base), exp) if base has the sign bit set,
+  // and exp is an odd integer exponent.
+  pow = pselect(pand(x_has_signbit, y_is_odd_int), pnegate(pow), pow);
+
+  // * pow(base, -∞) returns +∞ for any |base|<1
+  // * pow(base, -∞) returns +0 for any |base|>1
+  // * pow(base, +∞) returns +0 for any |base|<1
+  // * pow(base, +∞) returns +∞ for any |base|>1
+  // * pow(±0, -∞) returns +∞
+  // * pow(-1, +-∞) = 1
+  Packet inf_y_val = pselect(por(pand(y_is_negative, x_is_zero), pxor(y_is_negative, x_abs_gt_one)), cst_inf, cst_zero);
+  inf_y_val = pselect(pcmp_eq(x, pset1<Packet>(Scalar(-1.0))), cst_one, inf_y_val);
+  pow = pselect(y_abs_is_huge, inf_y_val, pow);
+
+  // * pow(+∞, exp) returns +0 for any negative exp
+  // * pow(+∞, exp) returns +∞ for any positive exp
+  // * pow(-∞, exp) returns -0 if exp is a negative odd integer.
+  // * pow(-∞, exp) returns +0 if exp is a negative non-integer or negative
+  //     even integer.
+  // * pow(-∞, exp) returns -∞ if exp is a positive odd integer.
+  // * pow(-∞, exp) returns +∞ if exp is a positive non-integer or positive
+  //     even integer.
+  auto x_pos_inf_value = pselect(y_is_negative, cst_zero, cst_inf);
+  auto x_neg_inf_value = pselect(y_is_odd_int, pnegate(x_pos_inf_value), x_pos_inf_value);
+  pow = pselect(x_abs_is_inf, pselect(x_is_negative, x_neg_inf_value, x_pos_inf_value), pow);
+
+  // All cases of NaN inputs return NaN, except the two below.
+  pow = pselect(por(pisnan(x), pisnan(y)), cst_nan, pow);
+
+  // * pow(base, 1) returns base.
+  // * pow(base, +/-0) returns 1, regardless of base, even NaN.
+  // * pow(+1, exp) returns 1, regardless of exponent, even NaN.
+  pow = pselect(y_is_one, x, pselect(por(x_is_one, y_is_zero), cst_one, pow));
+
+  return pow;
 }
 
 namespace unary_pow {
@@ -2343,7 +2327,12 @@
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
     const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
     if (exponent_is_integer) {
-      return unary_pow::int_pow(x, exponent);
+      // The simple recursive doubling implementation is only accurate to 3 ulps
+      // for integer exponents in [-3:7]. Since this is a common case, we
+      // specialize it here.
+      bool use_repeated_squaring =
+          (exponent <= ScalarExponent(7) && (!ExponentIsSigned || exponent >= ScalarExponent(-3)));
+      return use_repeated_squaring ? unary_pow::int_pow(x, exponent) : generic_pow(x, pset1<Packet>(exponent));
     } else {
       Packet result = unary_pow::gen_pow(x, exponent);
       result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
diff --git a/test/block.cpp b/test/block.cpp
index c67dd22..8f7c8de 100644
--- a/test/block.cpp
+++ b/test/block.cpp
@@ -73,6 +73,21 @@
 
   block_real_only(m1, r1, r2, c1, c1, s1);
 
+  // test fill logic with innerpanel and non-innerpanel blocks
+  m1.row(r1).setConstant(s1);
+  VERIFY_IS_CWISE_EQUAL(m1.row(r1), DynamicVectorType::Constant(cols, s1).transpose());
+  m1 = m1_copy;
+  m1.col(c1).setConstant(s1);
+  VERIFY_IS_CWISE_EQUAL(m1.col(c1), DynamicVectorType::Constant(rows, s1));
+  m1 = m1_copy;
+  // test setZero logic with innerpanel and non-innerpanel blocks
+  m1.row(r1).setZero();
+  VERIFY_IS_CWISE_EQUAL(m1.row(r1), DynamicVectorType::Zero(cols).transpose());
+  m1 = m1_copy;
+  m1.col(c1).setZero();
+  VERIFY_IS_CWISE_EQUAL(m1.col(c1), DynamicVectorType::Zero(rows));
+  m1 = m1_copy;
+
   // check row() and col()
   VERIFY_IS_EQUAL(m1.col(c1).transpose(), m1.transpose().row(c1));
   // check operator(), both constant and non-constant, on row() and col()
diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp
index a8d17d9..78ae09c 100644
--- a/test/mapped_matrix.cpp
+++ b/test/mapped_matrix.cpp
@@ -75,8 +75,23 @@
   Map<MatrixType> map4(array4, rows, cols);
 
   VERIFY_IS_EQUAL(map1, MatrixType::Ones(rows, cols));
+  map1.setConstant(s1);
+  VERIFY_IS_EQUAL(map1, MatrixType::Constant(rows, cols, s1));
+  map1.setZero();
+  VERIFY_IS_EQUAL(map1, MatrixType::Zero(rows, cols));
+
   VERIFY_IS_EQUAL(map2, MatrixType::Ones(rows, cols));
+  map2.setConstant(s1);
+  VERIFY_IS_EQUAL(map2, MatrixType::Constant(rows, cols, s1));
+  map2.setZero();
+  VERIFY_IS_EQUAL(map2, MatrixType::Zero(rows, cols));
+
   VERIFY_IS_EQUAL(map3, MatrixType::Ones(rows, cols));
+  map3.setConstant(s1);
+  VERIFY_IS_EQUAL(map3, MatrixType::Constant(rows, cols, s1));
+  map3.setZero();
+  VERIFY_IS_EQUAL(map3, MatrixType::Zero(rows, cols));
+
   map1 = MatrixType::Random(rows, cols);
   map2 = map1;
   map3 = map1;
diff --git a/test/mapstride.cpp b/test/mapstride.cpp
index 224316f..1a9f50c 100644
--- a/test/mapstride.cpp
+++ b/test/mapstride.cpp
@@ -94,6 +94,8 @@
     VERIFY_IS_APPROX(s1 * map, s1 * m);
     map *= s1;
     VERIFY_IS_APPROX(map, s1 * m);
+    map.setZero();
+    VERIFY_IS_CWISE_EQUAL(map, MatrixType::Zero(rows, cols));
   }
 
   // test no inner stride and an outer stride of +4. This is quite important as for fixed-size matrices,
@@ -118,6 +120,8 @@
     VERIFY_IS_APPROX(s1 * map, s1 * m);
     map *= s1;
     VERIFY_IS_APPROX(map, s1 * m);
+    map.setZero();
+    VERIFY_IS_CWISE_EQUAL(map, MatrixType::Zero(rows, cols));
   }
 
   // test both inner stride and outer stride
@@ -138,6 +142,8 @@
     VERIFY_IS_APPROX(s1 * map, s1 * m);
     map *= s1;
     VERIFY_IS_APPROX(map, s1 * m);
+    map.setZero();
+    VERIFY_IS_CWISE_EQUAL(map, MatrixType::Zero(rows, cols));
   }
 
   // test inner stride and no outer stride
@@ -156,6 +162,8 @@
     VERIFY_IS_APPROX(s1 * map, s1 * m);
     map *= s1;
     VERIFY_IS_APPROX(map, s1 * m);
+    map.setZero();
+    VERIFY_IS_CWISE_EQUAL(map, MatrixType::Zero(rows, cols));
   }
 
   // test negative strides
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index e4d1b8c..cdbaad6 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -199,6 +199,12 @@
     pcast_array<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio>::cast(data1, DataSize, data2);
 
     VERIFY(test::areApprox(ref, data2, DataSize) && "internal::pcast<>");
+
+    // Test that pcast<SrcScalar, TgtScalar> generates the same result.
+    for (int i = 0; i < DataSize; ++i) {
+      data2[i] = internal::pcast<SrcScalar, TgtScalar>(data1[i]);
+    }
+    VERIFY(test::areApprox(ref, data2, DataSize) && "internal::pcast<>");
   }
 };
 
@@ -1496,7 +1502,7 @@
 template <typename Scalar, typename Packet>
 struct exp_complex_test_impl<Scalar, Packet, false> {
   typedef typename Scalar::value_type RealScalar;
-  static void run(Scalar*, Scalar*, Scalar*, int){};
+  static void run(Scalar*, Scalar*, Scalar*, int) {};
 };
 
 template <typename Scalar, typename Packet>