Fix accuracy for psqrt<double> on Arm. Merged upstream as: https://gitlab.com/libeigen/eigen/-/commit/6cee8d347e8a7e8e1a689a3b7de5fe413f3e1103
PiperOrigin-RevId: 347629113
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 90ffee7..47bca82 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3895,9 +3895,10 @@
Packet2d x = vrsqrteq_f64(_x);
// Do a single step of Newton's iteration.
//the number 1.5f was set reference to Quake3's fast inverse square root
- x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
- // Do one more Newton's iteration to get more accurate result.
- x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+ x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+ // Do two more Newton's iterations to geta result accurate to 1 ulp.
+ x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+ x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
// Flush results for denormals to zero.
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));
}