Fix accuracy for psqrt<double> on Arm. Merged upstream as: https://gitlab.com/libeigen/eigen/-/commit/6cee8d347e8a7e8e1a689a3b7de5fe413f3e1103

PiperOrigin-RevId: 347629113
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 90ffee7..47bca82 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3895,9 +3895,10 @@
   Packet2d x = vrsqrteq_f64(_x);
   // Do a single step of Newton's iteration.
   //the number 1.5f was set reference to Quake3's fast inverse square root
-  x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
-  // Do one more Newton's iteration to get more accurate result.
-  x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+  // Do two more Newton's iterations to geta result accurate to 1 ulp.
+  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
   // Flush results for denormals to zero.
   return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));
 }