Fix accuracy for psqrt<double> on Arm. Merged upstream as: https://gitlab.com/libeigen/eigen/-/commit/6cee8d347e8a7e8e1a689a3b7de5fe413f3e1103 PiperOrigin-RevId: 347629113

commit: f1cae63b51b86c63821d85f1331babc6debb63d9 [log] [tgz]
author: rmlarsen <rmlarsen@google.com> Tue Dec 15 09:30:36 2020 -0800
committer: Antonio Sanchez <cantonios@google.com> Wed Jun 30 15:17:10 2021 -0700
tree: 8f0be5ef2dd519aabd6e582f98d7c140877a4dbb
parent: e68f7e4db6f96468619c8150a54e605d1dae85c8 [diff]
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 90ffee7..47bca82 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h

@@ -3895,9 +3895,10 @@
   Packet2d x = vrsqrteq_f64(_x);
   // Do a single step of Newton's iteration.
   //the number 1.5f was set reference to Quake3's fast inverse square root
-  x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
-  // Do one more Newton's iteration to get more accurate result.
-  x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+  // Do two more Newton's iterations to geta result accurate to 1 ulp.
+  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+  x = pmul(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
   // Flush results for denormals to zero.
   return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));
 }
commit	f1cae63b51b86c63821d85f1331babc6debb63d9	[log] [tgz]
author	rmlarsen <rmlarsen@google.com>	Tue Dec 15 09:30:36 2020 -0800
committer	Antonio Sanchez <cantonios@google.com>	Wed Jun 30 15:17:10 2021 -0700
tree	8f0be5ef2dd519aabd6e582f98d7c140877a4dbb
parent	e68f7e4db6f96468619c8150a54e605d1dae85c8 [diff]