Update Eigen to commit:ab310943d62982a45734f52fed782ec57c9aa6a0 CHANGELOG ========= ab310943d - Add a yield instruction in the two spinloops of the threaded matmul implementation. 99ffad197 - A few cleanups to threaded product code and test. 59498c96f - SSE/AVX use fmaddsub for complex products PiperOrigin-RevId: 662989342 Change-Id: I5ca079b01b427f8ba6a32dc6b2684eeda7ac1fd5

commit: ca0ee8864adfe0ab1225b549299604aba47a3713 [log] [tgz]
author: Rasmus Munk Larsen <rmlarsen@google.com> Wed Aug 14 11:18:07 2024 -0700
committer: Copybara-Service <copybara-worker@google.com> Wed Aug 14 11:19:10 2024 -0700
tree: 84804a6b17a97a6037d8d018f052c5d1774b0feb
parent: cd3b0ab2e985e5fe26b4ca4a94be12ba76687e76 [diff]
diff --git a/Eigen/Core b/Eigen/Core
index e452e73..29dda39 100644
--- a/Eigen/Core
+++ b/Eigen/Core

@@ -97,6 +97,11 @@
 // for std::is_nothrow_move_assignable
 #include <type_traits>
 
+// for std::this_thread::yield().
+#if !defined(EIGEN_USE_BLAS) && (defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL))
+#include <thread>
+#endif
+
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
@@ -117,8 +122,8 @@
 #include <CL/sycl.hpp>
 #include <map>
 #include <memory>
-#include <utility>
 #include <thread>
+#include <utility>
 #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
 #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
 #endif

diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index bae5714..67945cb 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h

@@ -85,10 +85,14 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
-  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
-  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
-  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) {
+  __m256 tmp1 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
+  __m256 tmp2 = _mm256_moveldup_ps(a.v);
+#ifdef EIGEN_VECTORIZE_FMA
+  __m256 result = _mm256_fmaddsub_ps(tmp2, b.v, tmp1);
+#else
+  __m256 result = _mm256_addsub_ps(_mm256_mul_ps(tmp2, b.v), tmp1);
+#endif
   return Packet4cf(result);
 }
 
@@ -121,11 +125,11 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from)));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(_mm256_load_ps(&numext::real_ref(*from)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from)));
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(_mm256_loadu_ps(&numext::real_ref(*from)));
 }
 
 template <>
@@ -145,11 +149,11 @@
 
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v);
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(&numext::real_ref(*to), from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(&numext::real_ref(*to), from.v);
 }
 
 template <>
@@ -283,13 +287,15 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
-  __m256d tmp1 = _mm256_shuffle_pd(a.v, a.v, 0x0);
-  __m256d even = _mm256_mul_pd(tmp1, b.v);
-  __m256d tmp2 = _mm256_shuffle_pd(a.v, a.v, 0xF);
-  __m256d tmp3 = _mm256_shuffle_pd(b.v, b.v, 0x5);
-  __m256d odd = _mm256_mul_pd(tmp2, tmp3);
-  return Packet2cd(_mm256_addsub_pd(even, odd));
+EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) {
+  __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0xF), _mm256_permute_pd(b.v, 0x5));
+  __m256d tmp2 = _mm256_movedup_pd(a.v);
+#ifdef EIGEN_VECTORIZE_FMA
+  __m256d result = _mm256_fmaddsub_pd(tmp2, b.v, tmp1);
+#else
+  __m256d result = _mm256_addsub_pd(_mm256_mul_pd(tmp2, b.v), tmp1);
+#endif
+  return Packet2cd(result);
 }
 
 template <>
@@ -321,11 +327,11 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet2cd pload<Packet2cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(_mm256_load_pd((const double*)from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from));
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(_mm256_loadu_pd((const double*)from));
 }
 
 template <>
@@ -342,11 +348,11 @@
 
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd((double*)to, from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd((double*)to, from.v);
 }
 
 template <>

diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 0e70f03..a390260 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h

@@ -89,19 +89,25 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) {
 #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
-                                 _mm_mul_ps(_mm_movehdup_ps(a.v), vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-  //   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-  //                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-  //                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+  __m128 tmp1 = _mm_mul_ps(_mm_movehdup_ps(a.v), vec4f_swizzle1(b.v, 1, 0, 3, 2));
+  __m128 tmp2 = _mm_moveldup_ps(a.v);
 #else
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000));
-  return Packet2cf(
-      _mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-                 _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+  __m128 tmp1 = _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2));
+  __m128 tmp2 = vec4f_swizzle1(a.v, 0, 0, 2, 2);
 #endif
+#ifdef EIGEN_VECTORIZE_FMA
+  __m128 result = _mm_fmaddsub_ps(tmp2, b.v, tmp1);
+#else
+#ifdef EIGEN_VECTORIZE_SSE3
+  __m128 result = _mm_addsub_ps(_mm_mul_ps(tmp2, b.v), tmp1);
+#else
+  const __m128 mask = _mm_setr_ps(-0.0f, 0.0f, -0.0f, 0.0f);
+  __m128 result = _mm_add_ps(_mm_mul_ps(tmp2, b.v), _mm_xor_ps(tmp1, mask));
+#endif
+#endif
+  return Packet2cf(result);
 }
 
 template <>
@@ -127,11 +133,11 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from)));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps(&numext::real_ref(*from)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from)));
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(_mm_loadu_ps(&numext::real_ref(*from)));
 }
 
 template <>
@@ -148,11 +154,11 @@
 
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v));
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(&numext::real_ref(*to), from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v));
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(&numext::real_ref(*to), from.v);
 }
 
 template <>
@@ -277,15 +283,24 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) {
+  __m128d tmp1 = _mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), vec2d_swizzle1(b.v, 1, 0));
 #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
-                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0))));
+  __m128d tmp2 = _mm_movedup_pd(a.v);
 #else
-  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0, 0x0, 0x80000000, 0x0));
-  return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
-                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)), mask)));
+  __m128d tmp2 = _mm_unpacklo_pd(a.v, a.v);
 #endif
+#ifdef EIGEN_VECTORIZE_FMA
+  __m128d result = _mm_fmaddsub_pd(tmp2, b.v, tmp1);
+#else
+#ifdef EIGEN_VECTORIZE_SSE3
+  __m128d result = _mm_addsub_pd(_mm_mul_pd(tmp2, b.v), tmp1);
+#else
+  const __m128d mask = _mm_setr_pd(-0.0, 0.0);
+  __m128d result = _mm_add_pd(_mm_mul_pd(tmp2, b.v), _mm_xor_pd(tmp1, mask));
+#endif
+#endif
+  return Packet1cd(result);
 }
 
 template <>
@@ -312,11 +327,11 @@
 // FIXME force unaligned load, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(_mm_loadu_pd((const double*)from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd
@@ -332,11 +347,11 @@
 // FIXME force unaligned store, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v));
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd((double*)to, from.v);
 }
 
 template <>

diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index e9d0cae..ebfac01 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h

@@ -97,6 +97,7 @@
         // Then, we set info->task_info[tid].users to the number of threads to mark that all other threads are going to
         // use it.
         while (info->task_info[tid].users != 0) {
+          std::this_thread::yield();
         }
         info->task_info[tid].users = threads;
 
@@ -115,6 +116,7 @@
           // However, no need to wait for the B' part which has been updated by the current thread!
           if (shift > 0) {
             while (info->task_info[i].sync != k) {
+              std::this_thread::yield();
             }
           }
 

diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 667fea2..018efa6 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h

@@ -71,7 +71,7 @@
 // TODO(rmlarsen): Make the device API available instead of
 // storing a local static pointer variable to avoid this issue.
 inline ThreadPool* setGemmThreadPool(ThreadPool* new_pool) {
-  static ThreadPool* pool;
+  static ThreadPool* pool = nullptr;
   if (new_pool != nullptr) {
     // This will wait for work in all threads in *pool to finish,
     // then destroy the old ThreadPool, and then replace it with new_pool.
@@ -232,7 +232,6 @@
   }
 
 #elif defined(EIGEN_GEMM_THREADPOOL)
-  ei_declare_aligned_stack_constructed_variable(GemmParallelTaskInfo<Index>, meta_info, threads, 0);
   Barrier barrier(threads);
   auto task = [=, &func, &barrier, &task_info](int i) {
     Index actual_threads = threads;

diff --git a/test/product_threaded.cpp b/test/product_threaded.cpp
index 1eb38fb..1782c28 100644
--- a/test/product_threaded.cpp
+++ b/test/product_threaded.cpp

@@ -19,6 +19,7 @@
   c.noalias() = a * b;
 
   ThreadPool pool(num_threads);
+  Eigen::setGemmThreadPool(&pool);
   MatrixXf c_threaded(n, n);
   c_threaded.noalias() = a * b;
commit	ca0ee8864adfe0ab1225b549299604aba47a3713	[log] [tgz]
author	Rasmus Munk Larsen <rmlarsen@google.com>	Wed Aug 14 11:18:07 2024 -0700
committer	Copybara-Service <copybara-worker@google.com>	Wed Aug 14 11:19:10 2024 -0700
tree	84804a6b17a97a6037d8d018f052c5d1774b0feb
parent	cd3b0ab2e985e5fe26b4ca4a94be12ba76687e76 [diff]