Update Eigen to commit:316eab8deb574d150f9cfc7f8b170156dc0cdd9f

CHANGELOG
=========
316eab8de - Do not set EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC for CUDA compilation
07e4604b1 - Replace usage of CudaStreamDevice with GpuStreamDevice in GPU tensor benchmarks

PiperOrigin-RevId: 536777161
Change-Id: I8b60af36fd7f79ab0a624bc3497fcaea45489e63
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index f847b8d..0b867d5 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -361,28 +361,6 @@
   #endif
 #endif
 
-/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
-/// supports Neon vector intrinsics for fp16.
-#if EIGEN_ARCH_ARM_OR_ARM64
-  #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
-    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
-    #else
-      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
-    #endif
-  #endif
-#endif
-
-/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
-/// supports Neon scalar intrinsics for fp16.
-#if EIGEN_ARCH_ARM_OR_ARM64
-  #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
-    #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
-      #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
-    #endif
-  #endif
-#endif
-
 /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
 #if defined(__mips__) || defined(__mips)
   #define EIGEN_ARCH_MIPS 1
@@ -641,6 +619,28 @@
 //
 #endif
 
+/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
+/// supports Neon vector intrinsics for fp16.
+#if EIGEN_ARCH_ARM_OR_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
+    #else
+      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
+    #endif
+  #endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
+/// supports Neon scalar intrinsics for fp16.
+#if EIGEN_ARCH_ARM_OR_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+    #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+      #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
+    #endif
+  #endif
+#endif
+
 #if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
 // EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro.
 // In most cases we want to check if both macros are defined which can be done using the define below.
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 65784d0..d63ff8b 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -10,7 +10,7 @@
 #define BM_FuncGPU(FUNC)                                                       \
   static void BM_##FUNC(int iters, int N) {                                    \
     StopBenchmarkTiming();                                                     \
-    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuStreamDevice stream;                                             \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N);            \
     cudaDeviceSynchronize();                                                   \
@@ -40,7 +40,7 @@
 #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
   static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
     StopBenchmarkTiming();                                                     \
-    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuStreamDevice stream;                                             \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3);   \
     cudaDeviceSynchronize();                                                   \
@@ -59,7 +59,7 @@
 #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
   static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
     StopBenchmarkTiming();                                                     \
-    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuStreamDevice stream;                                             \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N);            \
     cudaDeviceSynchronize();                                                   \
diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu
index 76d68c5..c778102 100644
--- a/bench/tensors/tensor_benchmarks_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_gpu.cu
@@ -10,7 +10,7 @@
 #define BM_FuncGPU(FUNC)                                                       \
   static void BM_##FUNC(int iters, int N) {                                    \
     StopBenchmarkTiming();                                                     \
-    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuStreamDevice stream;                                             \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
     cudaDeviceSynchronize();                                                   \
@@ -40,7 +40,7 @@
 #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
   static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
     StopBenchmarkTiming();                                                     \
-    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuStreamDevice stream;                                             \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3);         \
     cudaDeviceSynchronize();                                                   \
@@ -59,7 +59,7 @@
 #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
   static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
     StopBenchmarkTiming();                                                     \
-    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuStreamDevice stream;                                             \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
     cudaDeviceSynchronize();                                                   \