Update Eigen to commit:316eab8deb574d150f9cfc7f8b170156dc0cdd9f
CHANGELOG
=========
316eab8de - Do not set EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC for cuda compilation
07e4604b1 - Replace usage of CudaStreamDevice with GpuStreamDevice in tensor benchmarks GPU
PiperOrigin-RevId: 536777161
Change-Id: I8b60af36fd7f79ab0a624bc3497fcaea45489e63
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index f847b8d..0b867d5 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -361,28 +361,6 @@
#endif
#endif
-/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
-/// supports Neon vector intrinsics for fp16.
-#if EIGEN_ARCH_ARM_OR_ARM64
- #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
- #else
- #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
- #endif
- #endif
-#endif
-
-/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
-/// supports Neon scalar intrinsics for fp16.
-#if EIGEN_ARCH_ARM_OR_ARM64
- #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
- #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
- #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
- #endif
- #endif
-#endif
-
/// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
#if defined(__mips__) || defined(__mips)
#define EIGEN_ARCH_MIPS 1
@@ -641,6 +619,28 @@
//
#endif
+/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
+/// supports Neon vector intrinsics for fp16.
+#if EIGEN_ARCH_ARM_OR_ARM64
+ #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+ #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
+ #else
+ #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
+ #endif
+ #endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
+/// supports Neon scalar intrinsics for fp16.
+#if EIGEN_ARCH_ARM_OR_ARM64
+ #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+ #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+ #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
+ #endif
+ #endif
+#endif
+
#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro.
// In most cases we want to check if both macros are defined which can be done using the define below.
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 65784d0..d63ff8b 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -10,7 +10,7 @@
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
- Eigen::CudaStreamDevice stream; \
+ Eigen::GpuStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
cudaDeviceSynchronize(); \
@@ -40,7 +40,7 @@
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
- Eigen::CudaStreamDevice stream; \
+ Eigen::GpuStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
@@ -59,7 +59,7 @@
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
- Eigen::CudaStreamDevice stream; \
+ Eigen::GpuStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
cudaDeviceSynchronize(); \
diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu
index 76d68c5..c778102 100644
--- a/bench/tensors/tensor_benchmarks_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_gpu.cu
@@ -10,7 +10,7 @@
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
- Eigen::CudaStreamDevice stream; \
+ Eigen::GpuStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
cudaDeviceSynchronize(); \
@@ -40,7 +40,7 @@
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
- Eigen::CudaStreamDevice stream; \
+ Eigen::GpuStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
@@ -59,7 +59,7 @@
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
- Eigen::CudaStreamDevice stream; \
+ Eigen::GpuStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
cudaDeviceSynchronize(); \