// Eigen/src/Core/util/ConfigureVectorization.h - eigen - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2020, Arm Limited and Contributors
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 #ifndef EIGEN_CONFIGURE_VECTORIZATION_H
 #define EIGEN_CONFIGURE_VECTORIZATION_H

 //------------------------------------------------------------------------------------------
 // Static and dynamic alignment control
 //
 // The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
 // as the maximal boundaries, in bytes, on which dynamically and statically allocated data may be aligned,
 // respectively. The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the
 // user. If not, a default value is automatically computed based on architecture, compiler, and OS.
 //
 // This section also defines the macro EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,64,_MAX}
 // to be used to declare statically aligned buffers.
 //------------------------------------------------------------------------------------------

 /* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
  * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
  * so that vectorization doesn't affect binary compatibility.
  *
  * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
  * vectorized and non-vectorized code.
  */
 #if (defined EIGEN_CUDACC)
 #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
 #define EIGEN_ALIGNOF(x) __alignof(x)
 #else
 #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
 #define EIGEN_ALIGNOF(x) alignof(x)
 #endif

 // EIGEN_IDEAL_MAX_ALIGN_BYTES: the alignment (in bytes) that would be ideal for the
 // instruction set being targeted.
 //
 // If the user explicitly disables vectorization, alignment is disabled as well (0),
 // except when EIGEN_GPUCC is defined: GPU code is always vectorized and requires
 // memory alignment for statically allocated buffers, so 16 is kept in that case.
 // Otherwise the widest available instruction set decides; the larger values are
 // only selected when really required.
 #if defined(EIGEN_DONT_VECTORIZE) && !defined(EIGEN_GPUCC)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
 #elif defined(EIGEN_DONT_VECTORIZE)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
 #elif defined(__AVX512F__)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
 #elif defined(__AVX__)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
 #elif defined(__HVX__) && (__HVX_LENGTH__ == 128)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 128
 #else
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
 #endif

 // Smallest value for which the notion of explicit alignment makes sense.
 #define EIGEN_MIN_ALIGN_BYTES 16

 // User opt-out of static alignment.
 //
 // EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated; defining either
 // of them forces EIGEN_MAX_STATIC_ALIGN_BYTES to 0. Combining them with a
 // user-provided, strictly positive EIGEN_MAX_STATIC_ALIGN_BYTES is contradictory
 // and is rejected at compile time.
 #if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
 #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
 #if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
 #error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
 #endif
 #undef EIGEN_MAX_STATIC_ALIGN_BYTES
 #endif
 #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
 #endif

 #if !defined(EIGEN_MAX_STATIC_ALIGN_BYTES)

 // No user-provided value: pick a default for EIGEN_MAX_STATIC_ALIGN_BYTES.
 //
 // 16 byte alignment only matters for vectorization, but because it affects the ABI it
 // has to be enabled on every platform where vectorization might be enabled. Always
 // enabling it would be possible in theory, yet it causes problems on some platforms,
 // so it is switched off for a few compiler+architecture combinations. Only static
 // alignment is really problematic (it relies on nonstandard compiler extensions);
 // heap alignment is kept even when static alignment has to be disabled.

 // With EIGEN_COMP_GNUC set, stack alignment is only wanted on the architectures listed here.
 #if !EIGEN_COMP_GNUC || EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || \
     EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
 #else
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
 #endif

 // Static alignment is completely disabled for the combination flagged above, and
 // whenever EIGEN_COMP_SUNCC or EIGEN_OS_QNX is set.
 #if EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT || EIGEN_COMP_SUNCC || EIGEN_OS_QNX
 #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
 #else
 #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
 #endif

 #if !EIGEN_ARCH_WANTS_STACK_ALIGNMENT
 #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
 #else
 #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
 #endif

 #endif

 // A user-provided EIGEN_MAX_ALIGN_BYTES acts as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES.
 #ifdef EIGEN_MAX_ALIGN_BYTES
 #if EIGEN_MAX_STATIC_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
 #undef EIGEN_MAX_STATIC_ALIGN_BYTES
 #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
 #endif
 #endif

 // With static alignment switched off, make sure EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT is defined.
 #if EIGEN_MAX_STATIC_ALIGN_BYTES == 0
 #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
 #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
 #endif
 #endif

 // At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
 // It takes into account both the user choice to explicitly enable/disable alignment (by setting
 // EIGEN_MAX_STATIC_ALIGN_BYTES) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only
 // EIGEN_MAX_STATIC_ALIGN_BYTES should be used.

 // Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
 #define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
 #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
 #define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
 #define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
 #if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
 #define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
 #else
 #define EIGEN_ALIGN_MAX
 #endif

 // Dynamic alignment control
 //
 // EIGEN_DONT_ALIGN forces EIGEN_MAX_ALIGN_BYTES to 0 and may not be combined with a
 // strictly positive user-provided EIGEN_MAX_ALIGN_BYTES. Without any user input the
 // ideal value for the target is used.
 #ifdef EIGEN_DONT_ALIGN
 #if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES > 0
 #error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
 #endif
 #undef EIGEN_MAX_ALIGN_BYTES
 #define EIGEN_MAX_ALIGN_BYTES 0
 #endif

 #ifndef EIGEN_MAX_ALIGN_BYTES
 #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
 #endif

 // EIGEN_DEFAULT_ALIGN_BYTES is the larger of EIGEN_MAX_ALIGN_BYTES and EIGEN_IDEAL_MAX_ALIGN_BYTES.
 #if EIGEN_MAX_ALIGN_BYTES >= EIGEN_IDEAL_MAX_ALIGN_BYTES
 #define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
 #else
 #define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
 #endif

 // Defaults to 1 unless the user supplied a value.
 #if !defined(EIGEN_UNALIGNED_VECTORIZE)
 #define EIGEN_UNALIGNED_VECTORIZE 1
 #endif

 //----------------------------------------------------------------------

 // Disabled alignment implies disabled vectorization. EIGEN_MAX_ALIGN_BYTES is the proper thing to
 // test here: it accounts for both the user's will (EIGEN_MAX_ALIGN_BYTES, EIGEN_DONT_ALIGN) and
 // our own platform checks.
 #if (EIGEN_MAX_ALIGN_BYTES == 0) && !defined(EIGEN_DONT_VECTORIZE)
 #define EIGEN_DONT_VECTORIZE
 #endif

 // SSE2 availability. Apart from #include <malloc.h> and the _M_IX86_FP test, the following can
 // likely be removed, since gcc 4.1 and msvc 2008 are not supported anyway.
 #if !EIGEN_COMP_MSVC
 #ifdef __SSE2__
 #define EIGEN_SSE2_ON_NON_MSVC
 #endif
 #else
 #include <malloc.h>  // for _aligned_malloc -- needed whether or not vectorization is enabled
 // A user reported that MSVC does not define _M_IX86_FP in 64-bit mode, hence the extra
 // EIGEN_ARCH_x86_64 test.
 #if EIGEN_ARCH_x86_64 || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
 #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
 #endif
 #endif

 #if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))

 #if defined(EIGEN_SSE2_ON_NON_MSVC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)

 // Defines symbols for compile-time detection of which instructions are
 // used.
 // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
 //
 // SSE and SSE2 are the baseline for this branch and are enabled unconditionally.
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_SSE
 #define EIGEN_VECTORIZE_SSE2

 // Detect sse3/ssse3/sse4:
 // gcc and icc define __SSE3__, ...
 // There is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
 // want to force the use of those instructions with msvc.
 #ifdef __SSE3__
 #define EIGEN_VECTORIZE_SSE3
 #endif
 #ifdef __SSSE3__
 #define EIGEN_VECTORIZE_SSSE3
 #endif
 #ifdef __SSE4_1__
 #define EIGEN_VECTORIZE_SSE4_1
 #endif
 #ifdef __SSE4_2__
 #define EIGEN_VECTORIZE_SSE4_2
 #endif
 // __AVX__ turns on every SSE3..SSE4.2 flag regardless of the individual __SSEx__ macros tested
 // above. EIGEN_VECTORIZE_AVX itself is withheld when EIGEN_USE_SYCL is defined.
 #ifdef __AVX__
 #ifndef EIGEN_USE_SYCL
 #define EIGEN_VECTORIZE_AVX
 #endif
 #define EIGEN_VECTORIZE_SSE3
 #define EIGEN_VECTORIZE_SSSE3
 #define EIGEN_VECTORIZE_SSE4_1
 #define EIGEN_VECTORIZE_SSE4_2
 #endif
 // __AVX2__ enables AVX2 and AVX (again not under EIGEN_USE_SYCL) plus the same SSE flags.
 #ifdef __AVX2__
 #ifndef EIGEN_USE_SYCL
 #define EIGEN_VECTORIZE_AVX2
 #define EIGEN_VECTORIZE_AVX
 #endif
 #define EIGEN_VECTORIZE_SSE3
 #define EIGEN_VECTORIZE_SSSE3
 #define EIGEN_VECTORIZE_SSE4_1
 #define EIGEN_VECTORIZE_SSE4_2
 #endif
 #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
 // MSVC does not expose a switch dedicated for FMA
 // For MSVC, AVX2 => FMA
 #define EIGEN_VECTORIZE_FMA
 #endif
 // AVX-512: EIGEN_VECTORIZE_FMA must already have been defined by the check just above,
 // otherwise compilation is aborted with an explanatory message.
 #if defined(__AVX512F__)
 #ifndef EIGEN_VECTORIZE_FMA
 #if EIGEN_COMP_GNUC
 #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
 #else
 #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
 #endif
 #endif
 // The AVX512/AVX2/AVX flags are withheld when EIGEN_USE_SYCL is defined; the FMA and SSE flags are not.
 #ifndef EIGEN_USE_SYCL
 #define EIGEN_VECTORIZE_AVX512
 #define EIGEN_VECTORIZE_AVX2
 #define EIGEN_VECTORIZE_AVX
 #endif
 #define EIGEN_VECTORIZE_FMA
 #define EIGEN_VECTORIZE_SSE3
 #define EIGEN_VECTORIZE_SSSE3
 #define EIGEN_VECTORIZE_SSE4_1
 #define EIGEN_VECTORIZE_SSE4_2
 // Optional AVX-512 sub-features, each mirrored from the corresponding compiler macro.
 #ifndef EIGEN_USE_SYCL
 #ifdef __AVX512DQ__
 #define EIGEN_VECTORIZE_AVX512DQ
 #endif
 #ifdef __AVX512ER__
 #define EIGEN_VECTORIZE_AVX512ER
 #endif
 #ifdef __AVX512BF16__
 #define EIGEN_VECTORIZE_AVX512BF16
 #endif
 // AVX512-FP16 is only accepted together with AVX512-VL; without it compilation is aborted.
 #ifdef __AVX512FP16__
 #ifdef __AVX512VL__
 #define EIGEN_VECTORIZE_AVX512FP16
 #else
 #if EIGEN_COMP_GNUC
 #error Please add -mavx512vl to your compiler flags: compiling with -mavx512fp16 alone without AVX512-VL is not supported.
 #else
 #error Please enable AVX512-VL in your compiler flags (e.g. -mavx512vl): compiling with AVX512-FP16 alone without AVX512-VL is not supported.
 #endif
 #endif
 #endif
 #endif
 #endif

 // Disable AVX support on broken Xcode versions
 #if (EIGEN_COMP_CLANGAPPLE == 11000033) && (__MAC_OS_X_VERSION_MIN_REQUIRED == 101500)
 // The clang compiler shipped with Xcode 11.0 has a nasty bug that shows up in a common compilation
 // situation: Xcode 11.0 combined with a Mac deployment target of macOS 10.15.
 // See https://trac.macports.org/ticket/58776#no1
 //
 // When AVX was selected, it is dropped together with the AVX2, FMA, AVX512, AVX512DQ and AVX512ER
 // flags, and a warning is emitted.
 // NOTE(review): EIGEN_VECTORIZE_AVX512BF16 and EIGEN_VECTORIZE_AVX512FP16 are not undefined
 // here -- confirm whether that is intentional.
 #ifdef EIGEN_VECTORIZE_AVX
 #undef EIGEN_VECTORIZE_AVX
 #warning \
     "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
 #ifdef EIGEN_VECTORIZE_AVX2
 #undef EIGEN_VECTORIZE_AVX2
 #endif
 #ifdef EIGEN_VECTORIZE_FMA
 #undef EIGEN_VECTORIZE_FMA
 #endif
 #ifdef EIGEN_VECTORIZE_AVX512
 #undef EIGEN_VECTORIZE_AVX512
 #endif
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 #undef EIGEN_VECTORIZE_AVX512DQ
 #endif
 #ifdef EIGEN_VECTORIZE_AVX512ER
 #undef EIGEN_VECTORIZE_AVX512ER
 #endif
 #endif
 // NOTE: Confirmed test failures in Xcode 11.0 and Xcode 11.2 with -macosx-version-min=10.15 and AVX.
 // NOTE: Using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many
 //       tests; Xcode 11.2 produces core dumps in 3 tests.
 // NOTE: Using -macosx-version-min=10.14 produces functioning and passing tests in all cases.
 // NOTE: __clang_version__ "11.0.0 (clang-1100.0.33.8)"  Xcode 11.0 <- produces many segfaulting and
 //       core-dumping tests with -macosx-version-min=10.15 and AVX.
 // NOTE: __clang_version__ "11.0.0 (clang-1100.0.33.12)" Xcode 11.2 <- produces 3 core-dumping tests
 //       with -macosx-version-min=10.15 and AVX.
 #endif

 // include files

 // This extern "C" works around a MINGW-w64 compilation issue
 // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
 // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
 // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
 // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
 // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
 // notice that since these are C headers, the extern "C" is theoretically needed anyways.
 extern "C" {
 // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
 // Doing so triggers some issues with ICC. However, old gcc versions seem not to have this file, thus:
 #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
 #include <immintrin.h>
 #else
 #include <mmintrin.h>
 #include <emmintrin.h>
 #include <xmmintrin.h>
 #ifdef EIGEN_VECTORIZE_SSE3
 #include <pmmintrin.h>
 #endif
 #ifdef EIGEN_VECTORIZE_SSSE3
 #include <tmmintrin.h>
 #endif
 #ifdef EIGEN_VECTORIZE_SSE4_1
 #include <smmintrin.h>
 #endif
 #ifdef EIGEN_VECTORIZE_SSE4_2
 #include <nmmintrin.h>
 #endif
 #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
 #include <immintrin.h>
 #endif
 #endif
 }  // end extern "C"

 #elif defined(__VSX__) && !defined(__APPLE__)

 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_VSX 1
 #define EIGEN_VECTORIZE_FMA
 #include <altivec.h>
 // We need to #undef all these ugly tokens defined in <altivec.h>
 // => use __vector instead of vector
 #undef bool
 #undef vector
 #undef pixel

 #elif defined __ALTIVEC__

 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_ALTIVEC
 #define EIGEN_VECTORIZE_FMA
 #include <altivec.h>
 // We need to #undef all these ugly tokens defined in <altivec.h>
 // => use __vector instead of vector
 #undef bool
 #undef vector
 #undef pixel

 #elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)

 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_NEON
 #include <arm_neon.h>

 // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
 // will not select the backend automatically
 #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)

 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_SVE
 #include <arm_sve.h>

 // Since we depend on knowing SVE vector lengths at compile-time, we need
 // to ensure a fixed lengths is set
 #if defined __ARM_FEATURE_SVE_BITS
 #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
 #else
 #error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
 #endif

 #elif (defined __s390x__ && defined __VEC__)

 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_ZVECTOR
 #include <vecintrin.h>

 #elif defined __mips_msa

 // Limit MSA optimizations to little-endian CPUs for now.
 // TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
 #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
 #if defined(__LP64__)
 #define EIGEN_MIPS_64
 #else
 #define EIGEN_MIPS_32
 #endif
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_MSA
 #include <msa.h>
 #endif

 #elif defined __HVX__ && (__HVX_LENGTH__ == 128)

 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_HVX
 #include <hexagon_types.h>

 #endif
 #endif

 // Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
 // compilers seem to follow this. We therefore include it explicitly.
 // See also: https://bugs.llvm.org/show_bug.cgi?id=47955
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
 #include <arm_fp16.h>
 #endif

 // Enable FMA for ARM.
 #if defined(__ARM_FEATURE_FMA)
 #define EIGEN_VECTORIZE_FMA
 #endif

 #if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_CLANG_STRICT_AT_LEAST(3, 8, 0))
 // We can use the optimized fp16 to float and float to fp16 conversion routines
 #define EIGEN_HAS_FP16_C

 #if EIGEN_COMP_GNUC
 // Make sure immintrin.h is included, even if e.g. vectorization is
 // explicitly disabled (see also issue #2395).
 // Note that FP16C intrinsics for gcc and clang are included by immintrin.h,
 // as opposed to emmintrin.h as suggested by Intel:
 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
 #include <immintrin.h>
 #endif
 #endif

 #if defined EIGEN_CUDACC
 #define EIGEN_VECTORIZE_GPU
 #include <vector_types.h>
 #if EIGEN_CUDA_SDK_VER >= 70500
 #define EIGEN_HAS_CUDA_FP16
 #endif
 #endif

 #if defined(EIGEN_HAS_CUDA_FP16)
 #include <cuda_runtime_api.h>
 #include <cuda_fp16.h>
 #endif

 #if defined(EIGEN_HIPCC)
 #define EIGEN_VECTORIZE_GPU
 #include <hip/hip_vector_types.h>
 #define EIGEN_HAS_HIP_FP16
 #include <hip/hip_fp16.h>
 #define EIGEN_HAS_HIP_BF16
 #include <hip/hip_bfloat16.h>
 #endif

 /** \brief Namespace containing all symbols from the %Eigen library. */
 // IWYU pragma: private
 #include "../InternalHeaderCheck.h"

 namespace Eigen {

 /** \returns a pointer to a string literal listing, comma-separated, the SIMD instruction sets
  * selected at compile time through the EIGEN_VECTORIZE_* macros, or "None" when none of the
  * macros tested below is defined.
  *
  * Only the first matching macro decides the result, so the tests are ordered from the widest
  * x86 instruction set down to plain SSE2, followed by the non-x86 backends.
  */
 inline static const char *SimdInstructionSetsInUse(void) {
 #if defined(EIGEN_VECTORIZE_AVX512)
   return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_AVX)
   // A separating comma after "AVX" keeps this list consistent with the others.
   return "AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_2)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_1)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
 #elif defined(EIGEN_VECTORIZE_SSSE3)
   return "SSE, SSE2, SSE3, SSSE3";
 #elif defined(EIGEN_VECTORIZE_SSE3)
   return "SSE, SSE2, SSE3";
 #elif defined(EIGEN_VECTORIZE_SSE2)
   return "SSE, SSE2";
 #elif defined(EIGEN_VECTORIZE_ALTIVEC)
   return "AltiVec";
 #elif defined(EIGEN_VECTORIZE_VSX)
   return "VSX";
 #elif defined(EIGEN_VECTORIZE_NEON)
   return "ARM NEON";
 #elif defined(EIGEN_VECTORIZE_SVE)
   return "ARM SVE";
 #elif defined(EIGEN_VECTORIZE_ZVECTOR)
   return "S390X ZVECTOR";
 #elif defined(EIGEN_VECTORIZE_MSA)
   return "MIPS MSA";
 #elif defined(EIGEN_VECTORIZE_HVX)
   // EIGEN_VECTORIZE_HVX is defined by the backend selection in this file, so report it
   // instead of falling through to "None".
   return "HVX";
 #else
   return "None";
 #endif
 }

 }  // end namespace Eigen

 #endif  // EIGEN_CONFIGURE_VECTORIZATION_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
	// Copyright (C) 2020, Arm Limited and Contributors
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

	#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
	#define EIGEN_CONFIGURE_VECTORIZATION_H

	//------------------------------------------------------------------------------------------
	// Static and dynamic alignment control
	//
	// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
	// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
	// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
	// a default value is automatically computed based on architecture, compiler, and OS.
	//
	// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
	// to be used to declare statically aligned buffers.
	//------------------------------------------------------------------------------------------

	/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
	* However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
	* so that vectorization doesn't affect binary compatibility.
	*
	* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
	* vectorized and non-vectorized code.
	*/
	#if (defined EIGEN_CUDACC)
	#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
	#define EIGEN_ALIGNOF(x) __alignof(x)
	#else
	#define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
	#define EIGEN_ALIGNOF(x) alignof(x)
	#endif

	// If the user explicitly disable vectorization, then we also disable alignment
	#if defined(EIGEN_DONT_VECTORIZE)
	#if defined(EIGEN_GPUCC)
	// GPU code is always vectorized and requires memory alignment for
	// statically allocated buffers.
	#define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
	#else
	#define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
	#endif
	#elif defined(__AVX512F__)
	// 64 bytes static alignment is preferred only if really required
	#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
	#elif defined(__AVX__)
	// 32 bytes static alignment is preferred only if really required
	#define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
	#elif defined __HVX__ && (__HVX_LENGTH__ == 128)
	#define EIGEN_IDEAL_MAX_ALIGN_BYTES 128
	#else
	#define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
	#endif

	// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
	#define EIGEN_MIN_ALIGN_BYTES 16

	// Defined the boundary (in bytes) on which the data needs to be aligned. Note
	// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
	// aligned at all regardless of the value of this #define.

	#if (defined(EIGEN_DONT_ALIGN_STATICALLY) \|\| defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && \
	EIGEN_MAX_STATIC_ALIGN_BYTES > 0
	#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
	#endif

	// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
	// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
	#if defined(EIGEN_DONT_ALIGN_STATICALLY) \|\| defined(EIGEN_DONT_ALIGN)
	#ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
	#undef EIGEN_MAX_STATIC_ALIGN_BYTES
	#endif
	#define EIGEN_MAX_STATIC_ALIGN_BYTES 0
	#endif

	#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES

	// Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES

	// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
	// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
	// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
	// certain common platform (compiler+architecture combinations) to avoid these problems.
	// Only static alignment is really problematic (relies on nonstandard compiler extensions),
	// try to keep heap alignment even when we have to disable static alignment.
	#if EIGEN_COMP_GNUC && \
	!(EIGEN_ARCH_i386_OR_x86_64 \|\| EIGEN_ARCH_ARM_OR_ARM64 \|\| EIGEN_ARCH_PPC \|\| EIGEN_ARCH_IA64 \|\| EIGEN_ARCH_MIPS)
	#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
	#else
	#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
	#endif

	// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
	#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT && !EIGEN_COMP_SUNCC && !EIGEN_OS_QNX
	#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
	#else
	#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
	#endif

	#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
	#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
	#else
	#define EIGEN_MAX_STATIC_ALIGN_BYTES 0
	#endif

	#endif

	// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
	#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES < EIGEN_MAX_STATIC_ALIGN_BYTES
	#undef EIGEN_MAX_STATIC_ALIGN_BYTES
	#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
	#endif

	#if EIGEN_MAX_STATIC_ALIGN_BYTES == 0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
	#define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
	#endif

	// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
	// It takes into account both the user choice to explicitly enable/disable alignment (by setting
	// EIGEN_MAX_STATIC_ALIGN_BYTES) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only
	// EIGEN_MAX_STATIC_ALIGN_BYTES should be used.

	// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
	#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
	#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
	#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
	#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
	#if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
	#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
	#else
	#define EIGEN_ALIGN_MAX
	#endif

	// Dynamic alignment control

	#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES > 0
	#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
	#endif

	#ifdef EIGEN_DONT_ALIGN
	#ifdef EIGEN_MAX_ALIGN_BYTES
	#undef EIGEN_MAX_ALIGN_BYTES
	#endif
	#define EIGEN_MAX_ALIGN_BYTES 0
	#elif !defined(EIGEN_MAX_ALIGN_BYTES)
	#define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
	#endif

	#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
	#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
	#else
	#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
	#endif

	#ifndef EIGEN_UNALIGNED_VECTORIZE
	#define EIGEN_UNALIGNED_VECTORIZE 1
	#endif

	//----------------------------------------------------------------------

	// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
	// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
	#if EIGEN_MAX_ALIGN_BYTES == 0
	#ifndef EIGEN_DONT_VECTORIZE
	#define EIGEN_DONT_VECTORIZE
	#endif
	#endif

	// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
	// removed as gcc 4.1 and msvc 2008 are not supported anyways.
	#if EIGEN_COMP_MSVC
	#include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
	// a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
	#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) \|\| EIGEN_ARCH_x86_64
	#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
	#endif
	#else
	#if defined(__SSE2__)
	#define EIGEN_SSE2_ON_NON_MSVC
	#endif
	#endif

	#if !(defined(EIGEN_DONT_VECTORIZE) \|\| defined(EIGEN_GPUCC))

	#if defined(EIGEN_SSE2_ON_NON_MSVC) \|\| defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)

	// Defines symbols for compile-time detection of which instructions are
	// used.
	// EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
	#define EIGEN_VECTORIZE
	#define EIGEN_VECTORIZE_SSE
	#define EIGEN_VECTORIZE_SSE2

	// Detect sse3/ssse3/sse4:
	// gcc and icc defines __SSE3__, ...
	// there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
	// want to force the use of those instructions with msvc.
	#ifdef __SSE3__
	#define EIGEN_VECTORIZE_SSE3
	#endif
	#ifdef __SSSE3__
	#define EIGEN_VECTORIZE_SSSE3
	#endif
	#ifdef __SSE4_1__
	#define EIGEN_VECTORIZE_SSE4_1
	#endif
	#ifdef __SSE4_2__
	#define EIGEN_VECTORIZE_SSE4_2
	#endif
	#ifdef __AVX__
	#ifndef EIGEN_USE_SYCL
	#define EIGEN_VECTORIZE_AVX
	#endif
	#define EIGEN_VECTORIZE_SSE3
	#define EIGEN_VECTORIZE_SSSE3
	#define EIGEN_VECTORIZE_SSE4_1
	#define EIGEN_VECTORIZE_SSE4_2
	#endif
	#ifdef __AVX2__
	#ifndef EIGEN_USE_SYCL
	#define EIGEN_VECTORIZE_AVX2
	#define EIGEN_VECTORIZE_AVX
	#endif
	#define EIGEN_VECTORIZE_SSE3
	#define EIGEN_VECTORIZE_SSSE3
	#define EIGEN_VECTORIZE_SSE4_1
	#define EIGEN_VECTORIZE_SSE4_2
	#endif
	#if defined(__FMA__) \|\| (EIGEN_COMP_MSVC && defined(__AVX2__))
	// MSVC does not expose a switch dedicated for FMA
	// For MSVC, AVX2 => FMA
	#define EIGEN_VECTORIZE_FMA
	#endif
	#if defined(__AVX512F__)
	#ifndef EIGEN_VECTORIZE_FMA
	#if EIGEN_COMP_GNUC
	#error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
	#else
	#error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
	#endif
	#endif
	#ifndef EIGEN_USE_SYCL
	#define EIGEN_VECTORIZE_AVX512
	#define EIGEN_VECTORIZE_AVX2
	#define EIGEN_VECTORIZE_AVX
	#endif
	#define EIGEN_VECTORIZE_FMA
	#define EIGEN_VECTORIZE_SSE3
	#define EIGEN_VECTORIZE_SSSE3
	#define EIGEN_VECTORIZE_SSE4_1
	#define EIGEN_VECTORIZE_SSE4_2
	#ifndef EIGEN_USE_SYCL
	#ifdef __AVX512DQ__
	#define EIGEN_VECTORIZE_AVX512DQ
	#endif
	#ifdef __AVX512ER__
	#define EIGEN_VECTORIZE_AVX512ER
	#endif
	#ifdef __AVX512BF16__
	#define EIGEN_VECTORIZE_AVX512BF16
	#endif
	#ifdef __AVX512FP16__
	#ifdef __AVX512VL__
	#define EIGEN_VECTORIZE_AVX512FP16
	#else
	#if EIGEN_COMP_GNUC
	#error Please add -mavx512vl to your compiler flags: compiling with -mavx512fp16 alone without AVX512-VL is not supported.
	#else
	#error Please enable AVX512-VL in your compiler flags (e.g. -mavx512vl): compiling with AVX512-FP16 alone without AVX512-VL is not supported.
	#endif
	#endif
	#endif
	#endif
	#endif

	// Disable AVX support on broken xcode versions
	#if (EIGEN_COMP_CLANGAPPLE == 11000033) && (__MAC_OS_X_VERSION_MIN_REQUIRED == 101500)
	// A nasty bug in the clang compiler shipped with xcode in a common compilation situation
	// when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1
	#ifdef EIGEN_VECTORIZE_AVX
	#undef EIGEN_VECTORIZE_AVX
	#warning \
	"Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
	#ifdef EIGEN_VECTORIZE_AVX2
	#undef EIGEN_VECTORIZE_AVX2
	#endif
	#ifdef EIGEN_VECTORIZE_FMA
	#undef EIGEN_VECTORIZE_FMA
	#endif
	#ifdef EIGEN_VECTORIZE_AVX512
	#undef EIGEN_VECTORIZE_AVX512
	#endif
	#ifdef EIGEN_VECTORIZE_AVX512DQ
	#undef EIGEN_VECTORIZE_AVX512DQ
	#endif
	#ifdef EIGEN_VECTORIZE_AVX512ER
	#undef EIGEN_VECTORIZE_AVX512ER
	#endif
	#endif
	// NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with -macosx-version-min=10.15 and AVX
	// NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2
	// produce core dumps in 3 tests NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all
	// cases NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)" XCode 11.0 <- Produces many segfault and core dumping
	// tests
	// with -macosx-version-min=10.15 and AVX
	// NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with
	// -macosx-version-min=10.15 and AVX
	#endif

	// include files

	// This extern "C" works around a MINGW-w64 compilation issue
	// https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
	// In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
	// However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
	// with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
	// so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
	// notice that since these are C headers, the extern "C" is theoretically needed anyways.
	extern "C" {
	// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
	// Doing so triggers some issues with ICC. However, old gcc versions seem not to have this file, thus:
	//  - ICC >= 11.10 and Emscripten get the single umbrella header;
	//  - everything else gets the individual headers matching the enabled EIGEN_VECTORIZE_* levels.
	#if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
	#include <immintrin.h>
	#else
	// Headers included unconditionally on this path.
	#include <mmintrin.h>
	#include <emmintrin.h>
	#include <xmmintrin.h>
	// One extra header per additional SSE level that was enabled.
	#ifdef EIGEN_VECTORIZE_SSE3
	#include <pmmintrin.h>
	#endif
	#ifdef EIGEN_VECTORIZE_SSSE3
	#include <tmmintrin.h>
	#endif
	#ifdef EIGEN_VECTORIZE_SSE4_1
	#include <smmintrin.h>
	#endif
	#ifdef EIGEN_VECTORIZE_SSE4_2
	#include <nmmintrin.h>
	#endif
	// AVX and AVX512 are only reachable through immintrin.h.
	#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
	#include <immintrin.h>
	#endif
	#endif
	} // end extern "C"

	// VSX backend: taken when the compiler defines __VSX__ and __APPLE__ is not defined.
	#elif defined(__VSX__) && !defined(__APPLE__)

	#define EIGEN_VECTORIZE
	#define EIGEN_VECTORIZE_VSX 1
	// FMA is enabled unconditionally together with this backend.
	#define EIGEN_VECTORIZE_FMA
	#include <altivec.h>
	// We need to #undef all these ugly tokens defined in <altivec.h>
	// => use __vector instead of vector
	#undef bool
	#undef vector
	#undef pixel

	// AltiVec backend: taken when the compiler defines __ALTIVEC__ and no earlier branch matched.
	#elif defined __ALTIVEC__

	#define EIGEN_VECTORIZE
	#define EIGEN_VECTORIZE_ALTIVEC
	// FMA is enabled unconditionally together with this backend.
	#define EIGEN_VECTORIZE_FMA
	#include <altivec.h>
	// We need to #undef all these ugly tokens defined in <altivec.h>
	// => use __vector instead of vector
	#undef bool
	#undef vector
	#undef pixel

	// NEON backend: taken when the compiler defines __ARM_NEON or __ARM_NEON__, unless the user
	// explicitly asked for the SVE backend through EIGEN_ARM64_USE_SVE.
	#elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)

	#define EIGEN_VECTORIZE
	#define EIGEN_VECTORIZE_NEON
	#include <arm_neon.h>

	// We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
	// will not select the backend automatically
	#elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)

	#define EIGEN_VECTORIZE
	#define EIGEN_VECTORIZE_SVE
	#include <arm_sve.h>

	// Since we depend on knowing SVE vector lengths at compile-time, we need
	// to ensure a fixed length is set. EIGEN_ARM64_SVE_VL is taken from
	// __ARM_FEATURE_SVE_BITS; compilation stops if that macro is not defined.
	#if defined __ARM_FEATURE_SVE_BITS
	#define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
	#else
	#error "Eigen requires a fixed SVE vector length but EIGEN_ARM64_SVE_VL is not set."
	#endif

	// ZVECTOR backend: taken when both __s390x__ and __VEC__ are defined.
	#elif (defined __s390x__ && defined __VEC__)

	#define EIGEN_VECTORIZE
	#define EIGEN_VECTORIZE_ZVECTOR
	#include <vecintrin.h>

	// MSA backend: this branch is taken whenever __mips_msa is defined, but vectorization is
	// only switched on inside the little-endian check below. On any other byte order the branch
	// defines nothing, so no vectorization macro is set.
	#elif defined __mips_msa

	// Limit MSA optimizations to little-endian CPUs for now.
	// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
	#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
	// Record the pointer model: EIGEN_MIPS_64 when __LP64__ is defined, EIGEN_MIPS_32 otherwise.
	#if defined(__LP64__)
	#define EIGEN_MIPS_64
	#else
	#define EIGEN_MIPS_32
	#endif
	#define EIGEN_VECTORIZE
	#define EIGEN_VECTORIZE_MSA
	#include <msa.h>
	#endif

	// HVX backend: taken only when __HVX__ is defined and __HVX_LENGTH__ equals 128
	// (presumably the vector length in bytes - TODO confirm); other lengths get no vectorization.
	#elif defined __HVX__ && (__HVX_LENGTH__ == 128)

	#define EIGEN_VECTORIZE
	#define EIGEN_VECTORIZE_HVX
	#include <hexagon_types.h>

	#endif
	#endif

	// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
	// compilers seem to follow this. We therefore include it explicitly.
	// See also: https://bugs.llvm.org/show_bug.cgi?id=47955
	// The include is only performed when EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC is defined.
	#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
	#include <arm_fp16.h>
	#endif

	// Enable FMA for ARM.
	// This check comes after the backend-selection chain has been closed, so it defines
	// EIGEN_VECTORIZE_FMA whenever __ARM_FEATURE_FMA is defined, whichever backend was picked.
	#if defined(__ARM_FEATURE_FMA)
	#define EIGEN_VECTORIZE_FMA
	#endif

	#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT \|\| EIGEN_CLANG_STRICT_AT_LEAST(3, 8, 0))
	// We can use the optimized fp16 to float and float to fp16 conversion routines
	#define EIGEN_HAS_FP16_C

	#if EIGEN_COMP_GNUC
	// Make sure immintrin.h is included, even if e.g. vectorization is
	// explicitly disabled (see also issue #2395).
	// Note that FP16C intrinsics for gcc and clang are included by immintrin.h,
	// as opposed to emmintrin.h as suggested by Intel:
	// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
	#include <immintrin.h>
	#endif
	#endif

	// CUDA compilation: mark GPU vectorization as active and pull in the vector types header.
	#if defined EIGEN_CUDACC
	#define EIGEN_VECTORIZE_GPU
	#include <vector_types.h>
	// fp16 support is only advertised when EIGEN_CUDA_SDK_VER is at least 70500
	// (presumably CUDA 7.5 - TODO confirm the version encoding).
	#if EIGEN_CUDA_SDK_VER >= 70500
	#define EIGEN_HAS_CUDA_FP16
	#endif
	#endif

	// Headers needed for CUDA fp16. This tests only EIGEN_HAS_CUDA_FP16, so it also
	// applies when that macro was defined somewhere other than the CUDA block.
	#if defined(EIGEN_HAS_CUDA_FP16)
	#include <cuda_runtime_api.h>
	#include <cuda_fp16.h>
	#endif

	// HIP compilation: mark GPU vectorization as active. fp16 and bf16 support are both
	// advertised unconditionally, each macro defined just before its header is included.
	#if defined(EIGEN_HIPCC)
	#define EIGEN_VECTORIZE_GPU
	#include <hip/hip_vector_types.h>
	#define EIGEN_HAS_HIP_FP16
	#include <hip/hip_fp16.h>
	#define EIGEN_HAS_HIP_BF16
	#include <hip/hip_bfloat16.h>
	#endif

	/** \brief Namespace containing all symbols from the %Eigen library. */
	// IWYU pragma: private
	#include "../InternalHeaderCheck.h"

	namespace Eigen {

	/** \brief Names the SIMD instruction sets this translation unit was configured to use.
	 *
	 * The answer is fixed at preprocessing time from the EIGEN_VECTORIZE_* macros. The first
	 * matching entry wins, so the x86 levels are tested from the widest (AVX512) down to SSE2
	 * before the other backends are considered.
	 *
	 * \returns a pointer to a string literal holding a comma-separated list of instruction sets,
	 *          or "None" when none of the tested macros is defined. Never null.
	 */
	inline static const char *SimdInstructionSetsInUse(void) {
	#if defined(EIGEN_VECTORIZE_AVX512)
	  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
	#elif defined(EIGEN_VECTORIZE_AVX)
	  // Comma after "AVX" keeps this entry a well-formed list like all the others.
	  return "AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
	#elif defined(EIGEN_VECTORIZE_SSE4_2)
	  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
	#elif defined(EIGEN_VECTORIZE_SSE4_1)
	  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
	#elif defined(EIGEN_VECTORIZE_SSSE3)
	  return "SSE, SSE2, SSE3, SSSE3";
	#elif defined(EIGEN_VECTORIZE_SSE3)
	  return "SSE, SSE2, SSE3";
	#elif defined(EIGEN_VECTORIZE_SSE2)
	  return "SSE, SSE2";
	#elif defined(EIGEN_VECTORIZE_ALTIVEC)
	  return "AltiVec";
	#elif defined(EIGEN_VECTORIZE_VSX)
	  return "VSX";
	#elif defined(EIGEN_VECTORIZE_NEON)
	  return "ARM NEON";
	#elif defined(EIGEN_VECTORIZE_SVE)
	  return "ARM SVE";
	#elif defined(EIGEN_VECTORIZE_ZVECTOR)
	  return "S390X ZVECTOR";
	#elif defined(EIGEN_VECTORIZE_MSA)
	  return "MIPS MSA";
	#elif defined(EIGEN_VECTORIZE_HVX)
	  // The HVX backend is selectable in this header, so report it instead of "None".
	  return "Hexagon HVX";
	#else
	  return "None";
	#endif
	}

	} // end namespace Eigen

	#endif // EIGEN_CONFIGURE_VECTORIZATION_H