BEGIN_PUBLIC
Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/41d5d5334b8a4e364dfd88dcd91f6cd38834b8ed
END_PUBLIC

PiperOrigin-RevId: 343178274
diff --git a/Eigen/LU b/Eigen/LU
index ca72f13..2a6b771 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -40,12 +40,8 @@
 
 // Use the SSE optimized version whenever possible. At the moment the
 // SSE version doesn't compile when AVX is enabled
-#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
-  #include "src/LU/arch/Inverse_SSE.h"
-#endif
-
-#if defined EIGEN_VECTORIZE_NEON
-  #include "src/LU/arch/Inverse_NEON.h"
+#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON
+  #include "src/LU/arch/InverseSize4.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 07f4b94..e9da359 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -376,7 +376,7 @@
 * Implementation of cast                                                 *
 ****************************************************************************/
 
-template<typename OldType, typename NewType>
+template<typename OldType, typename NewType, typename EnableIf = void>
 struct cast_impl
 {
   EIGEN_DEVICE_FUNC
@@ -386,6 +386,22 @@
   }
 };
 
+// Casting from S -> Complex<T> leads to an implicit conversion from S to T,
+// generating warnings on clang.  Here we explicitly cast the real component.
+template<typename OldType, typename NewType>
+struct cast_impl<OldType, NewType,
+  typename internal::enable_if<
+    !NumTraits<OldType>::IsComplex && NumTraits<NewType>::IsComplex
+  >::type>
+{
+  EIGEN_DEVICE_FUNC
+  static inline NewType run(const OldType& x)
+  {
+    typedef typename NumTraits<NewType>::Real NewReal;
+    return static_cast<NewType>(static_cast<NewReal>(x));
+  }
+};
+
 // here, for once, we're plainly returning NewType: we don't want cast to do weird things.
 
 template<typename OldType, typename NewType>
@@ -486,7 +502,7 @@
       #if defined(EIGEN_HIP_DEVICE_COMPILE)
       // HIP does not seem to have a native device side implementation for the math routine "arg"
       using std::arg;
-      #else 		  
+      #else
       EIGEN_USING_STD(arg);
       #endif
       return arg(x);
@@ -967,7 +983,7 @@
 
 namespace numext {
 
-#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) 
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index f6d02f7..fcfe6f4 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -10,20 +10,22 @@
 #ifndef EIGEN_TRANSPOSITIONS_H
 #define EIGEN_TRANSPOSITIONS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 template<typename Derived>
 class TranspositionsBase
 {
     typedef internal::traits<Derived> Traits;
-    
+
   public:
 
     typedef typename Traits::IndicesType IndicesType;
     typedef typename IndicesType::Scalar StorageIndex;
     typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
+    EIGEN_DEVICE_FUNC
     Derived& derived() { return *static_cast<Derived*>(this); }
+    EIGEN_DEVICE_FUNC
     const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
     /** Copies the \a other transpositions into \c *this */
@@ -35,10 +37,13 @@
     }
 
     /** \returns the number of transpositions */
+    EIGEN_DEVICE_FUNC
     Index size() const { return indices().size(); }
     /** \returns the number of rows of the equivalent permutation matrix */
+    EIGEN_DEVICE_FUNC
     Index rows() const { return indices().size(); }
     /** \returns the number of columns of the equivalent permutation matrix */
+    EIGEN_DEVICE_FUNC
     Index cols() const { return indices().size(); }
 
     /** Direct access to the underlying index vector */
@@ -55,8 +60,10 @@
     inline StorageIndex& operator[](Index i) { return indices()(i); }
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return derived().indices(); }
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return derived().indices(); }
 
     /** Resizes to given size. */
@@ -178,8 +185,10 @@
     {}
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
@@ -237,9 +246,11 @@
     #endif
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
-    
+
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
@@ -279,9 +290,11 @@
     }
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
 
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
@@ -335,8 +348,11 @@
 
     explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
 
+    EIGEN_DEVICE_FUNC
     Index size() const { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC
     Index rows() const { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC
     Index cols() const { return m_transpositions.size(); }
 
     /** \returns the \a matrix with the inverse transpositions applied to the columns.
@@ -356,7 +372,7 @@
     {
       return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
     }
-    
+
     const TranspositionType& nestedExpression() const { return m_transpositions; }
 
   protected:
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index 747f7a5..53ee53d 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -323,7 +323,7 @@
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
-  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0)));
+  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3,2,1,0>::mask)));
 }
 
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
@@ -426,15 +426,15 @@
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4cd,4>& kernel) {
-  __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1]
-  __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3]
-  __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1]
-  __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3]
+  __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0,1,0,1>::mask)); // [a0 a1 b0 b1]
+  __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2,3,2,3>::mask)); // [a2 a3 b2 b3]
+  __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0,1,0,1>::mask)); // [c0 c1 d0 d1]
+  __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2,3,2,3>::mask)); // [c2 c3 d2 d3]
 
-  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3]
-  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2]
-  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1]
-  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0]
+  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1,3,1,3>::mask))); // [a3 b3 c3 d3]
+  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0,2,0,2>::mask))); // [a2 b2 c2 d2]
+  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1,3,1,3>::mask))); // [a1 b1 c1 d1]
+  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
 }
 
 } // end namespace internal
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 4fdda8a..0bc1e9d 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -60,20 +60,6 @@
 
 struct half;
 
-// explicit conversion operators are no available before C++11 so we first cast
-// half to RealScalar rather than to std::complex<RealScalar> directly
-#if !EIGEN_HAS_CXX11
-namespace internal {
-template <typename RealScalar>
-struct cast_impl<half, std::complex<RealScalar> > {
-  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(const half &x)
-  {
-    return static_cast<std::complex<RealScalar> >(static_cast<RealScalar>(x));
-  }
-};
-} // namespace internal
-#endif  // EIGEN_HAS_CXX11
-
 namespace half_impl {
 
 #if !defined(EIGEN_HAS_GPU_FP16)
@@ -788,7 +774,7 @@
 
 } // end namespace Eigen
 
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   #pragma pop_macro("EIGEN_CONSTEXPR")
 #endif
 
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index dbfb1cd..30edd70 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -84,6 +84,53 @@
 
 #endif // EIGEN_COMP_MSVC
 
+// Functionally equivalent to _mm_shuffle_ps in SSE when interleave
+// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask));
+// interleaves m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
+// to enable a shared implementation for fast inversion of matrices of size 4.
+template<bool interleave>
+EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask)
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+template<> 
+EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, int mask) 
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
+
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
+{ 
+  return shuffle<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+}
+#define vec4f_duplane(a, p) \
+  vdupq_lane_f32(vget_low_f32(a), p)
+
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
@@ -3525,6 +3572,32 @@
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;
 
+// Functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m, n, mask)).
+// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
+// for fast inversion of matrices of size 4.
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
+{
+  const double* a = reinterpret_cast<const double*>(&m);
+  const double* b = reinterpret_cast<const double*>(&n);
+  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
+{
+  return shuffle(a, b, mask);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 0);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 3);
+}
+#define vec2d_duplane(a, p) \
+  vdupq_laneq_f64(a, p)
+
 template<> struct packet_traits<double>  : default_packet_traits
 {
   typedef Packet2d type;
@@ -3766,7 +3839,7 @@
 }
 
 #else 
-template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrt_f64(_x); }
+template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
 #endif
 
 #endif // EIGEN_ARCH_ARM64
@@ -4282,7 +4355,7 @@
   prod = vmul_f16(prod, vrev64_f16(prod));
 
   Eigen::half h;
-  h.x = vget_lane_f16(prod, 0) * vget_lane_f16(prod, 1);
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
   return h;
 }
 
@@ -4291,7 +4364,7 @@
   float16x4_t prod;
   prod = vmul_f16(a, vrev64_f16(a));
   Eigen::half h;
-  h.x = vget_lane_f16(prod, 0) * vget_lane_f16(prod, 1);
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
   return h;
 }
 
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
index 80be213..54f9733 100644
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -1401,6 +1401,14 @@
 EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
   return vreinterpretq_u64_f64(a);
 }
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return vreinterpretq_f64_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return vreinterpretq_s32_f64(a);
+}
 
 #endif  // EIGEN_ARCH_ARM64
 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index c39c0a0..4db7014 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -52,22 +52,59 @@
 template<> struct is_arithmetic<Packet4i>  { enum { value = true }; };
 template<> struct is_arithmetic<Packet16b>  { enum { value = true }; };
 
-#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
+template<int p, int q, int r, int s>
+struct shuffle_mask{
+ enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
+};
 
+// TODO: change the implementation of all swizzle* ops from macro to template.
 #define vec4f_swizzle1(v,p,q,r,s) \
-  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))
+  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
 
 #define vec4i_swizzle1(v,p,q,r,s) \
-  (_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
+  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
 
 #define vec2d_swizzle1(v,p,q) \
-  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1))))
+  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
 
 #define vec4f_swizzle2(a,b,p,q,r,s) \
-  (_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
+  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
 
 #define vec4i_swizzle2(a,b,p,q,r,s) \
-  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))))
+  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
+
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movelh_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movehl_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpacklo_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpackhi_ps(a,b));
+}
+#define vec4f_duplane(a,p) \
+  vec4f_swizzle2(a,a,p,p,p,p)
+
+#define vec2d_swizzle2(a,b,mask) \
+  Packet2d(_mm_shuffle_pd(a,b,mask))
+
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpacklo_pd(a,b));
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpackhi_pd(a,b));
+}
+#define vec2d_duplane(a,p) \
+  vec2d_swizzle2(a,a,(p<<1)|p)
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index 3e6cd90..d2a0037 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -77,6 +77,14 @@
   return _mm_castsi128_ps(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_pd(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet2d>(const Packet2d& a) {
+  return _mm_castpd_si128(a);
+}
+
 // Disable the following code since it's broken on too many platforms / compilers.
 //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #if 0
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
new file mode 100644
index 0000000..df0fe0e
--- /dev/null
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -0,0 +1,348 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2001 Intel Corporation
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
+// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
+// adjugate of M and determinant of M respectively. M# is computed block-wise
+// using specific formulae. For proof, see:
+// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
+// Variable names are adopted from \src\LU\Inverse_SSE.h.
+//
+// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
+// comes from the following Intel's library:
+// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
+//
+// Here is the respective copyright and license statement:
+//
+//   Copyright (c) 2001 Intel Corporation.
+//
+// Permition is granted to use, copy, distribute and prepare derivative works
+// of this library for any purpose and without fee, provided, that the above
+// copyright notice and this statement appear in all copies.
+// Intel makes no representations about the suitability of this software for
+// any purpose, and specifically disclaims all warranties.
+// See LEGAL.TXT for all the legal information.
+//
+// TODO: Unify implementations of different data types (i.e. float and double).
+#ifndef EIGEN_INVERSE_SIZE_4_H
+#define EIGEN_INVERSE_SIZE_4_H
+
+namespace Eigen
+{
+namespace internal
+{
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
+{
+  enum
+  {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result)
+  {
+    ActualMatrixType matrix(mat);
+
+    Packet4f _L1 = matrix.template packet<MatrixAlignment>(0);
+    Packet4f _L2 = matrix.template packet<MatrixAlignment>(4);
+    Packet4f _L3 = matrix.template packet<MatrixAlignment>(8);
+    Packet4f _L4 = matrix.template packet<MatrixAlignment>(12);
+
+    // Four 2x2 sub-matrices of the input matrix
+    // input = [[A, B],
+    //          [C, D]]
+    Packet4f A, B, C, D;
+
+    if (!StorageOrdersMatch)
+    {
+      A = vec4f_unpacklo(_L1, _L2);
+      B = vec4f_unpacklo(_L3, _L4);
+      C = vec4f_unpackhi(_L1, _L2);
+      D = vec4f_unpackhi(_L3, _L4);
+    }
+    else
+    {
+      A = vec4f_movelh(_L1, _L2);
+      B = vec4f_movehl(_L2, _L1);
+      C = vec4f_movelh(_L3, _L4);
+      D = vec4f_movehl(_L4, _L3);
+    }
+
+    Packet4f AB, DC;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
+    AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
+
+    // DC = D#*C
+    DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
+    DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
+
+    // determinants of the sub-matrices
+    Packet4f dA, dB, dC, dD;
+
+    dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
+    dA = psub(dA, vec4f_movehl(dA, dA));
+
+    dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
+    dB = psub(dB, vec4f_movehl(dB, dB));
+
+    dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
+    dC = psub(dC, vec4f_movehl(dC, dC));
+
+    dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
+    dD = psub(dD, vec4f_movehl(dD, dD));
+
+    Packet4f d, d1, d2;
+
+    d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
+    d = padd(d, vec4f_movehl(d, d));
+    d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
+
+    // Four sub-matrices of the inverse
+    Packet4f iA, iB, iC, iD;
+
+    // iD = D*|A| - C*A#*B
+    iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
+    iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
+    iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
+
+    // iA = A*|D| - B*D#*C
+    iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
+    iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
+    iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
+    iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
+    iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
+    iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
+    iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
+
+    const int bits[4] = {0, -2147483648, -2147483648, 0};
+    const Packet4f p4f_sign_PNNP = preinterpret<Packet4f, Packet4i>(pgather<int, Packet4i>(bits, static_cast<Eigen::Index>(1)));
+    rd = pxor(rd, p4f_sign_PNNP);
+    iA = pmul(iA, rd);
+    iB = pmul(iB, rd);
+    iC = pmul(iC, rd);
+    iD = pmul(iD, rd);
+
+    Index res_stride = result.outerStride();
+    float *res = result.data();
+
+    pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
+    pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
+  }
+};
+
+#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
+// same algorithm as above, except that each operand is split into
+// halves for two registers to hold.
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
+{
+  enum
+  {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
+                               MatrixType const &,
+                               typename MatrixType::PlainObject>::type
+      ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result)
+  {
+    ActualMatrixType matrix(mat);
+
+    // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
+    // row e.g. A1, upper row of A, A2, lower row of A
+    // input = [[A, B],  =  [[[A1, [B1,
+    //          [C, D]]        A2], B2]],
+    //                       [[C1, [D1,
+    //                         C2], D2]]]
+
+    Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
+
+    if (StorageOrdersMatch)
+    {
+      A1 = matrix.template packet<MatrixAlignment>(0);
+      B1 = matrix.template packet<MatrixAlignment>(2);
+      A2 = matrix.template packet<MatrixAlignment>(4);
+      B2 = matrix.template packet<MatrixAlignment>(6);
+      C1 = matrix.template packet<MatrixAlignment>(8);
+      D1 = matrix.template packet<MatrixAlignment>(10);
+      C2 = matrix.template packet<MatrixAlignment>(12);
+      D2 = matrix.template packet<MatrixAlignment>(14);
+    }
+    else
+    {
+      Packet2d temp;
+      A1 = matrix.template packet<MatrixAlignment>(0);
+      C1 = matrix.template packet<MatrixAlignment>(2);
+      A2 = matrix.template packet<MatrixAlignment>(4);
+      C2 = matrix.template packet<MatrixAlignment>(6);
+
+      temp = A1;
+      A1 = vec2d_unpacklo(A1, A2);
+      A2 = vec2d_unpackhi(temp, A2);
+
+      temp = C1;
+      C1 = vec2d_unpacklo(C1, C2);
+      C2 = vec2d_unpackhi(temp, C2);
+
+      B1 = matrix.template packet<MatrixAlignment>(8);
+      D1 = matrix.template packet<MatrixAlignment>(10);
+      B2 = matrix.template packet<MatrixAlignment>(12);
+      D2 = matrix.template packet<MatrixAlignment>(14);
+
+      temp = B1;
+      B1 = vec2d_unpacklo(B1, B2);
+      B2 = vec2d_unpackhi(temp, B2);
+
+      temp = D1;
+      D1 = vec2d_unpacklo(D1, D2);
+      D2 = vec2d_unpackhi(temp, D2);
+    }
+
+    // determinants of the sub-matrices
+    Packet2d dA, dB, dC, dD;
+
+    dA = vec2d_swizzle2(A2, A2, 1);
+    dA = pmul(A1, dA);
+    dA = psub(dA, vec2d_duplane(dA, 1));
+
+    dB = vec2d_swizzle2(B2, B2, 1);
+    dB = pmul(B1, dB);
+    dB = psub(dB, vec2d_duplane(dB, 1));
+
+    dC = vec2d_swizzle2(C2, C2, 1);
+    dC = pmul(C1, dC);
+    dC = psub(dC, vec2d_duplane(dC, 1));
+
+    dD = vec2d_swizzle2(D2, D2, 1);
+    dD = pmul(D1, dD);
+    dD = psub(dD, vec2d_duplane(dD, 1));
+
+    Packet2d DC1, DC2, AB1, AB2;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB1 = pmul(B1, vec2d_duplane(A2, 1));
+    AB2 = pmul(B2, vec2d_duplane(A1, 0));
+    AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
+    AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
+
+    // DC = D#*C
+    DC1 = pmul(C1, vec2d_duplane(D2, 1));
+    DC2 = pmul(C2, vec2d_duplane(D1, 0));
+    DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
+    DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
+
+    Packet2d d1, d2;
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet2d det;
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet2d rd;
+
+    d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
+    d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
+    rd = padd(d1, d2);
+    rd = padd(rd, vec2d_duplane(rd, 1));
+
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    det = padd(d1, d2);
+    det = psub(det, rd);
+    det = vec2d_duplane(det, 0);
+    rd = pdiv(pset1<Packet2d>(1.0), det);
+
+    // rows of four sub-matrices of the inverse
+    Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
+
+    // iD = D*|A| - C*A#*B
+    iD1 = pmul(AB1, vec2d_duplane(C1, 0));
+    iD2 = pmul(AB1, vec2d_duplane(C2, 0));
+    iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
+    iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
+    dA = vec2d_duplane(dA, 0);
+    iD1 = psub(pmul(D1, dA), iD1);
+    iD2 = psub(pmul(D2, dA), iD2);
+
+    // iA = A*|D| - B*D#*C
+    iA1 = pmul(DC1, vec2d_duplane(B1, 0));
+    iA2 = pmul(DC1, vec2d_duplane(B2, 0));
+    iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
+    iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
+    dD = vec2d_duplane(dD, 0);
+    iA1 = psub(pmul(A1, dD), iA1);
+    iA2 = psub(pmul(A2, dD), iA2);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
+    iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
+    iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    dB = vec2d_duplane(dB, 0);
+    iB1 = psub(pmul(C1, dB), iB1);
+    iB2 = psub(pmul(C2, dB), iB2);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
+    iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
+    iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    dC = vec2d_duplane(dC, 0);
+    iC1 = psub(pmul(B1, dC), iC1);
+    iC2 = psub(pmul(B2, dC), iC2);
+
+    const int bits1[4] = {0, -2147483648, 0, 0};
+    const int bits2[4] = {0, 0, 0, -2147483648};
+    const Packet2d _Sign_NP = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits1, static_cast<Eigen::Index>(1)));
+    const Packet2d _Sign_PN = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits2, static_cast<Eigen::Index>(1)));
+    d1 = pxor(rd, _Sign_PN);
+    d2 = pxor(rd, _Sign_NP);
+
+    Index res_stride = result.outerStride();
+    double *res = result.data();
+    pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
+  }
+};
+#endif
+} // namespace internal
+} // namespace Eigen
+#endif
diff --git a/Eigen/src/LU/arch/Inverse_NEON.h b/Eigen/src/LU/arch/Inverse_NEON.h
deleted file mode 100644
index ed64b1b..0000000
--- a/Eigen/src/LU/arch/Inverse_NEON.h
+++ /dev/null
@@ -1,372 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-//
-// The algorithm below is a re-implementation of \src\LU\Inverse_SSE.h using NEON
-// intrinsics. inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
-// adjugate of M and determinant of M respectively. M# is computed block-wise
-// using specific formulae. For proof, see:
-// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
-// Variable names are adopted from \src\LU\Inverse_SSE.h.
-
-// TODO: Unify implementations of different data types (i.e. float and double) and
-// different sets of instrinsics (i.e. SSE and NEON)
-#ifndef EIGEN_INVERSE_NEON_H
-#define EIGEN_INVERSE_NEON_H
-
-namespace Eigen
-{
-namespace internal
-{
-template <typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::NEON, float, MatrixType, ResultType>
-{
-  enum
-  {
-    MatrixAlignment = traits<MatrixType>::Alignment,
-    ResultAlignment = traits<ResultType>::Alignment,
-    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
-  };
-  typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
-
-  // fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
-  // == false (i.e. shuffle(m, n, mask, false) equals _mm_shuffle_ps(m, n, mask)),
-  // interleave m and n when interleave == true
-  static Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask, bool interleave = false)
-  {
-    const float *a = reinterpret_cast<const float *>(&m);
-    const float *b = reinterpret_cast<const float *>(&n);
-    if (!interleave)
-    {
-      Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
-      return res;
-    }
-    else
-    {
-      Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
-      return res;
-    }
-  }
-
-  static void run(const MatrixType &mat, ResultType &result)
-  {
-    ActualMatrixType matrix(mat);
-
-    Packet4f _L1 = matrix.template packet<MatrixAlignment>(0);
-    Packet4f _L2 = matrix.template packet<MatrixAlignment>(4);
-    Packet4f _L3 = matrix.template packet<MatrixAlignment>(8);
-    Packet4f _L4 = matrix.template packet<MatrixAlignment>(12);
-
-    // Four 2x2 sub-matrices of the input matrix
-    // input = [[A, B],
-    //          [C, D]]
-    Packet4f A, B, C, D;
-
-    if (!StorageOrdersMatch)
-    {
-      A = shuffle(_L1, _L2, 0x50, true);
-      B = shuffle(_L3, _L4, 0x50, true);
-      C = shuffle(_L1, _L2, 0xFA, true);
-      D = shuffle(_L3, _L4, 0xFA, true);
-    }
-    else
-    {
-      A = shuffle(_L1, _L2, 0x44);
-      B = shuffle(_L1, _L2, 0xEE);
-      C = shuffle(_L3, _L4, 0x44);
-      D = shuffle(_L3, _L4, 0xEE);
-    }
-
-    Packet4f AB, DC, temp;
-
-    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
-    AB = shuffle(A, A, 0x0F);
-    AB = pmul(AB, B);
-
-    temp = shuffle(A, A, 0xA5);
-    temp = pmul(temp, shuffle(B, B, 0x4E));
-    AB = psub(AB, temp);
-
-    // DC = D#*C
-    DC = shuffle(D, D, 0x0F);
-    DC = pmul(DC, C);
-    temp = shuffle(D, D, 0xA5);
-    temp = pmul(temp, shuffle(C, C, 0x4E));
-    DC = psub(DC, temp);
-
-    // determinants of the sub-matrices
-    Packet4f dA, dB, dC, dD;
-
-    dA = pmul(shuffle(A, A, 0x5F), A);
-    dA = psub(dA, shuffle(dA, dA, 0xEE));
-
-    dB = pmul(shuffle(B, B, 0x5F), B);
-    dB = psub(dB, shuffle(dB, dB, 0xEE));
-
-    dC = pmul(shuffle(C, C, 0x5F), C);
-    dC = psub(dC, shuffle(dC, dC, 0xEE));
-
-    dD = pmul(shuffle(D, D, 0x5F), D);
-    dD = psub(dD, shuffle(dD, dD, 0xEE));
-
-    Packet4f d, d1, d2;
-    Packet2f sum;
-    temp = shuffle(DC, DC, 0xD8);
-    d = pmul(temp, AB);
-    sum = vpadd_f32(vadd_f32(vget_low_f32(d), vget_high_f32(d)), vadd_f32(vget_low_f32(d), vget_high_f32(d)));
-    d = vdupq_lane_f32(sum, 0);
-    d1 = pmul(dA, dD);
-    d2 = pmul(dB, dC);
-
-    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
-    Packet4f det = psub(padd(d1, d2), d);
-
-    // reciprocal of the determinant of the input matrix, rd = 1/det
-    Packet4f rd = pdiv(vdupq_n_f32(float32_t(1.0)), det);
-
-    // Four sub-matrices of the inverse
-    Packet4f iA, iB, iC, iD;
-
-    // iD = D*|A| - C*A#*B
-    temp = shuffle(C, C, 0xA0);
-    temp = pmul(temp, shuffle(AB, AB, 0x44));
-    iD = shuffle(C, C, 0xF5);
-    iD = pmul(iD, shuffle(AB, AB, 0xEE));
-    iD = padd(iD, temp);
-    iD = psub(vmulq_lane_f32(D, vget_low_f32(dA), 0), iD);
-
-    // iA = A*|D| - B*D#*C
-    temp = shuffle(B, B, 0xA0);
-    temp = pmul(temp, shuffle(DC, DC, 0x44));
-    iA = shuffle(B, B, 0xF5);
-    iA = pmul(iA, shuffle(DC, DC, 0xEE));
-    iA = padd(iA, temp);
-    iA = psub(vmulq_lane_f32(A, vget_low_f32(dD), 0), iA);
-
-    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
-    iB = pmul(D, shuffle(AB, AB, 0x33));
-    iB = psub(iB, pmul(shuffle(D, D, 0xB1), shuffle(AB, AB, 0x66)));
-    iB = psub(vmulq_lane_f32(C, vget_low_f32(dB), 0), iB);
-
-    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
-    iC = pmul(A, shuffle(DC, DC, 0x33));
-    iC = psub(iC, pmul(shuffle(A, A, 0xB1), shuffle(DC, DC, 0x66)));
-    iC = psub(vmulq_lane_f32(B, vget_low_f32(dC), 0), iC);
-
-    const Packet4f coeff = {1.0, -1.0, -1.0, 1.0};
-    rd = pmul(vdupq_lane_f32(vget_low_f32(rd), 0), coeff);
-    iA = pmul(iA, rd);
-    iB = pmul(iB, rd);
-    iC = pmul(iC, rd);
-    iD = pmul(iD, rd);
-
-    Index res_stride = result.outerStride();
-    float *res = result.data();
-
-    pstoret<float, Packet4f, ResultAlignment>(res + 0, shuffle(iA, iB, 0x77));
-    pstoret<float, Packet4f, ResultAlignment>(res + res_stride, shuffle(iA, iB, 0x22));
-    pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, shuffle(iC, iD, 0x77));
-    pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, shuffle(iC, iD, 0x22));
-  }
-};
-
-#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-
-// same algorithm as above, except that each operand is split into
-// halves for two registers to hold.
-template <typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::NEON, double, MatrixType, ResultType>
-{
-  enum
-  {
-    MatrixAlignment = traits<MatrixType>::Alignment,
-    ResultAlignment = traits<ResultType>::Alignment,
-    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
-  };
-  typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
-                               MatrixType const &,
-                               typename MatrixType::PlainObject>::type
-      ActualMatrixType;
-
-  // fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
-  static Packet2d shuffle(const Packet2d &m, const Packet2d &n, int mask)
-  {
-    const double *a = reinterpret_cast<const double *>(&m);
-    const double *b = reinterpret_cast<const double *>(&n);
-    Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
-    return res;
-  }
-
-  static void run(const MatrixType &mat, ResultType &result)
-  {
-    ActualMatrixType matrix(mat);
-
-    // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
-    // row e.g. A1, upper row of A, A2, lower row of A
-    // input = [[A, B],  =  [[[A1, [B1,
-    //          [C, D]]        A2], B2]],
-    //                       [[C1, [D1,
-    //                         C2], D2]]]
-
-    Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
-
-    if (StorageOrdersMatch)
-    {
-      A1 = matrix.template packet<MatrixAlignment>(0);
-      B1 = matrix.template packet<MatrixAlignment>(2);
-      A2 = matrix.template packet<MatrixAlignment>(4);
-      B2 = matrix.template packet<MatrixAlignment>(6);
-      C1 = matrix.template packet<MatrixAlignment>(8);
-      D1 = matrix.template packet<MatrixAlignment>(10);
-      C2 = matrix.template packet<MatrixAlignment>(12);
-      D2 = matrix.template packet<MatrixAlignment>(14);
-    }
-    else
-    {
-      Packet2d temp;
-      A1 = matrix.template packet<MatrixAlignment>(0);
-      C1 = matrix.template packet<MatrixAlignment>(2);
-      A2 = matrix.template packet<MatrixAlignment>(4);
-      C2 = matrix.template packet<MatrixAlignment>(6);
-
-      temp = A1;
-      A1 = shuffle(A1, A2, 0);
-      A2 = shuffle(temp, A2, 3);
-
-      temp = C1;
-      C1 = shuffle(C1, C2, 0);
-      C2 = shuffle(temp, C2, 3);
-
-      B1 = matrix.template packet<MatrixAlignment>(8);
-      D1 = matrix.template packet<MatrixAlignment>(10);
-      B2 = matrix.template packet<MatrixAlignment>(12);
-      D2 = matrix.template packet<MatrixAlignment>(14);
-
-      temp = B1;
-      B1 = shuffle(B1, B2, 0);
-      B2 = shuffle(temp, B2, 3);
-
-      temp = D1;
-      D1 = shuffle(D1, D2, 0);
-      D2 = shuffle(temp, D2, 3);
-    }
-
-    // determinants of the sub-matrices
-    Packet2d dA, dB, dC, dD;
-
-    dA = shuffle(A2, A2, 1);
-    dA = pmul(A1, dA);
-    dA = psub(dA, vdupq_laneq_f64(dA, 1));
-
-    dB = shuffle(B2, B2, 1);
-    dB = pmul(B1, dB);
-    dB = psub(dB, vdupq_laneq_f64(dB, 1));
-
-    dC = shuffle(C2, C2, 1);
-    dC = pmul(C1, dC);
-    dC = psub(dC, vdupq_laneq_f64(dC, 1));
-
-    dD = shuffle(D2, D2, 1);
-    dD = pmul(D1, dD);
-    dD = psub(dD, vdupq_laneq_f64(dD, 1));
-
-    Packet2d DC1, DC2, AB1, AB2;
-
-    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
-    AB1 = pmul(B1, vdupq_laneq_f64(A2, 1));
-    AB2 = pmul(B2, vdupq_laneq_f64(A1, 0));
-    AB1 = psub(AB1, pmul(B2, vdupq_laneq_f64(A1, 1)));
-    AB2 = psub(AB2, pmul(B1, vdupq_laneq_f64(A2, 0)));
-
-    // DC = D#*C
-    DC1 = pmul(C1, vdupq_laneq_f64(D2, 1));
-    DC2 = pmul(C2, vdupq_laneq_f64(D1, 0));
-    DC1 = psub(DC1, pmul(C2, vdupq_laneq_f64(D1, 1)));
-    DC2 = psub(DC2, pmul(C1, vdupq_laneq_f64(D2, 0)));
-
-    Packet2d d1, d2;
-
-    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
-    Packet2d det;
-
-    // reciprocal of the determinant of the input matrix, rd = 1/det
-    Packet2d rd;
-
-    d1 = pmul(AB1, shuffle(DC1, DC2, 0));
-    d2 = pmul(AB2, shuffle(DC1, DC2, 3));
-    rd = padd(d1, d2);
-    rd = padd(rd, vdupq_laneq_f64(rd, 1));
-
-    d1 = pmul(dA, dD);
-    d2 = pmul(dB, dC);
-
-    det = padd(d1, d2);
-    det = psub(det, rd);
-    det = vdupq_laneq_f64(det, 0);
-    rd = pdiv(vdupq_n_f64(float64_t(1.0)), det);
-
-    // rows of four sub-matrices of the inverse
-    Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
-
-    // iD = D*|A| - C*A#*B
-    iD1 = pmul(AB1, vdupq_laneq_f64(C1, 0));
-    iD2 = pmul(AB1, vdupq_laneq_f64(C2, 0));
-    iD1 = padd(iD1, pmul(AB2, vdupq_laneq_f64(C1, 1)));
-    iD2 = padd(iD2, pmul(AB2, vdupq_laneq_f64(C2, 1)));
-    dA = vdupq_laneq_f64(dA, 0);
-    iD1 = psub(pmul(D1, dA), iD1);
-    iD2 = psub(pmul(D2, dA), iD2);
-
-    // iA = A*|D| - B*D#*C
-    iA1 = pmul(DC1, vdupq_laneq_f64(B1, 0));
-    iA2 = pmul(DC1, vdupq_laneq_f64(B2, 0));
-    iA1 = padd(iA1, pmul(DC2, vdupq_laneq_f64(B1, 1)));
-    iA2 = padd(iA2, pmul(DC2, vdupq_laneq_f64(B2, 1)));
-    dD = vdupq_laneq_f64(dD, 0);
-    iA1 = psub(pmul(A1, dD), iA1);
-    iA2 = psub(pmul(A2, dD), iA2);
-
-    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
-    iB1 = pmul(D1, shuffle(AB2, AB1, 1));
-    iB2 = pmul(D2, shuffle(AB2, AB1, 1));
-    iB1 = psub(iB1, pmul(shuffle(D1, D1, 1), shuffle(AB2, AB1, 2)));
-    iB2 = psub(iB2, pmul(shuffle(D2, D2, 1), shuffle(AB2, AB1, 2)));
-    dB = vdupq_laneq_f64(dB, 0);
-    iB1 = psub(pmul(C1, dB), iB1);
-    iB2 = psub(pmul(C2, dB), iB2);
-
-    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
-    iC1 = pmul(A1, shuffle(DC2, DC1, 1));
-    iC2 = pmul(A2, shuffle(DC2, DC1, 1));
-    iC1 = psub(iC1, pmul(shuffle(A1, A1, 1), shuffle(DC2, DC1, 2)));
-    iC2 = psub(iC2, pmul(shuffle(A2, A2, 1), shuffle(DC2, DC1, 2)));
-    dC = vdupq_laneq_f64(dC, 0);
-    iC1 = psub(pmul(B1, dC), iC1);
-    iC2 = psub(pmul(B2, dC), iC2);
-
-    const Packet2d PN = {1.0, -1.0};
-    const Packet2d NP = {-1.0, 1.0};
-    d1 = pmul(PN, rd);
-    d2 = pmul(NP, rd);
-
-    Index res_stride = result.outerStride();
-    double *res = result.data();
-    pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(shuffle(iA2, iA1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(shuffle(iA2, iA1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(shuffle(iB2, iB1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(shuffle(iB2, iB1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(shuffle(iC2, iC1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(shuffle(iC2, iC1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(shuffle(iD2, iD1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(shuffle(iD2, iD1, 0), d2));
-  }
-};
-
-#endif  // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-
-} // namespace internal
-} // namespace Eigen
-#endif
diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h
deleted file mode 100644
index 4dce2ef..0000000
--- a/Eigen/src/LU/arch/Inverse_SSE.h
+++ /dev/null
@@ -1,338 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2001 Intel Corporation
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// The SSE code for the 4x4 float and double matrix inverse in this file
-// comes from the following Intel's library:
-// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
-//
-// Here is the respective copyright and license statement:
-//
-//   Copyright (c) 2001 Intel Corporation.
-//
-// Permition is granted to use, copy, distribute and prepare derivative works
-// of this library for any purpose and without fee, provided, that the above
-// copyright notice and this statement appear in all copies.
-// Intel makes no representations about the suitability of this software for
-// any purpose, and specifically disclaims all warranties.
-// See LEGAL.TXT for all the legal information.
-
-#ifndef EIGEN_INVERSE_SSE_H
-#define EIGEN_INVERSE_SSE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
-{
-  enum {
-    MatrixAlignment     = traits<MatrixType>::Alignment,
-    ResultAlignment     = traits<ResultType>::Alignment,
-    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
-  };
-  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
-  
-  static void run(const MatrixType& mat, ResultType& result)
-  {
-    ActualMatrixType matrix(mat);
-    const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000));
-
-    // Load the full matrix into registers
-    __m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
-    __m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
-    __m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
-    __m128 _L4 = matrix.template packet<MatrixAlignment>(12);
-
-    // The inverse is calculated using "Divide and Conquer" technique. The
-    // original matrix is divide into four 2x2 sub-matrices. Since each
-    // register holds four matrix element, the smaller matrices are
-    // represented as a registers. Hence we get a better locality of the
-    // calculations.
-
-    __m128 A, B, C, D; // the four sub-matrices
-    if(!StorageOrdersMatch)
-    {
-      A = _mm_unpacklo_ps(_L1, _L2);
-      B = _mm_unpacklo_ps(_L3, _L4);
-      C = _mm_unpackhi_ps(_L1, _L2);
-      D = _mm_unpackhi_ps(_L3, _L4);
-    }
-    else
-    {
-      A = _mm_movelh_ps(_L1, _L2);
-      B = _mm_movehl_ps(_L2, _L1);
-      C = _mm_movelh_ps(_L3, _L4);
-      D = _mm_movehl_ps(_L4, _L3);
-    }
-
-    __m128 iA, iB, iC, iD,                 // partial inverse of the sub-matrices
-            DC, AB;
-    __m128 dA, dB, dC, dD;                 // determinant of the sub-matrices
-    __m128 det, d, d1, d2;
-    __m128 rd;                             // reciprocal of the determinant
-
-    //  AB = A# * B
-    AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
-    AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
-    //  DC = D# * C
-    DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
-    DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));
-
-    //  dA = |A|
-    dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
-    dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
-    //  dB = |B|
-    dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
-    dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));
-
-    //  dC = |C|
-    dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
-    dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
-    //  dD = |D|
-    dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
-    dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));
-
-    //  d = trace(AB*DC) = trace(A#*B*D#*C)
-    d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);
-
-    //  iD = C*A#*B
-    iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
-    iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
-    //  iA = B*D#*C
-    iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
-    iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));
-
-    //  d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
-    d  = _mm_add_ps(d, _mm_movehl_ps(d, d));
-    d  = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
-    d1 = _mm_mul_ss(dA,dD);
-    d2 = _mm_mul_ss(dB,dC);
-
-    //  iD = D*|A| - C*A#*B
-    iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);
-
-    //  iA = A*|D| - B*D#*C;
-    iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);
-
-    //  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
-    det = _mm_sub_ss(_mm_add_ss(d1,d2),d);
-    rd  = _mm_div_ss(_mm_set_ss(1.0f), det);
-
-//     #ifdef ZERO_SINGULAR
-//         rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);
-//     #endif
-
-    //  iB = D * (A#B)# = D*B#*A
-    iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
-    iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
-    //  iC = A * (D#C)# = A*C#*D
-    iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
-    iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
-
-    rd = _mm_shuffle_ps(rd,rd,0);
-    rd = _mm_xor_ps(rd, p4f_sign_PNNP);
-
-    //  iB = C*|B| - D*B#*A
-    iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
-
-    //  iC = B*|C| - A*C#*D;
-    iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);
-
-    //  iX = iX / det
-    iA = _mm_mul_ps(rd,iA);
-    iB = _mm_mul_ps(rd,iB);
-    iC = _mm_mul_ps(rd,iC);
-    iD = _mm_mul_ps(rd,iD);
-
-    Index res_stride = result.outerStride();
-    float* res = result.data();
-    pstoret<float, Packet4f, ResultAlignment>(res+0,            _mm_shuffle_ps(iA,iB,0x77));
-    pstoret<float, Packet4f, ResultAlignment>(res+res_stride,   _mm_shuffle_ps(iA,iB,0x22));
-    pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77));
-    pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22));
-  }
-
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
-{
-  enum {
-    MatrixAlignment     = traits<MatrixType>::Alignment,
-    ResultAlignment     = traits<ResultType>::Alignment,
-    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
-  };
-  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
-  
-  static void run(const MatrixType& mat, ResultType& result)
-  {
-    ActualMatrixType matrix(mat);
-    const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
-    const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-
-    // The inverse is calculated using "Divide and Conquer" technique. The
-    // original matrix is divide into four 2x2 sub-matrices. Since each
-    // register of the matrix holds two elements, the smaller matrices are
-    // consisted of two registers. Hence we get a better locality of the
-    // calculations.
-
-    // the four sub-matrices
-    __m128d A1, A2, B1, B2, C1, C2, D1, D2;
-    
-    if(StorageOrdersMatch)
-    {
-      A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
-      A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
-      C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
-      C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
-    }
-    else
-    {
-      __m128d tmp;
-      A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
-      A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
-      tmp = A1;
-      A1 = _mm_unpacklo_pd(A1,A2);
-      A2 = _mm_unpackhi_pd(tmp,A2);
-      tmp = C1;
-      C1 = _mm_unpacklo_pd(C1,C2);
-      C2 = _mm_unpackhi_pd(tmp,C2);
-      
-      B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
-      B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
-      tmp = B1;
-      B1 = _mm_unpacklo_pd(B1,B2);
-      B2 = _mm_unpackhi_pd(tmp,B2);
-      tmp = D1;
-      D1 = _mm_unpacklo_pd(D1,D2);
-      D2 = _mm_unpackhi_pd(tmp,D2);
-    }
-    
-    __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2,     // partial invese of the sub-matrices
-            DC1, DC2, AB1, AB2;
-    __m128d dA, dB, dC, dD;     // determinant of the sub-matrices
-    __m128d det, d1, d2, rd;
-
-    //  dA = |A|
-    dA = _mm_shuffle_pd(A2, A2, 1);
-    dA = _mm_mul_pd(A1, dA);
-    dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3));
-    //  dB = |B|
-    dB = _mm_shuffle_pd(B2, B2, 1);
-    dB = _mm_mul_pd(B1, dB);
-    dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3));
-
-    //  AB = A# * B
-    AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3));
-    AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0));
-    AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3)));
-    AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0)));
-
-    //  dC = |C|
-    dC = _mm_shuffle_pd(C2, C2, 1);
-    dC = _mm_mul_pd(C1, dC);
-    dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3));
-    //  dD = |D|
-    dD = _mm_shuffle_pd(D2, D2, 1);
-    dD = _mm_mul_pd(D1, dD);
-    dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3));
-
-    //  DC = D# * C
-    DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3));
-    DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0));
-    DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3)));
-    DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0)));
-
-    //  rd = trace(AB*DC) = trace(A#*B*D#*C)
-    d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0));
-    d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3));
-    rd = _mm_add_pd(d1, d2);
-    rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3));
-
-    //  iD = C*A#*B
-    iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0));
-    iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0));
-    iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3)));
-    iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3)));
-
-    //  iA = B*D#*C
-    iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0));
-    iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0));
-    iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
-    iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));
-
-    //  iD = D*|A| - C*A#*B
-    dA = _mm_shuffle_pd(dA,dA,0);
-    iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
-    iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);
-
-    //  iA = A*|D| - B*D#*C;
-    dD = _mm_shuffle_pd(dD,dD,0);
-    iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
-    iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);
-
-    d1 = _mm_mul_sd(dA, dD);
-    d2 = _mm_mul_sd(dB, dC);
-
-    //  iB = D * (A#B)# = D*B#*A
-    iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
-    iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
-    iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
-    iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));
-
-    //  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
-    det = _mm_add_sd(d1, d2);
-    det = _mm_sub_sd(det, rd);
-
-    //  iC = A * (D#C)# = A*C#*D
-    iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
-    iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
-    iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
-    iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));
-
-    rd = _mm_div_sd(_mm_set_sd(1.0), det);
-//     #ifdef ZERO_SINGULAR
-//         rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd);
-//     #endif
-    rd = _mm_shuffle_pd(rd,rd,0);
-
-    //  iB = C*|B| - D*B#*A
-    dB = _mm_shuffle_pd(dB,dB,0);
-    iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
-    iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);
-
-    d1 = _mm_xor_pd(rd, _Sign_PN);
-    d2 = _mm_xor_pd(rd, _Sign_NP);
-
-    //  iC = B*|C| - A*C#*D;
-    dC = _mm_shuffle_pd(dC,dC,0);
-    iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
-    iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
-
-    Index res_stride = result.outerStride();
-    double* res = result.data();
-    pstoret<double, Packet2d, ResultAlignment>(res+0,             _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+res_stride,    _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2,             _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2,  _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_INVERSE_SSE_H
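
The kernel removed above is the classic 4x4 inversion by 2x2 blocks that its
comments describe: the matrix is split into sub-blocks A, B, C, D (two __m128d
registers per block), the block determinants and adjugate products are formed,
and everything is scaled by a single reciprocal of det(M). As a plain scalar
sketch of the same divide-and-conquer idea, here is the textbook
Schur-complement variant; the Block2 type and function names are illustrative
only, not Eigen API (C++11).

// 2x2 block stored row-major: m[0] m[1] / m[2] m[3].
struct Block2 { double m[4]; };

static Block2 mul(const Block2& a, const Block2& b) {
  return Block2{ { a.m[0]*b.m[0] + a.m[1]*b.m[2], a.m[0]*b.m[1] + a.m[1]*b.m[3],
                   a.m[2]*b.m[0] + a.m[3]*b.m[2], a.m[2]*b.m[1] + a.m[3]*b.m[3] } };
}
static Block2 sub(const Block2& a, const Block2& b) {
  return Block2{ { a.m[0]-b.m[0], a.m[1]-b.m[1], a.m[2]-b.m[2], a.m[3]-b.m[3] } };
}
static Block2 neg(const Block2& a) {
  return Block2{ { -a.m[0], -a.m[1], -a.m[2], -a.m[3] } };
}
static Block2 inv2(const Block2& a) {                  // inverse of a 2x2 block
  const double r = 1.0 / (a.m[0]*a.m[3] - a.m[1]*a.m[2]);
  return Block2{ {  a.m[3]*r, -a.m[1]*r,
                   -a.m[2]*r,  a.m[0]*r } };
}

// Partitioned inverse of M = [A B; C D] with Schur complement S = D - C*inv(A)*B:
//   inv(M) = [ inv(A) + inv(A)*B*inv(S)*C*inv(A)   -inv(A)*B*inv(S)
//              -inv(S)*C*inv(A)                      inv(S)          ]
// Assumes A and S are invertible; the adjugate form used by the SIMD code above
// avoids dividing by sub-block determinants and divides by det(M) only once.
void inverse4x4_by_blocks(const Block2& A, const Block2& B,
                          const Block2& C, const Block2& D,
                          Block2& iA, Block2& iB, Block2& iC, Block2& iD) {
  const Block2 Ai  = inv2(A);
  const Block2 AiB = mul(Ai, B);
  const Block2 CAi = mul(C, Ai);
  const Block2 Si  = inv2(sub(D, mul(C, AiB)));        // inv(S)
  iB = neg(mul(AiB, Si));                              // top-right block
  iC = neg(mul(Si, CAi));                              // bottom-left block
  iA = sub(Ai, mul(AiB, iC));                          // = Ai + AiB*inv(S)*CAi
  iD = Si;                                             // bottom-right block
}

Keeping each block in a pair of registers is what the removed comment means by
better locality: every block operation above becomes a handful of shuffles,
multiplies and subtractions on data that is already in registers.
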
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 50ae9ea..0f9ef23 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -14,7 +14,7 @@
 check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
 
 option(EIGEN_INTERNAL_DOCUMENTATION "Build internal documentation" OFF)
-
+option(EIGEN_DOC_USE_MATHJAX "Use MathJax for rendering math in HTML docs" ON)
 
 # Set some Doxygen flags
 set(EIGEN_DOXY_PROJECT_NAME             "Eigen")
@@ -22,12 +22,19 @@
 set(EIGEN_DOXY_INPUT                    "\"${Eigen_SOURCE_DIR}/Eigen\" \"${Eigen_SOURCE_DIR}/doc\"")
 set(EIGEN_DOXY_HTML_COLORSTYLE_HUE      "220")
 set(EIGEN_DOXY_TAGFILES                 "")
+
 if(EIGEN_INTERNAL_DOCUMENTATION)
   set(EIGEN_DOXY_INTERNAL                 "YES")
 else()
   set(EIGEN_DOXY_INTERNAL                 "NO")
 endif()
 
+if (EIGEN_DOC_USE_MATHJAX)
+  set(EIGEN_DOXY_USE_MATHJAX "YES")
+else ()
+  set(EIGEN_DOXY_USE_MATHJAX "NO")
+endif()
+
 configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
   ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 72120f1..ddfd7e8 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -1246,7 +1246,7 @@
 # output. When enabled you may also need to install MathJax separately and
 # configure the path to it using the MATHJAX_RELPATH option.
 
-USE_MATHJAX            = NO
+USE_MATHJAX            = @EIGEN_DOXY_USE_MATHJAX@
 
 # When MathJax is enabled you need to specify the location relative to the
 # HTML output directory using the MATHJAX_RELPATH option. The destination
@@ -1258,12 +1258,12 @@
 # However, it is strongly recommended to install a local
 # copy of MathJax from http://www.mathjax.org before deployment.
 
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+MATHJAX_RELPATH        = https://cdn.mathjax.org/mathjax/latest
 
 # The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension
 # names that should be enabled during MathJax rendering.
 
-MATHJAX_EXTENSIONS     =
+MATHJAX_EXTENSIONS     = TeX/AMSmath TeX/AMSsymbols
 
 # When the SEARCHENGINE tag is enabled doxygen will generate a search box
 # for the HTML output. The underlying search engine uses javascript
diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp
index f9044a2..4ca607c 100644
--- a/test/basicstuff.cpp
+++ b/test/basicstuff.cpp
@@ -195,11 +195,8 @@
   VERIFY(!static_cast<const MatrixType&>(cm).imag().isZero());
 }
 
-template<typename SrcScalar, typename TgtScalar, bool SrcIsHalfOrBF16 = (internal::is_same<SrcScalar, half>::value || internal::is_same<SrcScalar, bfloat16>::value)> struct casting_test;
-
-
 template<typename SrcScalar, typename TgtScalar>
-struct casting_test<SrcScalar, TgtScalar, false> {
+struct casting_test {
   static void run() {
     Matrix<SrcScalar,4,4> m;
     for (int i=0; i<m.rows(); ++i) {
@@ -210,33 +207,7 @@
     Matrix<TgtScalar,4,4> n = m.template cast<TgtScalar>();
     for (int i=0; i<m.rows(); ++i) {
       for (int j=0; j<m.cols(); ++j) {
-        VERIFY_IS_APPROX(n(i, j), static_cast<TgtScalar>(m(i, j)));
-      }
-    }
-  }
-};
-
-template<typename SrcScalar, typename TgtScalar>
-struct casting_test<SrcScalar, TgtScalar, true> {
-  static void run() {
-    casting_test<SrcScalar, TgtScalar, false>::run();
-  }
-};
-
-template<typename SrcScalar, typename RealScalar>
-struct casting_test<SrcScalar, std::complex<RealScalar>, true> {
-  static void run() {
-    typedef std::complex<RealScalar> TgtScalar;
-    Matrix<SrcScalar,4,4> m;
-    for (int i=0; i<m.rows(); ++i) {
-      for (int j=0; j<m.cols(); ++j) {
-        m(i, j) = internal::random_without_cast_overflow<SrcScalar, TgtScalar>::value();
-      }
-    }
-    Matrix<TgtScalar,4,4> n = m.template cast<TgtScalar>();
-    for (int i=0; i<m.rows(); ++i) {
-      for (int j=0; j<m.cols(); ++j) {
-        VERIFY_IS_APPROX(n(i, j), static_cast<TgtScalar>(static_cast<RealScalar>(m(i, j))));
+        VERIFY_IS_APPROX(n(i, j), (internal::cast<SrcScalar,TgtScalar>(m(i, j))));
       }
     }
   }
diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp
index 79c868e..09df2b2 100644
--- a/test/bfloat16_float.cpp
+++ b/test/bfloat16_float.cpp
@@ -44,14 +44,14 @@
 template<typename T>
  void test_roundtrip() {
   // Representable T round trip via bfloat16
-  VERIFY_IS_EQUAL(static_cast<T>(static_cast<bfloat16>(-std::numeric_limits<T>::infinity())), -std::numeric_limits<T>::infinity());
-  VERIFY_IS_EQUAL(static_cast<T>(static_cast<bfloat16>(std::numeric_limits<T>::infinity())), std::numeric_limits<T>::infinity());
-  VERIFY_IS_EQUAL(static_cast<T>(static_cast<bfloat16>(T(-1.0))), T(-1.0));
-  VERIFY_IS_EQUAL(static_cast<T>(static_cast<bfloat16>(T(-0.5))), T(-0.5));
-  VERIFY_IS_EQUAL(static_cast<T>(static_cast<bfloat16>(T(-0.0))), T(-0.0));
-  VERIFY_IS_EQUAL(static_cast<T>(static_cast<bfloat16>(T(1.0))), T(1.0));
-  VERIFY_IS_EQUAL(static_cast<T>(static_cast<bfloat16>(T(0.5))), T(0.5));
-  VERIFY_IS_EQUAL(static_cast<T>(static_cast<bfloat16>(T(0.0))), T(0.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(-std::numeric_limits<T>::infinity()))), -std::numeric_limits<T>::infinity());
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(std::numeric_limits<T>::infinity()))), std::numeric_limits<T>::infinity());
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-1.0)))), T(-1.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-0.5)))), T(-0.5));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-0.0)))), T(-0.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(1.0)))), T(1.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(0.5)))), T(0.5));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(0.0)))), T(0.0));
 }
 
 void test_conversion()
diff --git a/test/blasutil.cpp b/test/blasutil.cpp
index 0194291..845a498 100644
--- a/test/blasutil.cpp
+++ b/test/blasutil.cpp
@@ -9,9 +9,13 @@
 
 #include "main.h"
 
-#include <Eigen/Core>
-
-using namespace Eigen;
+// Disable "ignoring attributes on template argument"
+// for packet_traits<Packet*>
+// => The only workaround would be to wrap __m128 and the like
+//    within wrappers.
+#if EIGEN_GNUC_AT_LEAST(6,0)
+    #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
 
 #define GET(i,j) (StorageOrder == RowMajor ? (i)*stride + (j) : (i) + (j)*stride)
 #define SCATTER(i,j,k) (StorageOrder == RowMajor ? ((i)+(k))*stride + (j) : (i) + ((j)+(k))*stride)
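
The pragma above suppresses GCC's -Wignored-attributes, which fires when an
attributed vector type such as __m128 is used as a template argument (as in
packet_traits<Packet*>). A minimal sketch of the wrapper workaround that the
comment mentions, with hypothetical names that are not part of Eigen:

#ifdef __SSE__
#include <xmmintrin.h>

// GCC defines __m128 with extra attributes (e.g. __may_alias__) that are
// dropped when the type is used as a template argument; that is what
// -Wignored-attributes reports on GCC >= 6.  A plain struct holding the vector
// carries no such attributes, so templates parameterized on the wrapper
// compile silently.
struct WrappedPacket4f { __m128 v; };                  // hypothetical wrapper type

template <typename Packet> struct packet_size;        // hypothetical trait
template <> struct packet_size<WrappedPacket4f> { static const int value = 4; };

static_assert(packet_size<WrappedPacket4f>::value == 4, "four floats per packet");
#endif

The test keeps the pragma instead, presumably because such wrappers would have
to be threaded through the whole packet API.
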
diff --git a/test/meta.cpp b/test/meta.cpp
index b432d93..7a8b93c 100644
--- a/test/meta.cpp
+++ b/test/meta.cpp
@@ -34,7 +34,7 @@
   VERIFY((!internal::is_same<float,double>::value));
   VERIFY((!internal::is_same<float,float&>::value));
   VERIFY((!internal::is_same<float,const float&>::value));
-  
+
   VERIFY(( internal::is_same<float,internal::remove_all<const float&>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_all<const float*>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_all<const float*&>::type >::value));
@@ -63,13 +63,13 @@
 
   VERIFY(( internal::is_same< internal::add_const_on_value_type<const float* const>::type, const float* const>::value));
   VERIFY(( internal::is_same< internal::add_const_on_value_type<float* const>::type, const float* const>::value));
-  
+
   VERIFY(( internal::is_same<float,internal::remove_reference<float&>::type >::value));
   VERIFY(( internal::is_same<const float,internal::remove_reference<const float&>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_pointer<float*>::type >::value));
   VERIFY(( internal::is_same<const float,internal::remove_pointer<const float*>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_pointer<float* const >::type >::value));
-  
+
 
   // is_convertible
   STATIC_CHECK(( internal::is_convertible<float,double>::value ));
@@ -96,7 +96,7 @@
   STATIC_CHECK((!internal::is_convertible<Array33f,int>::value ));
   STATIC_CHECK((!internal::is_convertible<MatrixXf,float>::value ));
   {
-    float f;
+    float f = 0.0f;
     MatrixXf A, B;
     VectorXf a, b;
     VERIFY(( check_is_convertible(a.dot(b), f) ));
@@ -126,7 +126,7 @@
   #endif
 
   {
-    int i;
+    int i = 0;
     VERIFY(( check_is_convertible(fix<3>(), i) ));
     VERIFY((!check_is_convertible(i, fix<DynamicIndex>()) ));
   }
@@ -136,7 +136,7 @@
   VERIFY((  internal::has_ReturnType<ScalarBinaryOpTraits<int,int> >::value ));
   VERIFY(( !internal::has_ReturnType<MatrixXf>::value ));
   VERIFY(( !internal::has_ReturnType<int>::value ));
-  
+
   VERIFY(internal::meta_sqrt<1>::ret == 1);
   #define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))
   VERIFY_META_SQRT(2);
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index eabf69c..81425b8 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -677,8 +677,8 @@
       test::packet_helper<PacketTraits::HasCos, Packet> h;
       for (Scalar k = Scalar(1); k < Scalar(10000) / std::numeric_limits<Scalar>::epsilon(); k *= Scalar(2)) {
         for (int k1 = 0; k1 <= 1; ++k1) {
-          data1[0] = Scalar((2 * k + k1) * EIGEN_PI / 2 * internal::random<double>(0.8, 1.2));
-          data1[1] = Scalar((2 * k + 2 + k1) * EIGEN_PI / 2 * internal::random<double>(0.8, 1.2));
+          data1[0] = Scalar((2 * double(k) + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
+          data1[1] = Scalar((2 * double(k) + 2 + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
           h.store(data2, internal::pcos(h.load(data1)));
           h.store(data2 + PacketSize, internal::psin(h.load(data1)));
           VERIFY(data2[0] <= Scalar(1.) && data2[0] >= Scalar(-1.));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 17cee49..200f58b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -388,6 +388,7 @@
       resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
+
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 3a70d85..679996c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -966,6 +966,7 @@
 template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
 class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
  public:
+    typedef TensorBase<Derived, ReadOnlyAccessors> Base;
     typedef internal::traits<Derived> DerivedTraits;
     typedef typename DerivedTraits::Scalar Scalar;
     typedef typename DerivedTraits::Index Index;
@@ -1146,6 +1147,18 @@
     }
 
  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase)
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase)
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Derived, const OtherDerived> Assign;
+      Assign assign(derived(), other.derived());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return derived();
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
     EIGEN_DEVICE_FUNC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 5b28e70..7c6bbd1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -80,44 +80,28 @@
 class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+    typedef TensorBase<TensorChippingOp<DimId, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
-      : m_xpr(expr), m_offset(offset), m_dim(dim) {
-  }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
+        : m_xpr(expr), m_offset(offset), m_dim(dim) {
+    }
 
-  EIGEN_DEVICE_FUNC
-  const Index offset() const { return m_offset; }
-  EIGEN_DEVICE_FUNC
-  const Index dim() const { return m_dim.actualDim(); }
+    EIGEN_DEVICE_FUNC
+    const Index offset() const { return m_offset; }
+    EIGEN_DEVICE_FUNC
+    const Index dim() const { return m_dim.actualDim(); }
 
-  EIGEN_DEVICE_FUNC
-  const typename internal::remove_all<typename XprType::Nested>::type&
-  expression() const { return m_xpr; }
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other)
-  {
-    typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-    return *this;
-  }
-
-  template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other)
-  {
-    typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-    return *this;
-  }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp)
 
   protected:
     typename XprType::Nested m_xpr;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 5968ff4..0dfe216 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -60,6 +60,7 @@
 class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
 {
   public:
+    typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base;
     typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
     typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
     typedef typename internal::traits<TensorConcatenationOp>::Index Index;
@@ -81,25 +82,7 @@
 
     EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other)
-    {
-      typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp)
   protected:
     typename LhsXprType::Nested m_lhs_xpr;
     typename RhsXprType::Nested m_rhs_xpr;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 804a16c..96fa46c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -28,6 +28,8 @@
   public:
     TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
 
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice)
+
     template<typename OtherDerived>
     EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
       typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index a5be54b..ca39bb8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -340,27 +340,10 @@
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other)
-    {
-      // FIXME: check that the dimensions of other match the dimensions of *this.
-      // Unfortunately this isn't possible yet when the rhs is an expression.
-      typedef TensorAssignOp<Self, const TensorFixedSize> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other)
-    {
-      // FIXME: check that the dimensions of other match the dimensions of *this.
-      // Unfortunately this isn't possible yet when the rhs is an expression.
-      typedef TensorAssignOp<Self, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    // FIXME: check that the dimensions of other match the dimensions of *this.
+    // Unfortunately this isn't possible yet when the rhs is an expression.
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize)
+
 
   protected:
     EIGEN_DEVICE_FUNC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 05fa80e..f159db1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -69,39 +69,22 @@
 class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
 {
   public:
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
+    typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
-      : m_xpr(expr) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
+        : m_xpr(expr) {}
 
     EIGEN_DEVICE_FUNC
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other)
-    {
-      typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp)
   protected:
     typename XprType::Nested m_xpr;
 };
@@ -211,7 +194,7 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }
-  
+
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index 7a113f7..76d15f1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -83,4 +83,25 @@
   #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
 #endif
 
+#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+  #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    template <typename OtherDerived> \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; }
+#else
+  #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
+#endif
+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ * This also inherits template<OtherDerived> operator=(const OtherDerived&) assignments.
+ * With C++11 or later this also default-implements the copy constructor.
+ */
+#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived)  \
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
+
 #endif
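
An operator= declared in a derived class, including the implicitly declared
copy assignment, hides every operator= of its base, so the generic templated
assignment that TensorBase now defines has to be re-exposed in each expression
class; the macros above bundle the declarations needed for that. A standalone
sketch of the underlying C++ behavior, with hypothetical names unrelated to
Eigen (C++11):

#include <iostream>

struct Expr { int value; };          // stand-in for an arbitrary RHS expression

struct Base {
  int stored = 0;
  template <typename Other>
  Base& operator=(const Other& rhs) {             // generic "evaluate and assign"
    stored = rhs.value;
    return *this;
  }
};

struct Derived : Base {
  // Bring the base's templated operator= back into scope; without this line
  // the implicitly declared Derived::operator=(const Derived&) hides it and
  // the assignment from Expr below no longer compiles.
  using Base::operator=;

  // An explicit copy assignment can still coexist with the inherited template.
  Derived& operator=(const Derived& other) {
    stored = other.stored;
    return *this;
  }
};

int main() {
  Derived d;
  d = Expr{42};                                   // resolves to the inherited template
  std::cout << d.stored << "\n";                  // prints 42
  return 0;
}

Remove the using-declaration and the assignment from Expr fails to compile,
because only the copy assignment remains visible in Derived.
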
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 172a6ba..6834c97 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -30,7 +30,7 @@
 {
   public:
     typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
-    typedef typename PlainObjectType::Base Base;
+    typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base;
   #ifdef EIGEN_USE_SYCL
     typedef  typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
   #else
@@ -40,7 +40,7 @@
     typedef typename internal::traits<PlainObjectType>::Index Index;
     typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType;
 
     typedef typename MakePointer_<Scalar>::Type PointerType;
     typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
@@ -315,23 +315,7 @@
     }
 #endif
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other)
-    {
-      typedef TensorAssignOp<Self, const Self> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Self& operator=(const OtherDerived& other)
-    {
-      typedef TensorAssignOp<Self, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap)
 
   private:
     StoragePointerType m_data;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index a3a750f..a6181d3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -223,16 +223,6 @@
   Tuple(const U& f, const V& s) : first(f), second(s) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Tuple& operator= (const Tuple& rhs) {
-  #ifndef SYCL_DEVICE_ONLY
-    if (&rhs == this) return *this;
-  #endif
-    first = rhs.first;
-    second = rhs.second;
-    return *this;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void swap(Tuple& rhs) {
     using numext::swap;
     swap(first, rhs.first);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index b4bcb54..ceecd54 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -54,6 +54,7 @@
 class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
 {
   public:
+  typedef TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> Base;
   typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
@@ -70,24 +71,7 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other)
-    {
-      typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -357,6 +341,7 @@
 class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
 {
   public:
+  typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base;
   typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
@@ -375,25 +360,7 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other)
-    {
-      typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -873,6 +840,7 @@
 class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
 {
   public:
+  typedef TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > Base;
   typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
@@ -896,26 +864,7 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
-    {
-      typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp)
 
   protected:
     typename XprType::Nested m_xpr;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 2fc85c1..3b1fca5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -54,15 +54,16 @@
                                           XprType>, WriteAccessors>
 {
   public:
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
-                                                                    StorageKind;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
+    typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors> Base;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
+                                                                      StorageKind;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
       const XprType& expr, const ReverseDimensions& reverse_dims)
       : m_xpr(expr), m_reverse_dims(reverse_dims) { }
 
@@ -73,24 +74,8 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other)
-    {
-      typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp)
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
 
   protected:
     typename XprType::Nested m_xpr;
@@ -453,7 +438,7 @@
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
   typedef internal::TensorBlockNotImplemented TensorBlock;
   //===--------------------------------------------------------------------===//
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 597ca64..e6fed3d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -54,14 +54,15 @@
 class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
+    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
       : m_xpr(expr), m_shuffle(shfl) {}
 
     EIGEN_DEVICE_FUNC
@@ -71,24 +72,8 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other)
-    {
-      typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
 
   protected:
     typename XprType::Nested m_xpr;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index d05f375..64bf3f1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -54,14 +54,15 @@
 class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
+    typedef TensorBase<TensorStridingOp<Strides, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
       : m_xpr(expr), m_dims(dims) {}
 
     EIGEN_DEVICE_FUNC
@@ -71,24 +72,7 @@
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other)
-    {
-      typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp)
 
   protected:
     typename XprType::Nested m_xpr;
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 4cb962a..cea4bbe 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -81,8 +81,8 @@
   ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
 endif()
 
-option(EIGEN_TEST_NO_OPENGL "Disable OpenGL support in unit tests" OFF)
-if(NOT EIGEN_TEST_NO_OPENGL)
+option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
+if(EIGEN_TEST_OPENGL)
   find_package(OpenGL)
   find_package(GLUT)
   find_package(GLEW)