Update Eigen to commit: b0f877f8e01e90a5b0f3a79d46ea234899f8b499

CHANGELOG
=========
b0f877f8e - Don't crash on empty tensor contraction.
15fbddaf9 - ASAN fixes for AVX512 GEMM/TRSM
178ef8c97 - qualify non-const symbolic indexed view with is_lvalue
df1049ddf - Small packet math cleanup.
9b48d1021 - Guard all malloc, realloc and free() functions with check_that_malloc_is_allowed()
c730290fa - Use the correct truncating intrinsic for double->int casting.
766db0202 - disable raw array indexed view access for 1d arrays
bfbc66e07 - refactor indexedviewmethods, enable non-const ref access with symbolic indices
1a5dfd7c0 - Fix incorrect casting in AVX512DQ path.
a08649994 - Optimize generic_rsqrt_newton_step
b8b8a2614 - Add more missing vectorized casts for int on x86, and remove redundant unit tests

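Regarding a08649994 (Optimize generic_rsqrt_newton_step): the refinement in
Eigen/src/Core/MathFunctionsImpl.h now iterates x' = x - (x/2) * ((a*x)*x - 1)
and falls back to the initial estimate whenever the result is NaN (NaN input,
or 0 * inf produced during the update). Below is a minimal scalar sketch of
that logic for reference only; refine_rsqrt is a hypothetical helper written
in plain C++, not the Eigen packet API used in the patch.

  #include <cmath>

  // Scalar analogue of generic_rsqrt_newton_step::run:
  // each step applies x' = x - (x/2) * ((a*x)*x - 1); a NaN result
  // mirrors the pselect(pisnan(x), approx_rsqrt, x) guard.
  float refine_rsqrt(float a, float approx_rsqrt, int steps) {
    float x = approx_rsqrt;
    for (int step = 0; step < steps; ++step) {
      x = x - 0.5f * x * ((a * x) * x - 1.0f);
    }
    return std::isnan(x) ? approx_rsqrt : x;
  }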
PiperOrigin-RevId: 522181536
Change-Id: I0de49977abd65369d1646642599b61237dae9805
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
index f967301..feab3a9 100644
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -93,7 +93,6 @@
   *  - std::vector<int>
   *  - std::valarray<int>
   *  - std::array<int>
-  *  - Plain C arrays: int[N]
   *  - Eigen::ArrayXi
   *  - decltype(ArrayXi::LinSpaced(...))
   *  - Any view/expressions of the previous types
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 642e5d6..e5ae03d 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -77,26 +77,29 @@
 template <typename Packet, int Steps>
 struct generic_rsqrt_newton_step {
   static_assert(Steps > 0, "Steps must be at least 1.");
-
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE  Packet
+  using Scalar = typename unpacket_traits<Packet>::type;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
   run(const Packet& a, const Packet& approx_rsqrt) {
-    using Scalar = typename unpacket_traits<Packet>::type;
-    const Packet one_point_five = pset1<Packet>(Scalar(1.5));
-    const Packet minus_half = pset1<Packet>(Scalar(-0.5));
+    const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
+    const Packet cst_minus_half = pset1<Packet>(Scalar(-1)/Scalar(2));
     
     // Refine the approximation using one Newton-Raphson step:
-    //   x_{n+1} = x_n * (1.5 + (-0.5 * x_n) * (a * x_n)).
     // The approximation is expressed this way to avoid over/under-flows.  
-    Packet x_newton  = pmul(approx_rsqrt, pmadd(pmul(minus_half, approx_rsqrt), pmul(a, approx_rsqrt), one_point_five));
-    for (int step = 1; step < Steps; ++step) {
-      x_newton  = pmul(x_newton, pmadd(pmul(minus_half, x_newton), pmul(a, x_newton), one_point_five));
+    // x' = x - (x/2) * ( (a*x)*x - 1)
+
+    Packet x = approx_rsqrt;
+    for (int step = 0; step < Steps; ++step) {
+      Packet minushalfx = pmul(cst_minus_half, x);
+      Packet ax = pmul(a, x);
+      Packet ax2m1 = pmadd(ax, x, cst_minus_one);
+      x = pmadd(ax2m1, minushalfx, x);
     }
-    
-    // If approx_rsqrt is 0 or +/-inf, we should return it as is.  Note:
-    // on intel, approx_rsqrt can be inf for small denormal values.
-    const Packet return_approx = por(pcmp_eq(approx_rsqrt, pzero(a)),
-                                     pcmp_eq(pabs(approx_rsqrt), pset1<Packet>(NumTraits<Scalar>::infinity())));
-    return pselect(return_approx, approx_rsqrt, x_newton);
+
+    // If x is NaN, then either:
+    // 1) the input is NaN
+    // 2) zero and infinity were multiplied
+    // In either of these cases, return approx_rsqrt
+    return pselect(pisnan(x), approx_rsqrt, x);
   }
 };
 
@@ -108,7 +111,6 @@
   }
 };
 
-
 /** \internal Fast sqrt using Newton-Raphson's method.
 
  Preconditions:
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 3abb5bd..cd90496 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -33,7 +33,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 1,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -201,7 +200,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 2,
-    HasHalfPacket = 1,
 
     HasAdd    = 1,
     HasSub    = 1,
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index c5e1cc0..af4742b 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -66,7 +66,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 1,
 
     HasCmp  = 1,
     HasDiv = 1,
@@ -102,7 +101,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
-    HasHalfPacket = 1,
 
     HasCmp  = 1,
     HasDiv  = 1,
@@ -128,7 +126,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 0,
 
     HasCmp    = 1,
     HasAdd    = 1,
@@ -172,7 +169,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 0,
 
     HasCmp = 1,
     HasAdd = 1,
@@ -873,6 +869,9 @@
 
 template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
 { return _mm256_blendv_ps(b,a,mask); }
+template<> EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b)
+{ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))); }
+
 template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
 { return _mm256_blendv_pd(b,a,mask); }
 
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index db19b56..386543e 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -64,7 +64,6 @@
 };
 #endif  // EIGEN_VECTORIZE_AVX512
 
-
 template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
   return _mm256_cvttps_epi32(a);
 }
@@ -77,6 +76,10 @@
   return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
 }
 
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet4d, Packet8i>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16b pcast<Packet8f, Packet16b>(const Packet8f& a,
                                                          const Packet8f& b) {
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index 6d8ee2b..0372e95 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -32,7 +32,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 1,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -185,7 +184,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 4,
-    HasHalfPacket = 1,
 
     HasAdd    = 1,
     HasSub    = 1,
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
index cb7cfdf..616a058 100644
--- a/Eigen/src/Core/arch/AVX512/GemmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -641,7 +641,7 @@
     }
   }
 
-  template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
+  template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch, bool no_a_preload = false>
   EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
                                            Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
     const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
@@ -655,8 +655,8 @@
     if (max_b_unroll >= 8)
       innerkernel_1pow<uk, 8, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
 
-    // Load A after pow-loop.
-    load_a<0, um_vecs, uk, a_unroll, ktail>(ao);
+    // Load A after pow-loop. Skip this at the end to prevent running over the buffer
+    if (!no_a_preload) load_a<0, um_vecs, uk, a_unroll, ktail>(ao);
   }
 
   /*  Inner kernel loop structure.
@@ -698,7 +698,7 @@
    *  bo += b_unroll * kfactor;
    */
 
-  template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch>
+  template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch, bool no_a_preload = false>
   EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) {
     int fetchA_idx = 0;
     int fetchB_idx = 0;
@@ -707,18 +707,19 @@
     const bool ktail = k_factor == 1;
 
     static_assert(k_factor <= 4 && k_factor > 0, "innerkernel maximum k_factor supported is 4");
+    static_assert(no_a_preload == false || (no_a_preload == true && k_factor == 1), "skipping a preload only allowed when k unroll is 1");
 
     if (k_factor > 0)
-      innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+      innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
                                                                                     fetchB_idx);
     if (k_factor > 1)
-      innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+      innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
                                                                                     fetchB_idx);
     if (k_factor > 2)
-      innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+      innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
                                                                                     fetchB_idx);
     if (k_factor > 3)
-      innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+      innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
                                                                                     fetchB_idx);
 
     // Advance A/B pointers after uk-loop.
@@ -729,7 +730,7 @@
   template <int a_unroll, int b_unroll, int max_b_unroll>
   EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
     const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
-    if (!use_less_a_regs)
+    if (!use_less_a_regs && k > 1)
       a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
     else
       a_loads<0, 1, 0, um_vecs, a_unroll>(ao);
@@ -743,7 +744,13 @@
 
     // Unrolling k-loop by a factor of 4.
     const int max_k_factor = 4;
-    Index loop_count = k / max_k_factor;
+    Index kRem = k % max_k_factor;
+    Index k_ = k - kRem;
+    if (k_ >= max_k_factor) {
+      k_ -= max_k_factor;
+      kRem += max_k_factor;
+    }
+    Index loop_count = k_ / max_k_factor;
 
     if (loop_count > 0) {
 #ifdef SECOND_FETCH
@@ -771,11 +778,14 @@
     }
 
     // k-loop remainder handling.
-    loop_count = k % max_k_factor;
-    while (loop_count > 0) {
+    loop_count = kRem;
+    while (loop_count > 1) {
       innerkernel<a_unroll, b_unroll, 1, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
       loop_count--;
     }
+    if (loop_count > 0) {
+      innerkernel<a_unroll, b_unroll, 1, max_b_unroll, max_k_factor, 0, true>(aa, ao, bo, co2);
+    }
 
     // Update C matrix.
     c_update<max_b_unroll, a_unroll, b_unroll>(co1, co2);
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 543f424..129a68c 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -63,7 +63,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
-    HasHalfPacket = 1,
 
     HasCmp    = 1,
     HasAdd    = 1,
@@ -89,7 +88,7 @@
     HasCos    = EIGEN_FAST_MATH,
     HasTanh   = EIGEN_FAST_MATH,
     HasErf    = EIGEN_FAST_MATH,
-    HasBlend = 0,
+    HasBlend  = 0,
     HasRound  = 1,
     HasFloor  = 1,
     HasCeil   = 1,
@@ -106,13 +105,12 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
-    HasHalfPacket = 1,
 
     HasAbs = 1,
-    HasMin    = 1,
-    HasMax    = 1,
-    HasConj   = 1,
-    HasBlend = 0,
+    HasMin   = 1,
+    HasMax   = 1,
+    HasConj  = 1,
+    HasBlend = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasACos = 1,
@@ -146,7 +144,7 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 1,
+    HasBlend = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasLog  = 1,
@@ -168,6 +166,7 @@
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
+    HasBlend = 0,
     HasCmp = 1,
     HasDiv = 1,
     size=16
@@ -455,12 +454,19 @@
 EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
                                            const Packet16f& a,
                                            const Packet16f& b) {
-  __mmask16 mask16 = _mm512_cmp_epi32_mask(
-      _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  __mmask16 mask16 = _mm512_cmpeq_epi32_mask(_mm512_castps_si512(mask), _mm512_setzero_epi32());
   return _mm512_mask_blend_ps(mask16, a, b);
 }
 
 template <>
+EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask,
+                                           const Packet16i& a,
+                                           const Packet16i& b) {
+  __mmask16 mask16 = _mm512_cmpeq_epi32_mask(mask, _mm512_setzero_epi32());
+  return _mm512_mask_blend_epi32(mask16, a, b);
+}
+
+template <>
 EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask,
                                           const Packet8d& a,
                                           const Packet8d& b) {
@@ -544,6 +550,7 @@
 template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
 template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
 EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
+EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) { return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1); }
 #else
 // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
 template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
@@ -559,6 +566,9 @@
   return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
                                                 _mm256_castps_si256(b),1));
 }
+EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
+  return _mm512_inserti64x4(_mm512_castsi256_si512(a), b, 1);
+}
 #endif
 
 // Helper function for bit packing snippet of low precision comparison.
@@ -1843,11 +1853,16 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
-                                     const Packet16f& /*thenPacket*/,
-                                     const Packet16f& /*elsePacket*/) {
-  eigen_assert(false && "To be implemented");
-  return Packet16f();
+EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket,
+                                     const Packet16f& thenPacket,
+                                     const Packet16f& elsePacket) {
+  __mmask16 m = (ifPacket.select[0]) | (ifPacket.select[1] << 1) | (ifPacket.select[2] << 2) |
+                (ifPacket.select[3] << 3) | (ifPacket.select[4] << 4) | (ifPacket.select[5] << 5) |
+                (ifPacket.select[6] << 6) | (ifPacket.select[7] << 7) | (ifPacket.select[8] << 8) |
+                (ifPacket.select[9] << 9) | (ifPacket.select[10] << 10) | (ifPacket.select[11] << 11) |
+                (ifPacket.select[12] << 12) | (ifPacket.select[13] << 13) | (ifPacket.select[14] << 14) |
+                (ifPacket.select[15] << 15);
+  return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
@@ -2291,7 +2306,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
-    HasHalfPacket = 1,
     HasBlend = 0,
     HasInsert = 1,
     HasSin = EIGEN_FAST_MATH,
diff --git a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
index e137d6a..4c6116c 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
+++ b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
@@ -299,7 +299,7 @@
    * 1-D unroll
    *      for(startN = 0; startN < endN; startN++)
    **/
-  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM>
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM, int64_t remN_>
   static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadB(
       Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
       int64_t remM_ = 0) {
@@ -310,12 +310,18 @@
       ymm.packet[packetIndexOffset + startN] =
           ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB], remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
     }
-    else ymm.packet[packetIndexOffset + startN] = ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB]);
+    else {
+      EIGEN_IF_CONSTEXPR(remN_ == 0) {
+        ymm.packet[packetIndexOffset + startN] = ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB]);
+      }
+      else ymm.packet[packetIndexOffset + startN] =
+          ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB], remMask<EIGEN_AVX_MAX_NUM_ROW>(remN_));
+    }
 
-    aux_loadB<endN, counter - 1, packetIndexOffset, remM>(B_arr, LDB, ymm, remM_);
+    aux_loadB<endN, counter - 1, packetIndexOffset, remM, remN_>(B_arr, LDB, ymm, remM_);
   }
 
-  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM>
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM, int64_t remN_>
   static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadB(
       Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
       int64_t remM_ = 0) {
@@ -363,17 +369,17 @@
    * 1-D unroll
    *      for(startN = 0; startN < endN; startN += EIGEN_AVX_MAX_NUM_ROW)
    **/
-  template <int64_t endN, int64_t counter, bool toTemp, bool remM>
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remN_>
   static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadBBlock(
       Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
       PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
     constexpr int64_t counterReverse = endN - counter;
     constexpr int64_t startN = counterReverse;
-    transB::template loadB<EIGEN_AVX_MAX_NUM_ROW, startN, false>(&B_temp[startN], LDB_, ymm);
-    aux_loadBBlock<endN, counter - EIGEN_AVX_MAX_NUM_ROW, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    transB::template loadB<EIGEN_AVX_MAX_NUM_ROW, startN, false, (toTemp ? 0 : remN_)>(&B_temp[startN], LDB_, ymm);
+    aux_loadBBlock<endN, counter - EIGEN_AVX_MAX_NUM_ROW, toTemp, remM, remN_>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
   }
 
-  template <int64_t endN, int64_t counter, bool toTemp, bool remM>
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remN_>
   static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadBBlock(
       Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
       PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
@@ -424,11 +430,11 @@
    * Wrappers for aux_XXXX to hide counter parameter
    ********************************************************/
 
-  template <int64_t endN, int64_t packetIndexOffset, bool remM>
+  template <int64_t endN, int64_t packetIndexOffset, bool remM, int64_t remN_>
   static EIGEN_ALWAYS_INLINE void loadB(Scalar *B_arr, int64_t LDB,
                                         PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
                                         int64_t remM_ = 0) {
-    aux_loadB<endN, endN, packetIndexOffset, remM>(B_arr, LDB, ymm, remM_);
+    aux_loadB<endN, endN, packetIndexOffset, remM, remN_>(B_arr, LDB, ymm, remM_);
   }
 
   template <int64_t endN, int64_t packetIndexOffset, bool remK, bool remM>
@@ -438,13 +444,13 @@
     aux_storeB<endN, endN, packetIndexOffset, remK, remM>(B_arr, LDB, ymm, rem_);
   }
 
-  template <int64_t unrollN, bool toTemp, bool remM>
+  template <int64_t unrollN, bool toTemp, bool remM, int64_t remN_ = 0>
   static EIGEN_ALWAYS_INLINE void loadBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
                                              PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
                                              int64_t remM_ = 0) {
-    EIGEN_IF_CONSTEXPR(toTemp) { transB::template loadB<unrollN, 0, remM>(&B_arr[0], LDB, ymm, remM_); }
+    EIGEN_IF_CONSTEXPR(toTemp) { transB::template loadB<unrollN, 0, remM, 0>(&B_arr[0], LDB, ymm, remM_); }
     else {
-      aux_loadBBlock<unrollN, unrollN, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      aux_loadBBlock<unrollN, unrollN, toTemp, remM, remN_>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
   }
 
@@ -550,13 +556,13 @@
     }
     else EIGEN_IF_CONSTEXPR(unrollN == 2) {
       // load Lx2 B col major, transpose Lx2 row major
-      transB::template loadBBlock<2, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template loadBBlock<2, toTemp, remM, 2>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
       transB::template transposeLxL<0>(ymm);
       transB::template storeBBlock<2, toTemp, remM, 2>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
     else EIGEN_IF_CONSTEXPR(unrollN == 1) {
       // load Lx1 B col major, transpose Lx1 row major
-      transB::template loadBBlock<1, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template loadBBlock<1, toTemp, remM, 1>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
       transB::template transposeLxL<0>(ymm);
       transB::template storeBBlock<1, toTemp, remM, 1>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
     }
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index 60f49a3..02e6335 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -55,6 +55,10 @@
   return  cat256(_mm512_cvtpd_ps(a), _mm512_cvtpd_ps(b));
 }
 
+template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet8d, Packet16i>(const Packet8d& a, const Packet8d& b) {
+  return  cat256i(_mm512_cvttpd_epi32(a), _mm512_cvttpd_epi32(b));
+}
+
 template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
   return _mm512_castps_si512(a);
 }
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index e448bb6..69cc068 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -91,7 +91,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -360,7 +359,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
-    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index e443a63..d477ab7 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -157,7 +157,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 1,
 
     HasAdd = 1,
     HasSub = 1,
@@ -206,7 +205,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 0,
 
     HasAdd = 1,
     HasSub = 1,
@@ -250,7 +248,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 0,
 
     HasAdd   = 1,
     HasSub   = 1,
@@ -274,7 +271,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 0,
 
     HasAdd  = 1,
     HasSub  = 1,
@@ -293,7 +289,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 0,
 
     HasAdd  = 1,
     HasSub  = 1,
@@ -312,7 +307,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
-    HasHalfPacket = 0,
 
     HasAdd  = 1,
     HasSub  = 1,
@@ -331,7 +325,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
-    HasHalfPacket = 0,
 
     HasAdd  = 1,
     HasSub  = 1,
@@ -2710,7 +2703,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
-    HasHalfPacket = 1,
 
     HasAdd  = 1,
     HasSub  = 1,
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index 17dd8fb..a04c563 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -46,7 +46,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
-    HasHalfPacket = 0,
 
     HasDiv  = 1,
     HasSin  = 0,
@@ -82,7 +81,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
-    HasHalfPacket = 0,
 
     HasDiv  = 1,
     HasLog  = 1,
@@ -534,7 +532,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=8,
-    HasHalfPacket = 0,
     HasAdd    = 1,
     HasSub    = 1,
     HasMul    = 1,
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
index b11a9b4..83239c0 100644
--- a/Eigen/src/Core/arch/MSA/Complex.h
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -107,7 +107,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
 
     HasAdd = 1,
     HasSub = 1,
@@ -422,7 +421,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
-    HasHalfPacket = 0,
 
     HasAdd = 1,
     HasSub = 1,
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
index f03dbed..4e6bcdf 100644
--- a/Eigen/src/Core/arch/MSA/PacketMath.h
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -80,7 +80,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 0,  // Packet2f intrinsics not implemented yet
     // FIXME check the Has*
     HasDiv = 1,
     HasSin = EIGEN_FAST_MATH,
@@ -106,7 +105,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 0,  // Packet2i intrinsics not implemented yet
     // FIXME check the Has*
     HasDiv = 1,
     HasBlend = 1
@@ -850,7 +848,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
     // FIXME check the Has*
     HasDiv = 1,
     HasExp = 1,
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index e436360..97f4116 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -59,7 +59,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 1,
 
     HasAdd       = 1,
     HasSub       = 1,
@@ -400,7 +399,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
-    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index fac0219..e52e3fb 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -187,7 +187,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 1,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -237,7 +236,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
-    HasHalfPacket = 1,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -267,7 +265,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
-    HasHalfPacket = 1,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -299,7 +296,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 1,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -329,7 +325,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 1,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -360,7 +355,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 1,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -390,7 +384,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 1,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -422,7 +415,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -452,7 +444,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -3410,7 +3401,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 0,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -3784,7 +3774,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
 
     HasCmp       = 1,
     HasAdd       = 1,
@@ -4027,7 +4016,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasHalfPacket = 1,
 
     HasCmp = 1,
     HasCast = 1,
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 60308ce..366daa7 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -35,7 +35,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -185,7 +184,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
-    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 7d608bb..499c16b 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -132,7 +132,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 0,
 
     HasCmp  = 1,
     HasDiv = 1,
@@ -171,7 +170,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
-    HasHalfPacket = 0,
 
     HasCmp  = 1,
     HasDiv  = 1,
@@ -212,7 +210,6 @@
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    HasHalfPacket = 0,
     size=16,
     
     HasAdd       = 1,
@@ -1478,7 +1475,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 0,
     HasAdd    = 1,
     HasSub    = 1,
     HasMul    = 1,
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index 2ab0943..df5c72c 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -27,13 +27,14 @@
 };
 
 template <>
-struct type_casting_traits<float, int> {
+struct type_casting_traits<float, double> {
   enum {
     VectorizedCast = 1,
     SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
+    TgtCoeffRatio = 2
   };
 };
+#endif
 
 template <>
 struct type_casting_traits<int, float> {
@@ -45,14 +46,22 @@
 };
 
 template <>
-struct type_casting_traits<float, double> {
+struct type_casting_traits<float, int> {
   enum {
     VectorizedCast = 1,
     SrcCoeffRatio = 1,
-    TgtCoeffRatio = 2
+    TgtCoeffRatio = 1
   };
 };
-#endif
+
+template <>
+struct type_casting_traits<double, int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
 
 template <>
 struct type_casting_traits<double, float> {
@@ -91,6 +100,12 @@
   return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+  return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_cvttpd_epi32(a)),
+                                         _mm_castsi128_ps(_mm_cvttpd_epi32(b)),
+                                         (1 << 2) | (1 << 6)));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
   // Simply discard the second half of the input
   return _mm_cvtps_pd(a);
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 9c106b3..a2f292f 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -42,7 +42,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
-    HasHalfPacket = 0,
 
     HasAdd = 1,
     HasSub = 1,
@@ -377,7 +376,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
-    HasHalfPacket = 0,
 
     HasAdd = 1,
     HasSub = 1,
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index df5c8d4..7f22e5c 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -53,7 +53,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -78,7 +77,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 1,
-    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 26b6f0d..892e3a1 100644
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -160,7 +160,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 0,
 
     HasAdd  = 1,
     HasSub  = 1,
@@ -178,7 +177,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 0,
 
     HasAdd = 1,
     HasSub = 1,
@@ -211,7 +209,6 @@
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
-    HasHalfPacket = 1,
 
     HasAdd  = 1,
     HasSub  = 1,
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 13d029c..f4217e2 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -84,6 +84,35 @@
 
 namespace internal {
 
+/*****************************************************************************
+*** Implementation of portable aligned versions of malloc/free/realloc     ***
+*****************************************************************************/
+
+#ifdef EIGEN_NO_MALLOC
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
+{
+  eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
+}
+#elif defined EIGEN_RUNTIME_NO_MALLOC
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
+{
+  EIGEN_MALLOC_CHECK_THREAD_LOCAL static bool value = true;
+  if (update == 1)
+    value = new_value;
+  return value;
+}
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
+{
+  eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
+}
+#else
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
+{}
+#endif
+
+
 EIGEN_DEVICE_FUNC
 inline void throw_std_bad_alloc()
 {
@@ -121,7 +150,10 @@
 EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
   eigen_assert(alignment >= sizeof(void*) && alignment <= 128 && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*), less than or equal to 128, and a power of 2");
-  void* original = std::malloc(size + alignment);
+
+  check_that_malloc_is_allowed();
+  EIGEN_USING_STD(malloc)
+  void* original = malloc(size + alignment);
   if (original == 0) return 0;
   uint8_t offset = static_cast<uint8_t>(alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1)));
   void* aligned = static_cast<void*>(static_cast<uint8_t*>(original) + offset);
@@ -135,7 +167,10 @@
   if (ptr) {
     uint8_t offset = static_cast<uint8_t>(*(static_cast<uint8_t*>(ptr) - 1));
     void* original = static_cast<void*>(static_cast<uint8_t*>(ptr) - offset);
-    std::free(original);
+
+    check_that_malloc_is_allowed();
+    EIGEN_USING_STD(free)
+    free(original);
   }
 }
 
@@ -146,11 +181,14 @@
   */
 EIGEN_DEVICE_FUNC inline void* handmade_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
-  if (ptr == 0) return handmade_aligned_malloc(new_size, alignment);
+  if (ptr == nullptr) return handmade_aligned_malloc(new_size, alignment);
   uint8_t old_offset = *(static_cast<uint8_t*>(ptr) - 1);
   void* old_original = static_cast<uint8_t*>(ptr) - old_offset;
-  void* original = std::realloc(old_original, new_size + alignment);
-  if (original == 0) return 0;
+
+  check_that_malloc_is_allowed();
+  EIGEN_USING_STD(realloc)
+  void* original = realloc(old_original, new_size + alignment);
+  if (original == nullptr) return nullptr;
   if (original == old_original) return ptr;
   uint8_t offset = static_cast<uint8_t>(alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1)));
   void* aligned = static_cast<void*>(static_cast<uint8_t*>(original) + offset);
@@ -163,44 +201,17 @@
   return aligned;
 }
 
-/*****************************************************************************
-*** Implementation of portable aligned versions of malloc/free/realloc     ***
-*****************************************************************************/
-
-#ifdef EIGEN_NO_MALLOC
-EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
-{
-  eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
-}
-#elif defined EIGEN_RUNTIME_NO_MALLOC
-EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
-{
-  EIGEN_MALLOC_CHECK_THREAD_LOCAL static bool value = true;
-  if (update == 1)
-    value = new_value;
-  return value;
-}
-EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
-EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
-EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
-{
-  eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
-}
-#else
-EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
-{}
-#endif
-
 /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
   * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
   */
 EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 {
-  check_that_malloc_is_allowed();
-
+  if (size == 0) return nullptr;
+  
   void *result;
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
 
+    check_that_malloc_is_allowed();
     EIGEN_USING_STD(malloc)
     result = malloc(size);
 
@@ -222,6 +233,8 @@
 {
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
 
+    if(ptr)
+      check_that_malloc_is_allowed();
     EIGEN_USING_STD(free)
     free(ptr);
 
@@ -237,11 +250,17 @@
   */
 EIGEN_DEVICE_FUNC inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
 {
-  if (ptr == 0) return aligned_malloc(new_size);
+  if (ptr == nullptr) return aligned_malloc(new_size);
+  if (old_size == new_size) return ptr;
+  if (new_size == 0) { aligned_free(ptr); return nullptr; }
+
   void *result;
 #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
   EIGEN_UNUSED_VARIABLE(old_size)
-  result = std::realloc(ptr,new_size);
+
+  check_that_malloc_is_allowed();
+  EIGEN_USING_STD(realloc)
+  result = realloc(ptr,new_size);
 #else
   result = handmade_aligned_realloc(ptr,new_size,old_size);
 #endif
@@ -249,11 +268,6 @@
   if (!result && new_size)
     throw_std_bad_alloc();
 
-#ifdef EIGEN_RUNTIME_NO_MALLOC
-  if (result != ptr)
-    check_that_malloc_is_allowed();
-#endif
-
   return result;
 }
 
@@ -271,8 +285,9 @@
 
 template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size)
 {
-  check_that_malloc_is_allowed();
+  if (size == 0) return nullptr;
 
+  check_that_malloc_is_allowed();
   EIGEN_USING_STD(malloc)
   void *result = malloc(size);
 
@@ -289,6 +304,8 @@
 
 template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
+  if(ptr)
+    check_that_malloc_is_allowed();
   EIGEN_USING_STD(free)
   free(ptr);
 }
@@ -298,9 +315,15 @@
   return aligned_realloc(ptr, new_size, old_size);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t)
+template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t old_size)
 {
-  return std::realloc(ptr, new_size);
+  if (ptr == nullptr) return conditional_aligned_malloc<false>(new_size);
+  if (old_size == new_size) return ptr;
+  if (new_size == 0) { conditional_aligned_free<false>(ptr); return nullptr; }
+
+  check_that_malloc_is_allowed();
+  EIGEN_USING_STD(realloc)
+  return realloc(ptr, new_size);
 }
 
 /*****************************************************************************
@@ -424,7 +447,7 @@
 template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
-  Eigen::internal::aligned_free(ptr);
+  aligned_free(ptr);
 }
 
 /** \internal Deletes objects constructed with conditional_aligned_new
diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h
index 011fcbe..b796b39 100644
--- a/Eigen/src/plugins/IndexedViewMethods.h
+++ b/Eigen/src/plugins/IndexedViewMethods.h
@@ -9,200 +9,179 @@
 
 #if !defined(EIGEN_PARSED_BY_DOXYGEN)
 
-// This file is automatically included twice to generate const and non-const versions
-
-#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-#define EIGEN_INDEXED_VIEW_METHOD_CONST const
-#define EIGEN_INDEXED_VIEW_METHOD_TYPE  ConstIndexedViewType
-#else
-#define EIGEN_INDEXED_VIEW_METHOD_CONST
-#define EIGEN_INDEXED_VIEW_METHOD_TYPE IndexedViewType
-#endif
-
-#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
 protected:
-
 // define some aliases to ease readability
 
-template<typename Indices>
-struct IvcRowType : public internal::IndexedViewCompatibleType<Indices,RowsAtCompileTime> {};
+template <typename Indices>
+using IvcRowType = typename internal::IndexedViewCompatibleType<Indices, RowsAtCompileTime>::type;
 
-template<typename Indices>
-struct IvcColType : public internal::IndexedViewCompatibleType<Indices,ColsAtCompileTime> {};
+template <typename Indices>
+using IvcColType = typename internal::IndexedViewCompatibleType<Indices, ColsAtCompileTime>::type;
 
-template<typename Indices>
-struct IvcType : public internal::IndexedViewCompatibleType<Indices,SizeAtCompileTime> {};
+template <typename Indices>
+using IvcType = typename internal::IndexedViewCompatibleType<Indices, SizeAtCompileTime>::type;
 
-typedef typename internal::IndexedViewCompatibleType<Index,1>::type IvcIndex;
+typedef typename internal::IndexedViewCompatibleType<Index, 1>::type IvcIndex;
 
-template<typename Indices>
-typename IvcRowType<Indices>::type
-ivcRow(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,RowsAtCompileTime>(derived().rows()),Specialized);
+template <typename Indices>
+IvcRowType<Indices> ivcRow(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(
+      indices, internal::variable_if_dynamic<Index, RowsAtCompileTime>(derived().rows()), Specialized);
 }
 
-template<typename Indices>
-typename IvcColType<Indices>::type
-ivcCol(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,ColsAtCompileTime>(derived().cols()),Specialized);
+template <typename Indices>
+IvcColType<Indices> ivcCol(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(
+      indices, internal::variable_if_dynamic<Index, ColsAtCompileTime>(derived().cols()), Specialized);
 }
 
-template<typename Indices>
-typename IvcColType<Indices>::type
-ivcSize(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,SizeAtCompileTime>(derived().size()),Specialized);
+template <typename Indices>
+IvcColType<Indices> ivcSize(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(
+      indices, internal::variable_if_dynamic<Index, SizeAtCompileTime>(derived().size()), Specialized);
 }
 
 public:
 
-#endif
+template <typename RowIndices, typename ColIndices>
+using IndexedViewType = IndexedView<Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>;
 
-template<typename RowIndices, typename ColIndices>
-struct EIGEN_INDEXED_VIEW_METHOD_TYPE {
-  typedef IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,
-                      typename IvcRowType<RowIndices>::type,
-                      typename IvcColType<ColIndices>::type> type;
-};
+template <typename RowIndices, typename ColIndices>
+using ConstIndexedViewType = IndexedView<const Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>;
 
 // This is the generic version
 
-template<typename RowIndices, typename ColIndices>
-std::enable_if_t<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
-  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsIndexedView,
-  typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>
-operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type
-            (derived(), ivcRow(rowIndices), ivcCol(colIndices));
+template <typename RowIndices, typename ColIndices>
+std::enable_if_t<internal::valid_indexed_view_overload<RowIndices, ColIndices>::value &&
+                     internal::traits<IndexedViewType<RowIndices, ColIndices>>::ReturnAsIndexedView,
+                 IndexedViewType<RowIndices, ColIndices>>
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) {
+  return IndexedViewType<RowIndices, ColIndices>(derived(), ivcRow(rowIndices), ivcCol(colIndices));
+}
+
+template <typename RowIndices, typename ColIndices>
+std::enable_if_t<internal::valid_indexed_view_overload<RowIndices, ColIndices>::value &&
+                     internal::traits<ConstIndexedViewType<RowIndices, ColIndices>>::ReturnAsIndexedView,
+                 ConstIndexedViewType<RowIndices, ColIndices>>
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) const {
+  return ConstIndexedViewType<RowIndices, ColIndices>(derived(), ivcRow(rowIndices), ivcCol(colIndices));
 }
 
 // The following overload returns a Block<> object
 
-template<typename RowIndices, typename ColIndices>
-std::enable_if_t<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
-  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsBlock,
-  typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType>
-operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  typedef typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType BlockType;
-  typename IvcRowType<RowIndices>::type actualRowIndices = ivcRow(rowIndices);
-  typename IvcColType<ColIndices>::type actualColIndices = ivcCol(colIndices);
-  return BlockType(derived(),
-                   internal::first(actualRowIndices),
-                   internal::first(actualColIndices),
-                   internal::index_list_size(actualRowIndices),
-                   internal::index_list_size(actualColIndices));
+template <typename RowIndices, typename ColIndices>
+std::enable_if_t<internal::valid_indexed_view_overload<RowIndices, ColIndices>::value &&
+                     internal::traits<IndexedViewType<RowIndices, ColIndices>>::ReturnAsBlock,
+                 typename internal::traits<IndexedViewType<RowIndices, ColIndices>>::BlockType>
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) {
+  typedef typename internal::traits<IndexedViewType<RowIndices, ColIndices>>::BlockType BlockType;
+  IvcRowType<RowIndices> actualRowIndices = ivcRow(rowIndices);
+  IvcColType<ColIndices> actualColIndices = ivcCol(colIndices);
+  return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices),
+                   internal::index_list_size(actualRowIndices), internal::index_list_size(actualColIndices));
+}
+
+template <typename RowIndices, typename ColIndices>
+std::enable_if_t<internal::valid_indexed_view_overload<RowIndices, ColIndices>::value &&
+                     internal::traits<ConstIndexedViewType<RowIndices, ColIndices>>::ReturnAsBlock,
+                 typename internal::traits<ConstIndexedViewType<RowIndices, ColIndices>>::BlockType>
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) const {
+  typedef typename internal::traits<ConstIndexedViewType<RowIndices, ColIndices>>::BlockType BlockType;
+  IvcRowType<RowIndices> actualRowIndices = ivcRow(rowIndices);
+  IvcColType<ColIndices> actualColIndices = ivcCol(colIndices);
+  return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices),
+                   internal::index_list_size(actualRowIndices), internal::index_list_size(actualColIndices));
 }
 
 // The following overload returns a Scalar
 
-template<typename RowIndices, typename ColIndices>
-std::enable_if_t<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
-  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsScalar,
-  CoeffReturnType >
-operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols()));
+template <typename RowIndices, typename ColIndices>
+std::enable_if_t<internal::valid_indexed_view_overload<RowIndices, ColIndices>::value &&
+                     internal::traits<IndexedViewType<RowIndices, ColIndices>>::ReturnAsScalar && internal::is_lvalue<Derived>::value,
+                 Scalar&>
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) {
+  return Base::operator()(internal::eval_expr_given_size(rowIndices, rows()),
+                          internal::eval_expr_given_size(colIndices, cols()));
 }
 
-// The following three overloads are needed to handle raw Index[N] arrays.
-
-template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndices>
-IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
-operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
-                    (derived(), rowIndices, ivcCol(colIndices));
+template <typename RowIndices, typename ColIndices>
+std::enable_if_t<internal::valid_indexed_view_overload<RowIndices, ColIndices>::value &&
+                     internal::traits<ConstIndexedViewType<RowIndices, ColIndices>>::ReturnAsScalar,
+                 CoeffReturnType>
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) const {
+  return Base::operator()(internal::eval_expr_given_size(rowIndices, rows()),
+                          internal::eval_expr_given_size(colIndices, cols()));
 }
 
-template<typename RowIndices, typename ColIndicesT, std::size_t ColIndicesN>
-IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcRowType<RowIndices>::type, const ColIndicesT (&)[ColIndicesN]>
-operator()(const RowIndices& rowIndices, const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcRowType<RowIndices>::type,const ColIndicesT (&)[ColIndicesN]>
-                    (derived(), ivcRow(rowIndices), colIndices);
-}
-
-template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndicesT, std::size_t ColIndicesN>
-IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN], const ColIndicesT (&)[ColIndicesN]>
-operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],const ColIndicesT (&)[ColIndicesN]>
-                    (derived(), rowIndices, colIndices);
-}
-
-
 // Overloads for 1D vectors/arrays
 
-template<typename Indices>
-std::enable_if_t<
-  IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
-  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type> >
-operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
+template <typename Indices>
+std::enable_if_t<IsRowMajor && (!(internal::get_compile_time_incr<IvcType<Indices>>::value == 1 ||
+                                  internal::is_valid_index_type<Indices>::value)),
+                 IndexedView<Derived, IvcIndex, IvcType<Indices>>>
+operator()(const Indices& indices) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type>
-            (derived(), IvcIndex(0), ivcCol(indices));
+  return IndexedView<Derived, IvcIndex, IvcType<Indices>>(derived(), IvcIndex(0), ivcCol(indices));
 }
 
-template<typename Indices>
-std::enable_if_t<
-  (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
-  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex> >
-operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
+template <typename Indices>
+std::enable_if_t<IsRowMajor && (!(internal::get_compile_time_incr<IvcType<Indices>>::value == 1 ||
+                                  internal::is_valid_index_type<Indices>::value)),
+                 IndexedView<const Derived, IvcIndex, IvcType<Indices>>>
+operator()(const Indices& indices) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex>
-            (derived(), ivcRow(indices), IvcIndex(0));
+  return IndexedView<const Derived, IvcIndex, IvcType<Indices>>(derived(), IvcIndex(0), ivcCol(indices));
 }
 
-template<typename Indices>
-std::enable_if_t<
-  (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_valid_index_type<Indices>::value) && (!symbolic::is_symbolic<Indices>::value),
-  VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value> >
-operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
+template <typename Indices>
+std::enable_if_t<(!IsRowMajor) && (!(internal::get_compile_time_incr<IvcType<Indices>>::value == 1 ||
+                                     internal::is_valid_index_type<Indices>::value)),
+                 IndexedView<Derived, IvcType<Indices>, IvcIndex>>
+operator()(const Indices& indices) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  typename IvcType<Indices>::type actualIndices = ivcSize(indices);
-  return VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value>
-            (derived(), internal::first(actualIndices), internal::index_list_size(actualIndices));
+  return IndexedView<Derived, IvcType<Indices>, IvcIndex>(derived(), ivcRow(indices), IvcIndex(0));
 }
 
-template<typename IndexType>
-std::enable_if_t<symbolic::is_symbolic<IndexType>::value, CoeffReturnType >
-operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return Base::operator()(internal::eval_expr_given_size(id,size()));
-}
-
-template<typename IndicesT, std::size_t IndicesN>
-std::enable_if_t<IsRowMajor,
-  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]> >
-operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
+template <typename Indices>
+std::enable_if_t<(!IsRowMajor) && (!(internal::get_compile_time_incr<IvcType<Indices>>::value == 1 ||
+                                     internal::is_valid_index_type<Indices>::value)),
+                 IndexedView<const Derived, IvcType<Indices>, IvcIndex>>
+operator()(const Indices& indices) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]>
-            (derived(), IvcIndex(0), indices);
+  return IndexedView<const Derived, IvcType<Indices>, IvcIndex>(derived(), ivcRow(indices), IvcIndex(0));
 }
 
-template<typename IndicesT, std::size_t IndicesN>
-std::enable_if_t<!IsRowMajor,
-  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const IndicesT (&)[IndicesN],IvcIndex> >
-operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
+template <typename Indices>
+std::enable_if_t<(internal::get_compile_time_incr<IvcType<Indices>>::value == 1) &&
+                     (!internal::is_valid_index_type<Indices>::value) && (!symbolic::is_symbolic<Indices>::value),
+                 VectorBlock<Derived, internal::array_size<Indices>::value>>
+operator()(const Indices& indices) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const IndicesT (&)[IndicesN],IvcIndex>
-            (derived(), indices, IvcIndex(0));
+  IvcType<Indices> actualIndices = ivcSize(indices);
+  return VectorBlock<Derived, internal::array_size<Indices>::value>(derived(), internal::first(actualIndices),
+                                                                    internal::index_list_size(actualIndices));
 }
 
-#undef EIGEN_INDEXED_VIEW_METHOD_CONST
-#undef EIGEN_INDEXED_VIEW_METHOD_TYPE
+template <typename Indices>
+std::enable_if_t<(internal::get_compile_time_incr<IvcType<Indices>>::value == 1) &&
+                     (!internal::is_valid_index_type<Indices>::value) && (!symbolic::is_symbolic<Indices>::value),
+                 VectorBlock<const Derived, internal::array_size<Indices>::value>>
+operator()(const Indices& indices) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  IvcType<Indices> actualIndices = ivcSize(indices);
+  return VectorBlock<const Derived, internal::array_size<Indices>::value>(derived(), internal::first(actualIndices),
+                                                                          internal::index_list_size(actualIndices));
+}
 
-#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-#define EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-#include "IndexedViewMethods.h"
-#undef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-#endif
+template <typename IndexType>
+std::enable_if_t<symbolic::is_symbolic<IndexType>::value && internal::is_lvalue<Derived>::value, Scalar&> operator()(const IndexType& id) {
+  return Base::operator()(internal::eval_expr_given_size(id, size()));
+}
+
+template <typename IndexType>
+std::enable_if_t<symbolic::is_symbolic<IndexType>::value, CoeffReturnType> operator()(const IndexType& id) const {
+  return Base::operator()(internal::eval_expr_given_size(id, size()));
+}
 
 #else // EIGEN_PARSED_BY_DOXYGEN
 
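Note on the IndexedViewMethods refactor above: the diff changes what compiles at call sites. The sketch below is illustrative only (not part of the patch) and assumes the Eigen 3.4-style placeholders Eigen::all and Eigen::last; it shows that symbolic indices now give writable access on lvalue expressions, that 1D index containers such as std::array keep working, and that raw C-array / braced-list indexing is no longer accepted.

#include <Eigen/Dense>
#include <array>

int main() {
  Eigen::VectorXd a = Eigen::VectorXd::Zero(10);
  Eigen::MatrixXd A = Eigen::MatrixXd::Zero(10, 10);

  // Non-const symbolic indexed access is now allowed on lvalue expressions.
  a(Eigen::last) = 1.0;
  A(Eigen::last, Eigen::last) = 1.0;

  // 1D index containers (std::array, std::vector, ArrayXi, ...) still work,
  // both for reading and for writing through the indexed view.
  std::array<int, 3> rows{1, 3, 5};
  Eigen::MatrixXd B = A(rows, Eigen::all);  // 3 x 10 copy of the selected rows
  A(rows, Eigen::all) = B;                  // writable indexed view

  // Raw C arrays / braced initializer lists as indices are disabled:
  // A({1, 3, 5}, Eigen::all);  // no longer compiles

  return 0;
}
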
diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp
index d149960..84a4767 100644
--- a/test/indexed_view.cpp
+++ b/test/indexed_view.cpp
@@ -289,19 +289,11 @@
   VERIFY( (A(all, std::array<int,4>{{1,3,2,4}})).ColsAtCompileTime == 4);
 
   VERIFY_IS_APPROX( (A(std::array<int,3>{{1,3,5}}, std::array<int,4>{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) );
+  VERIFY_IS_EQUAL(A(std::array<int, 3>{1, 3, 5}, std::array<int, 4>{3, 1, 6, 5}).RowsAtCompileTime, 3);
+  VERIFY_IS_EQUAL(A(std::array<int, 3>{1, 3, 5}, std::array<int, 4>{3, 1, 6, 5}).ColsAtCompileTime, 4);
 
-  VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array<int,4>{{3, 1, 6, 5}}, all) );
-  VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array<int,4>{{3, 1, 6, 5}}) );
-  VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array<int,3>{{1,3,5}},std::array<int,4>{{3, 1, 6, 5}}) );
-
-  VERIFY_IS_EQUAL( A({1,3,5},{3, 1, 6, 5}).RowsAtCompileTime, 3 );
-  VERIFY_IS_EQUAL( A({1,3,5},{3, 1, 6, 5}).ColsAtCompileTime, 4 );
-
-  VERIFY_IS_APPROX( a({3, 1, 6, 5}), a(std::array<int,4>{{3, 1, 6, 5}}) );
-  VERIFY_IS_EQUAL( a({1,3,5}).SizeAtCompileTime, 3 );
-
-  VERIFY_IS_APPROX( b({3, 1, 6, 5}), b(std::array<int,4>{{3, 1, 6, 5}}) );
-  VERIFY_IS_EQUAL( b({1,3,5}).SizeAtCompileTime, 3 );
+  VERIFY_IS_EQUAL( a(std::array<int,3>{1,3,5}).SizeAtCompileTime, 3 );
+  VERIFY_IS_EQUAL( b(std::array<int,3>{1,3,5}).SizeAtCompileTime, 3 );
 
   // check mat(i,j) with weird types for i and j
   {
@@ -364,6 +356,9 @@
   A(X,Y) = 1;
   A(XX,Y) = 1;
   A(X,YY) = 1;
+  // check symbolic indices
+  a(last) = 1;
+  A(last, last) = 1;
 
   // Check compilation of varying integer types as index types:
   Index i = n/2;
diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp
index 689a4cc..4b7934f 100644
--- a/test/nomalloc.cpp
+++ b/test/nomalloc.cpp
@@ -225,4 +225,7 @@
   CALL_SUBTEST_6(test_reference(Matrix<float,32,32>()));
   CALL_SUBTEST_7(test_reference(R1));
   CALL_SUBTEST_8(Ref<MatrixXd> R2 = M1.topRows<2>(); test_reference(R2));
+
+  // freeing is now possible
+  Eigen::internal::set_is_malloc_allowed(true);
 }
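The nomalloc change follows from guarding free() (not just malloc/realloc) with check_that_malloc_is_allowed(): once allocation is disabled, destructors that release heap storage would now trip the check, so the test re-enables allocation before returning. A minimal sketch of the pattern (illustrative only, not part of the patch), assuming Eigen is compiled with EIGEN_RUNTIME_NO_MALLOC:

#define EIGEN_RUNTIME_NO_MALLOC  // must be defined before including Eigen
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(64, 64);  // heap allocation is still allowed here

  Eigen::internal::set_is_malloc_allowed(false);
  double s = m.diagonal().sum();                 // only allocation-free work in this region
  Eigen::internal::set_is_malloc_allowed(true);  // re-enable before m (and its storage) is freed

  (void)s;
  return 0;
}
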
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 3a917a0..55369e1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -91,7 +91,6 @@
     eigen_assert(rhs_block);
     BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
     char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size));
-    eigen_assert(block_mem);
     *lhs_block = reinterpret_cast<LhsScalar*>(block_mem);
     *rhs_block = reinterpret_cast<RhsScalar*>(block_mem + sz.lhs_size);
     return block_mem;
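Dropping this assert is what "Don't crash on empty tensor contraction" refers to: when the contracted dimension is empty, the block allocation can legitimately be zero-sized and may return a null pointer. A hedged usage sketch of the now-tolerated case (illustrative only, not part of the patch):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Contraction over an empty dimension: k == 0.
  Eigen::Tensor<float, 2> lhs(3, 0);
  Eigen::Tensor<float, 2> rhs(0, 4);

  // Contract lhs dimension 1 with rhs dimension 0 (both of size 0).
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  Eigen::Tensor<float, 2> result = lhs.contract(rhs, dims);  // 3 x 4, expected to be all zeros

  return (result.dimension(0) == 3 && result.dimension(1) == 4) ? 0 : 1;
}
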
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 839ff69..5174d8d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -166,7 +166,6 @@
     Vectorizable = 0,
     size = 1,
     AlignedOnScalar = 0,
-    HasHalfPacket = 0
   };
   enum {
     HasAdd    = 0,
diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
index 7b67738..81d81ef 100644
--- a/unsupported/test/cxx11_tensor_casts.cpp
+++ b/unsupported/test/cxx11_tensor_casts.cpp
@@ -15,113 +15,23 @@
 using Eigen::Tensor;
 using Eigen::array;
 
-static void test_simple_cast()
-{
-  Tensor<float, 2> ftensor(20,30);
-  ftensor = ftensor.random() * 100.f;
-  Tensor<char, 2> chartensor(20,30);
-  chartensor.setRandom();
-  Tensor<std::complex<float>, 2> cplextensor(20,30);
-  cplextensor.setRandom();
-
-  chartensor = ftensor.cast<char>();
-  cplextensor = ftensor.cast<std::complex<float> >();
-
-  for (int i = 0; i < 20; ++i) {
-    for (int j = 0; j < 30; ++j) {
-      VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j)));
-      VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float> >(ftensor(i,j)));
-    }
-  }
-}
-
-
-static void test_vectorized_cast()
-{
-  Tensor<int, 2> itensor(20,30);
-  itensor = itensor.random() / 1000;
-  Tensor<float, 2> ftensor(20,30);
-  ftensor.setRandom();
-  Tensor<double, 2> dtensor(20,30);
-  dtensor.setRandom();
-
-  ftensor = itensor.cast<float>();
-  dtensor = itensor.cast<double>();
-
-  for (int i = 0; i < 20; ++i) {
-    for (int j = 0; j < 30; ++j) {
-      VERIFY_IS_EQUAL(itensor(i,j), static_cast<int>(ftensor(i,j)));
-      VERIFY_IS_EQUAL(dtensor(i,j), static_cast<double>(ftensor(i,j)));
-    }
-  }
-}
-
-
-static void test_float_to_int_cast()
-{
-  Tensor<float, 2> ftensor(20,30);
-  ftensor = ftensor.random() * 1000.0f;
-  Tensor<double, 2> dtensor(20,30);
-  dtensor = dtensor.random() * 1000.0;
-
-  Tensor<int, 2> i1tensor = ftensor.cast<int>();
-  Tensor<int, 2> i2tensor = dtensor.cast<int>();
-
-  for (int i = 0; i < 20; ++i) {
-    for (int j = 0; j < 30; ++j) {
-      VERIFY_IS_EQUAL(i1tensor(i,j), static_cast<int>(ftensor(i,j)));
-      VERIFY_IS_EQUAL(i2tensor(i,j), static_cast<int>(dtensor(i,j)));
-    }
-  }
-}
-
-
-static void test_big_to_small_type_cast()
-{
-  Tensor<double, 2> dtensor(20, 30);
-  dtensor.setRandom();
-  Tensor<float, 2> ftensor(20, 30);
-  ftensor = dtensor.cast<float>();
-
-  for (int i = 0; i < 20; ++i) {
-    for (int j = 0; j < 30; ++j) {
-      VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
-    }
-  }
-}
-
-
-static void test_small_to_big_type_cast()
-{
-  Tensor<float, 2> ftensor(20, 30);
-  ftensor.setRandom();
-  Tensor<double, 2> dtensor(20, 30);
-  dtensor = ftensor.cast<double>();
-
-  for (int i = 0; i < 20; ++i) {
-    for (int j = 0; j < 30; ++j) {
-      VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
-    }
-  }
-}
-
 template <typename FromType, typename ToType>
 static void test_type_cast() {
-  Tensor<FromType, 2> ftensor(100, 200);
+  Tensor<FromType, 2> ftensor(101, 201);
   // Generate random values for a valid cast.
-  for (int i = 0; i < 100; ++i) {
-    for (int j = 0; j < 200; ++j) {
+  for (int i = 0; i < 101; ++i) {
+    for (int j = 0; j < 201; ++j) {
       ftensor(i, j) = internal::random_without_cast_overflow<FromType,ToType>::value();
     }
   }
 
-  Tensor<ToType, 2> ttensor(100, 200);
+  Tensor<ToType, 2> ttensor(101, 201);
   ttensor = ftensor.template cast<ToType>();
 
-  for (int i = 0; i < 100; ++i) {
-    for (int j = 0; j < 200; ++j) {
-      const ToType ref = internal::cast<FromType,ToType>(ftensor(i, j));
-      VERIFY_IS_APPROX(ttensor(i, j), ref);
+  for (int i = 0; i < 101; ++i) {
+    for (int j = 0; j < 201; ++j) {
+      const ToType ref = static_cast<ToType>(ftensor(i, j));
+      VERIFY_IS_EQUAL(ttensor(i, j), ref);
     }
   }
 }
@@ -161,12 +71,6 @@
 
 EIGEN_DECLARE_TEST(cxx11_tensor_casts)
 {
-  CALL_SUBTEST(test_simple_cast());
-  CALL_SUBTEST(test_vectorized_cast());
-  CALL_SUBTEST(test_float_to_int_cast());
-  CALL_SUBTEST(test_big_to_small_type_cast());
-  CALL_SUBTEST(test_small_to_big_type_cast());
-
   CALL_SUBTEST(test_cast_runner<bool>::run());
   CALL_SUBTEST(test_cast_runner<int8_t>::run());
   CALL_SUBTEST(test_cast_runner<int16_t>::run());