Update Eigen to commit:5635d37f46acc2985aa66d9385665a76c3b9d5c7 CHANGELOG ========= 5635d37f4 - more pblend optimizations f0795d35e - Fix new psincos for ppc and arm32. ad452e575 - Fix compilation problems with PacketI on PowerPC. fcaf03ef7 - fix pendantic compiler warnings b5feca5d0 - Fix build for pblend and psin_double, pcos_double when AVX but not AVX2 is supported. 888fca0e2 - Simd sincos double 6ad2ccea4 - Eigen pblend 9099c5eac - Handle missing AVX512 intrinsic PiperOrigin-RevId: 626425279 Change-Id: I008ca76bc6c357da4c37c0adfa654b053ac9c18b
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 61f0eb9..fc5d757 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h
@@ -135,7 +135,14 @@ struct unpacket_traits { typedef T type; typedef T half; - enum { size = 1, alignment = 1, vectorizable = false, masked_load_available = false, masked_store_available = false }; + typedef typename numext::get_integer_by_size<sizeof(T)>::signed_type integer_packet; + enum { + size = 1, + alignment = alignof(T), + vectorizable = false, + masked_load_available = false, + masked_store_available = false + }; }; template <typename T> @@ -1401,12 +1408,6 @@ template <size_t N> struct Selector { bool select[N]; - template <typename MaskType = int> - EIGEN_DEVICE_FUNC inline MaskType mask(size_t begin = 0, size_t end = N) const { - MaskType res = 0; - for (size_t i = begin; i < end; i++) res |= (static_cast<MaskType>(select[i]) << i); - return res; - } }; template <typename Packet>
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index b125d59..321188c 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -22,7 +22,15 @@ namespace internal { EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet8f) -EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet4d) + +EIGEN_DOUBLE_PACKET_FUNCTION(atan, Packet4d) +EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet4d) +EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d) +EIGEN_DOUBLE_PACKET_FUNCTION(exp, Packet4d) +#ifdef EIGEN_VECTORIZE_AVX2 +EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d) +EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d) +#endif // Notice that for newer processors, it is counterproductive to use Newton // iteration for square root. In particular, Skylake and Zen2 processors
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a53c38d..dac43fc 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -142,6 +142,10 @@ HasCmp = 1, HasDiv = 1, +#ifdef EIGEN_VECTORIZE_AVX2 + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, +#endif HasLog = 1, HasExp = 1, HasSqrt = 1, @@ -2130,40 +2134,29 @@ kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<4>& ifPacket) { + return _mm256_set_epi64x(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], + 0 - ifPacket.select[0]); +} + +EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<8>& ifPacket) { + return _mm256_set_epi32(0 - ifPacket.select[7], 0 - ifPacket.select[6], 0 - ifPacket.select[5], + 0 - ifPacket.select[4], 0 - ifPacket.select[3], 0 - ifPacket.select[2], + 0 - ifPacket.select[1], 0 - ifPacket.select[0]); +} + template <> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { -#ifdef EIGEN_VECTORIZE_AVX2 - const __m256i zero = _mm256_setzero_si256(); - const __m256i select = - _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], - ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m256i false_mask = _mm256_cmpeq_epi32(zero, select); - return _mm256_blendv_ps(thenPacket, elsePacket, _mm256_castsi256_ps(false_mask)); -#else - const __m256 zero = _mm256_setzero_ps(); - const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], - ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); - return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); -#endif + const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket)); + return pselect<Packet8f>(true_mask, thenPacket, elsePacket); } template <> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { -#ifdef EIGEN_VECTORIZE_AVX2 - const __m256i zero = _mm256_setzero_si256(); - const __m256i select = - _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m256i false_mask = _mm256_cmpeq_epi64(select, zero); - return _mm256_blendv_pd(thenPacket, elsePacket, _mm256_castsi256_pd(false_mask)); -#else - const __m256d zero = _mm256_setzero_pd(); - const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); - return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); -#endif + const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket)); + return pselect<Packet4d>(true_mask, thenPacket, elsePacket); } // Packet math for Eigen::half
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 5c53556..aa338d1 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -156,6 +156,8 @@ HasBlend = 1, HasSqrt = 1, HasRsqrt = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, HasATan = 1, @@ -1285,7 +1287,11 @@ } template <> EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) { +#if EIGEN_GNUC_STRICT_LESS_THAN(11, 0, 0) + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +#else return _mm512_cvtsi512_si32(a); +#endif } template <> @@ -2144,16 +2150,24 @@ PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1); } +template <size_t N> +EIGEN_STRONG_INLINE int avx512_blend_mask(const Selector<N>& ifPacket) { + alignas(__m128i) uint8_t aux[sizeof(__m128i)]; + for (size_t i = 0; i < N; i++) aux[i] = static_cast<uint8_t>(ifPacket.select[i]); + __m128i paux = _mm_sub_epi8(_mm_setzero_si128(), _mm_load_si128(reinterpret_cast<const __m128i*>(aux))); + return _mm_movemask_epi8(paux); +} + template <> EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket, const Packet16f& thenPacket, const Packet16f& elsePacket) { - __mmask16 m = ifPacket.mask<__mmask16>(); + __mmask16 m = avx512_blend_mask(ifPacket); return _mm512_mask_blend_ps(m, elsePacket, thenPacket); } template <> EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, const Packet8d& thenPacket, const Packet8d& elsePacket) { - __mmask8 m = ifPacket.mask<__mmask8>(); + __mmask8 m = avx512_blend_mask(ifPacket); return _mm512_mask_blend_pd(m, elsePacket, thenPacket); }
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 6a2f0e6..eed545c 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -3178,8 +3178,8 @@ HasMin = 1, HasMax = 1, HasAbs = 1, - HasSin = 0, - HasCos = 0, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasATan = 0, HasLog = 0, HasExp = 1, @@ -3201,6 +3201,7 @@ template <> struct unpacket_traits<Packet2d> { typedef double type; + typedef Packet2l integer_packet; enum { size = 2, alignment = Aligned16, @@ -3210,6 +3211,18 @@ }; typedef Packet2d half; }; +template <> +struct unpacket_traits<Packet2l> { + typedef int64_t type; + typedef Packet2l half; + enum { + size = 2, + alignment = Aligned16, + vectorizable = false, + masked_load_available = false, + masked_store_available = false + }; +}; inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) { union { @@ -3259,6 +3272,11 @@ Packet2d v = {from, from}; return v; } +template <> +EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { + Packet2l v = {from, from}; + return v; +} template <> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index c973efd..4ee035d 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -696,6 +696,174 @@ return psincos_float<false>(x); } +// Trigonometric argument reduction for double for inputs smaller than 15. +// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant +// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t. +template <typename Packet> +Packet trig_reduce_small_double(const Packet& x, const Packet& q) { + // Pi/2 split into 2 values + const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803); + const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10); + + Packet t; + t = pmadd(cst_pio2_a, q, x); + t = pmadd(cst_pio2_b, q, t); + return t; +} + +// Trigonometric argument reduction for double for inputs smaller than 1e14. +// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant +// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t. +template <typename Packet> +Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) { + // Pi/2 split into 4 values + const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803); + const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10); + const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17); + const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25); + + Packet t; + t = pmadd(cst_pio2_a, q_high, x); + t = pmadd(cst_pio2_a, q_low, t); + t = pmadd(cst_pio2_b, q_high, t); + t = pmadd(cst_pio2_b, q_low, t); + t = pmadd(cst_pio2_c, q_high, t); + t = pmadd(cst_pio2_c, q_low, t); + t = pmadd(cst_pio2_d, padd(q_low, q_high), t); + return t; +} + +template <bool ComputeSine, typename Packet, bool ComputeBoth = false> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +#if EIGEN_COMP_GNUC_STRICT + __attribute__((optimize("-fno-unsafe-math-optimizations"))) +#endif + Packet + psincos_double(const Packet& x) { + typedef typename unpacket_traits<Packet>::integer_packet PacketI; + typedef typename unpacket_traits<PacketI>::type ScalarI; + + const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint64_t>(0x8000000000000000u)); + + // If the argument is smaller than this value, use a simpler argument reduction + const double small_th = 15; + // If the argument is bigger than this value, use the non-vectorized std version + const double huge_th = 1e14; + + const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006); // 2/PI + // Integer Packet constants + const PacketI cst_one = pset1<PacketI>(ScalarI(1)); + // Constant for splitting + const Packet cst_split = pset1<Packet>(1 << 24); + + Packet x_abs = pabs(x); + + // Scale x by 2/Pi + PacketI q_int; + Packet s; + + // TODO Implement huge angle argument reduction + if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) { + Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split); + Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high); + q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5))); + Packet q_low = pcast<PacketI, Packet>(q_int); + s = trig_reduce_medium_double(x_abs, q_high, q_low); + } else { + Packet qval_noround = pmul(x_abs, cst_2oPI); + q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5))); + Packet q = pcast<PacketI, Packet>(q_int); + s = trig_reduce_small_double(x_abs, q); + } + + // All the upcoming approximating polynomials have even exponents + Packet ss = pmul(s, s); + + // Padé approximant of cos(x) + // Assuring < 1 ULP error on the interval [-pi/4, pi/4] + // cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 + + // 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600) + // MATLAB code to compute those coefficients: + // syms x; + // cosf = @(x) cos(x); + // pade_cosf = pade(cosf(x), x, 0, 'Order', 8) + Packet sc1_num = pmadd(ss, pset1<Packet>(80737373), pset1<Packet>(-13853547000)); + Packet sc2_num = pmadd(sc1_num, ss, pset1<Packet>(727718024880)); + Packet sc3_num = pmadd(sc2_num, ss, pset1<Packet>(-11275015752000)); + Packet sc4_num = pmadd(sc3_num, ss, pset1<Packet>(23594700729600)); + Packet sc1_denum = pmadd(ss, pset1<Packet>(147173), pset1<Packet>(39328920)); + Packet sc2_denum = pmadd(sc1_denum, ss, pset1<Packet>(5772800880)); + Packet sc3_denum = pmadd(sc2_denum, ss, pset1<Packet>(522334612800)); + Packet sc4_denum = pmadd(sc3_denum, ss, pset1<Packet>(23594700729600)); + Packet scos = pdiv(sc4_num, sc4_denum); + + // Padé approximant of sin(x) + // Assuring < 1 ULP error on the interval [-pi/4, pi/4] + // sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 + + // 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960)) + // MATLAB code to compute those coefficients: + // syms x; + // sinf = @(x) sin(x); + // pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative') + Packet ss1_num = pmadd(ss, pset1<Packet>(4585922449), pset1<Packet>(-1066023933480)); + Packet ss2_num = pmadd(ss1_num, ss, pset1<Packet>(83284044283440)); + Packet ss3_num = pmadd(ss2_num, ss, pset1<Packet>(-2303682236856000)); + Packet ss4_num = pmadd(ss3_num, ss, pset1<Packet>(15605159573203200)); + Packet ss1_denum = pmadd(ss, pset1<Packet>(1029037), pset1<Packet>(345207016)); + Packet ss2_denum = pmadd(ss1_denum, ss, pset1<Packet>(61570292784)); + Packet ss3_denum = pmadd(ss2_denum, ss, pset1<Packet>(6603948711360)); + Packet ss4_denum = pmadd(ss3_denum, ss, pset1<Packet>(346781323848960)); + Packet ssin = pdiv(pmul(s, ss4_num), pmul(pset1<Packet>(45), ss4_denum)); + + Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int))); + + Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int))); + Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one))); + Packet sign_bit, sFinalRes; + if (ComputeBoth) { + Packet peven = peven_mask(x); + sign_bit = pselect((s), sign_sin, sign_cos); + sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos); + } else { + sign_bit = ComputeSine ? sign_sin : sign_cos; + sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin); + } + sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit + sFinalRes = pxor(sFinalRes, sign_bit); + + // If the inputs values are higher than that a value that the argument reduction can currently address, compute them + // using std::sin and std::cos + // TODO Remove it when huge angle argument reduction is implemented + if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) { + const int PacketSize = unpacket_traits<Packet>::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double x_cpy[PacketSize]; + pstoreu(x_cpy, x); + pstoreu(sincos_vals, sFinalRes); + for (int k = 0; k < PacketSize; ++k) { + double val = x_cpy[k]; + if (std::abs(val) > huge_th && (numext::isfinite)(val)) { + if (ComputeBoth) + sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val); + else + sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val); + } + } + sFinalRes = ploadu<Packet>(sincos_vals); + } + return sFinalRes; +} + +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) { + return psincos_double<true>(x); +} + +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) { + return psincos_double<false>(x); +} + // Generic implementation of acos(x). template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 9560de2..05cac5c 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -82,6 +82,14 @@ template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x); +/** \internal \returns sin(x) for double precision float */ +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x); + +/** \internal \returns cos(x) for double precision float */ +template <typename Packet> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x); + /** \internal \returns asin(x) for single precision float */ template <typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x); @@ -158,6 +166,8 @@ #define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET) \ EIGEN_DOUBLE_PACKET_FUNCTION(atan, PACKET) \ EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET) \ + EIGEN_DOUBLE_PACKET_FUNCTION(sin, PACKET) \ + EIGEN_DOUBLE_PACKET_FUNCTION(cos, PACKET) \ EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET) \ EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET)
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 71e5f5f..2c18b5d 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -5177,8 +5177,8 @@ HasLog = 1, HasATan = 1, #endif - HasSin = 0, - HasCos = 0, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasSqrt = 1, HasRsqrt = 1, HasTanh = 0,
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index e91ef4d..7bac3f9 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -218,6 +218,8 @@ HasCmp = 1, HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, HasSqrt = 1, @@ -2230,29 +2232,25 @@ kernel.packet[15] = _mm_unpackhi_epi64(u7, uf); } +EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) { + return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]); +} + +EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) { + return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]); +} + template <> EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket, const Packet2l& elsePacket) { - const __m128i zero = _mm_setzero_si128(); - const __m128i select = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]); - __m128i false_mask = pcmp_eq<Packet2l>(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); -#else - return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); -#endif + const __m128i true_mask = sse_blend_mask(ifPacket); + return pselect<Packet2l>(true_mask, thenPacket, elsePacket); } template <> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - const __m128i zero = _mm_setzero_si128(); - const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m128i false_mask = _mm_cmpeq_epi32(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); -#else - return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); -#endif + const __m128i true_mask = sse_blend_mask(ifPacket); + return pselect<Packet4i>(true_mask, thenPacket, elsePacket); } template <> EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket, @@ -2262,26 +2260,14 @@ template <> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - const __m128 zero = _mm_setzero_ps(); - const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m128 false_mask = _mm_cmpeq_ps(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_ps(thenPacket, elsePacket, false_mask); -#else - return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); -#endif + const __m128i true_mask = sse_blend_mask(ifPacket); + return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket); } template <> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - const __m128d zero = _mm_setzero_pd(); - const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); - __m128d false_mask = _mm_cmpeq_pd(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_pd(thenPacket, elsePacket, false_mask); -#else - return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); -#endif + const __m128i true_mask = sse_blend_mask(ifPacket); + return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket); } // Scalar path for pmadd with FMA to ensure consistency with vectorized path.