// blob: 1a7cd03d4986298edd441087593631084f785280 [file] [log] [blame]
#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
#ifdef _MSC_VER
#include <emmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#endif
namespace Eigen {
namespace internal {
// Packet types used for QInt8 data in this header.
// Packet32q8i wraps a 256-bit integer register (32 x 8-bit lanes) and
// Packet16q8i wraps a 128-bit integer register (16 x 8-bit lanes).
// NOTE(review): the integer template argument is presumably a unique id that
// keeps wrappers over the same register type distinct -- confirm against the
// definition of eigen_packet_wrapper.
using Packet32q8i = eigen_packet_wrapper<__m256i, 10>;
using Packet16q8i = eigen_packet_wrapper<__m128i, 11>;
// Packet traits for the QInt8 scalar type: the full packet is Packet32q8i
// (32 coefficients) and the half packet is Packet16q8i.
template <>
struct packet_traits<QInt8> : default_packet_traits {
  using type = Packet32q8i;
  using half = Packet16q8i;

  // Basic vectorization properties.
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,
  };

  // Every arithmetic capability flag listed here is disabled for QInt8;
  // this header only provides set/load/store and a float -> QInt8 cast.
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 0,
    HasMax = 0,
    HasConj = 0,
    HasSetLinear = 0
  };
};
// Reverse mapping for the 256-bit packet: scalar type, half packet and
// load/store properties of Packet32q8i.
template <>
struct unpacket_traits<Packet32q8i> {
  using type = QInt8;
  using half = Packet16q8i;

  enum {
    size = 32,              // number of QInt8 coefficients per packet
    alignment = Aligned32,  // alignment required by the aligned load/store
    vectorizable = true,
    // No masked load/store variants are declared for this packet type.
    masked_load_available = false,
    masked_store_available = false
  };
};
// Reverse mapping for the 128-bit packet: scalar type, half packet and
// load/store properties of Packet16q8i. The half type is the packet itself,
// i.e. no smaller packet is defined.
template <>
struct unpacket_traits<Packet16q8i> {
  using type = QInt8;
  using half = Packet16q8i;

  enum {
    size = 16,  // number of QInt8 coefficients per packet
    // NOTE(review): Aligned32 is stricter than the 16-byte alignment that the
    // 128-bit aligned load/store intrinsics need; kept as-is -- confirm
    // whether this is intentional before relaxing it.
    alignment = Aligned32,
    vectorizable = true,
    // No masked load/store variants are declared for this packet type.
    masked_load_available = false,
    masked_store_available = false
  };
};
// Broadcasts the raw 8-bit value of `from` into all 32 lanes of a packet.
template <>
EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
  const __m256i broadcast = _mm256_set1_epi8(from.value);
  return broadcast;
}
// Loads 32 QInt8 values from `from` into a 256-bit packet; the pointer does
// not need to be aligned.
template <>
EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
  const __m256i* const src = reinterpret_cast<const __m256i*>(from);
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm256_loadu_si256(src);
}
// Loads 16 QInt8 values from `from` into a 128-bit packet; the pointer does
// not need to be aligned.
template <>
EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
  const __m128i* const src = reinterpret_cast<const __m128i*>(from);
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(src);
}
// Loads 32 QInt8 values from `from` into a 256-bit packet using the aligned
// load intrinsic; `from` must satisfy that intrinsic's alignment requirement.
template <>
EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
  const __m256i* const src = reinterpret_cast<const __m256i*>(from);
  EIGEN_DEBUG_ALIGNED_LOAD
  return _mm256_load_si256(src);
}
// Loads 16 QInt8 values from `from` into a 128-bit packet using the aligned
// load intrinsic; `from` must satisfy that intrinsic's alignment requirement.
template <>
EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
  const __m128i* const src = reinterpret_cast<const __m128i*>(from);
  EIGEN_DEBUG_ALIGNED_LOAD
  return _mm_load_si128(src);
}
// Writes the 32 lanes of `from` to `to`; the destination does not need to be
// aligned.
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
  __m256i* const dst = reinterpret_cast<__m256i*>(to);
  EIGEN_DEBUG_UNALIGNED_STORE
  _mm256_storeu_si256(dst, from.m_val);
}
// Writes the 16 lanes of `from` to `to`; the destination does not need to be
// aligned.
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
  __m128i* const dst = reinterpret_cast<__m128i*>(to);
  EIGEN_DEBUG_UNALIGNED_STORE
  _mm_storeu_si128(dst, from.m_val);
}
// Writes the 32 lanes of `from` to `to` using the aligned store intrinsic;
// `to` must satisfy that intrinsic's alignment requirement.
template <>
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
  __m256i* const dst = reinterpret_cast<__m256i*>(to);
  EIGEN_DEBUG_ALIGNED_STORE
  _mm256_store_si256(dst, from.m_val);
}
// Writes the 16 lanes of `from` to `to` using the aligned store intrinsic;
// `to` must satisfy that intrinsic's alignment requirement.
template <>
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
  __m128i* const dst = reinterpret_cast<__m128i*>(to);
  EIGEN_DEBUG_ALIGNED_STORE
  _mm_store_si128(dst, from.m_val);
}
// Packet of single-precision floats held in a 256-bit register; used as the
// source packet type of the float -> QInt8 cast below.
// NOTE(review): presumably this repeats a typedef that Eigen's own AVX packet
// header already provides -- confirm both declarations name the same type.
typedef __m256 Packet8f;
// Cast traits for float -> QInt8: the cast is vectorized and consumes four
// source packets to produce one target packet (matching the four-argument
// pcast specialization in this header).
template <>
struct type_casting_traits<float, QInt8> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 4,
    TgtCoeffRatio = 1
  };
};
// Converts four packets of 8 floats into one packet of 32 QInt8 values.
//
// Each input is first converted to 32-bit integers, then narrowed to 16-bit
// and finally to 8-bit using the signed-saturating pack instructions. The
// output lanes are ordered a[0..7], b[0..7], c[0..7], d[0..7].
template <>
EIGEN_STRONG_INLINE Packet32q8i
pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b,
                             const Packet8f& c, const Packet8f& d) {
  // Step 1: float -> int32, eight lanes per input.
  const __m256i a_i32 = _mm256_cvtps_epi32(a);
  const __m256i b_i32 = _mm256_cvtps_epi32(b);
  const __m256i c_i32 = _mm256_cvtps_epi32(c);
  const __m256i d_i32 = _mm256_cvtps_epi32(d);

  // Step 2: int32 -> int16. The pack instruction works on 128-bit registers,
  // so each 256-bit vector is split into its lower and upper halves first.
  const __m128i a_i16 = _mm_packs_epi32(_mm256_castsi256_si128(a_i32),
                                        _mm256_extractf128_si256(a_i32, 1));
  const __m128i b_i16 = _mm_packs_epi32(_mm256_castsi256_si128(b_i32),
                                        _mm256_extractf128_si256(b_i32, 1));
  const __m128i c_i16 = _mm_packs_epi32(_mm256_castsi256_si128(c_i32),
                                        _mm256_extractf128_si256(c_i32, 1));
  const __m128i d_i16 = _mm_packs_epi32(_mm256_castsi256_si128(d_i32),
                                        _mm256_extractf128_si256(d_i32, 1));

  // Step 3: int16 -> int8; (a, b) fill the lower 128 bits of the result and
  // (c, d) fill the upper 128 bits.
  const __m128i ab_i8 = _mm_packs_epi16(a_i16, b_i16);
  const __m128i cd_i8 = _mm_packs_epi16(c_i16, d_i16);

  // Step 4: assemble the two 128-bit halves into the 256-bit result.
  return _mm256_insertf128_si256(_mm256_castsi128_si256(ab_i8), cd_i8, 1);
}
} // end namespace internal
} // end namespace Eigen
#endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_