44 #include <private/qsimd_p.h>
766 #if defined(__SSSE3__)
// Table of byte-index patterns, one 16-byte row per pattern. The table is
// declared with 16-byte alignment; the swap loop further down reads a row
// through a __m128i pointer with the aligned load _mm_load_si128.
767 using ShuffleMask =
uchar[16];
768 alignas(16)
static const ShuffleMask shuffleMasks[3] = {
// Row 0: indices reverse the two bytes of every 2-byte group.
770 {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
// Row 1: indices reverse the four bytes of every 4-byte group.
772 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12},
// Row 2: indices reverse the eight bytes of every 8-byte group.
774 {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}
778 const __m128i *shuffleMaskPtr) noexcept
// Load the 16-byte shuffle pattern once. _mm_load_si128 is the aligned load,
// so shuffleMaskPtr has to point at 16-byte-aligned storage.
781 const __m128i shuffleMask = _mm_load_si128(shuffleMaskPtr);
// Copy the same 128-bit pattern into both halves of a 256-bit register;
// _mm256_shuffle_epi8 shuffles each 128-bit half independently.
// NOTE(review): the 256-bit intrinsics below presumably sit behind an AVX2
// guard that this listing does not show — confirm against the full file.
784 const __m256i shuffleMask256 = _mm256_inserti128_si256(_mm256_castsi128_si256(shuffleMask), shuffleMask, 1);
// 32 bytes per iteration: unaligned load from src + i, shuffle, unaligned
// store to dst + i. Runs while a whole 32-byte vector still fits in `bytes`.
785 for ( ;
i +
sizeof(__m256i) <= bytes;
i +=
sizeof(__m256i)) {
786 __m256i
data = _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(
src +
i));
787 data = _mm256_shuffle_epi8(
data, shuffleMask256);
788 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(
dst +
i),
data);
// 32 bytes per iteration again, this time as two adjacent 16-byte vectors
// (the "+ 1" is pointer arithmetic on __m128i*, i.e. the next 16 bytes).
791 for ( ;
i + 2 *
sizeof(__m128i) <= bytes;
i += 2 *
sizeof(__m128i)) {
792 __m128i data1 = _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(
src +
i));
793 __m128i data2 = _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(
src +
i) + 1);
794 data1 = _mm_shuffle_epi8(data1, shuffleMask);
795 data2 = _mm_shuffle_epi8(data2, shuffleMask);
796 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(
dst +
i), data1);
797 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(
dst +
i) + 1, data2);
// At most one more 16-byte vector: handled only if it fits entirely within
// `bytes`, after which i is advanced past it.
801 if (
i +
sizeof(__m128i) <= bytes) {
802 __m128i
data = _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(
src +
i));
803 data = _mm_shuffle_epi8(
data, shuffleMask);
804 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(
dst +
i),
data);
805 i +=
sizeof(__m128i);
// SSSE3 variant, templated on the element type T.
// NOTE(review): the function name and parameter list are not part of this
// listing; the call site further down suggests simdSwapLoop<T>(src, bytes, dst).
811 template <
typename T>
static Q_ALWAYS_INLINE
// Reinterpret the mask table as 128-bit vectors, starting at its first row.
// NOTE(review): how the row matching sizeof(T) is chosen is not visible in
// this listing — confirm against the full file.
814 auto shuffleMaskPtr =
reinterpret_cast<const __m128i *
>(shuffleMasks[0]);
// Run the vector loop; its result is used as the starting offset of the
// loop below.
816 size_t i = sseSwapLoop(
src, bytes,
dst, shuffleMaskPtr);
// Tail loop: advances one T at a time until `bytes` is reached. The second
// counter _i bounds the loop to fewer than sizeof(__m128i) bytes of progress.
819 for (
size_t _i = 0;
i < bytes && _i <
sizeof(__m128i);
i +=
sizeof(
T), _i +=
sizeof(
T))
825 #elif defined(__SSE2__)
// Primary simdSwapLoop template on the SSE2-only path. All three parameters
// are unnamed, so whatever the body (not shown in this listing) does, it
// cannot read its arguments.
826 template <
typename T>
static
827 size_t simdSwapLoop(
const uchar *,
size_t,
uchar *) noexcept
// Explicit specialization of simdSwapLoop for quint16, built from SSE2
// intrinsics only (no byte shuffle).
834 template <>
size_t simdSwapLoop<quint16>(
const uchar *
src,
size_t bytes,
uchar *
dst) noexcept
// Swaps the two bytes of every 16-bit lane of `data` in place:
//   lows  = each lane shifted right by 8 (old high byte now in the low byte)
//   highs = each lane shifted left by 8  (old low byte now in the high byte)
// The two results have no set bits in common, so XOR merges them like OR.
836 auto swapEndian = [](__m128i &
data) {
837 __m128i lows = _mm_srli_epi16(
data, 8);
838 __m128i highs = _mm_slli_epi16(
data, 8);
839 data = _mm_xor_si128(lows, highs);
// 32 bytes per iteration as two adjacent 16-byte vectors, using unaligned
// loads and stores.
// NOTE(review): no call to swapEndian appears between the loads and stores
// in this listing (the embedded numbers skip 846-847); presumably the
// omitted lines apply it to data1 and data2 — confirm against the full file.
843 for ( ;
i + 2 *
sizeof(__m128i) <= bytes;
i += 2 *
sizeof(__m128i)) {
844 __m128i data1 = _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(
src +
i));
845 __m128i data2 = _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(
src +
i) + 1);
848 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(
dst +
i), data1);
849 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(
dst +
i) + 1, data2);
// At most one more 16-byte vector, only if it fits entirely within `bytes`.
// NOTE(review): the embedded numbers skip 854 between the load and the
// store; presumably that omitted line applies swapEndian — confirm.
852 if (
i +
sizeof(__m128i) <= bytes) {
853 __m128i
data = _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(
src +
i));
855 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(
dst +
i),
data);
856 i +=
sizeof(__m128i);
// Tail loop: advances one quint16 at a time until `bytes` is reached; _i
// bounds the loop to fewer than sizeof(__m128i) bytes of progress.
860 for (
size_t _i = 0 ;
i < bytes && _i <
sizeof(__m128i);
i +=
sizeof(
quint16), _i +=
sizeof(
quint16))
// Another simdSwapLoop template with the same parameter types, again with
// every parameter unnamed, so its body (not shown in this listing) cannot
// read its arguments.
// NOTE(review): presumably the fallback for builds without SSE2/SSSE3; the
// preprocessor branch that selects it is not visible here — confirm.
867 template <
typename T>
static Q_ALWAYS_INLINE
868 size_t simdSwapLoop(
const uchar *,
size_t,
uchar *) noexcept
// Generic swap driver, templated on the element type T.
// NOTE(review): the function name and parameters are not part of this
// listing; the callers further down invoke it as bswapLoop<T>(src, n, dst).
874 template <
typename T>
static Q_ALWAYS_INLINE
// Let the SIMD helper go first; its return value is where the loop below
// starts.
888 size_t i = simdSwapLoop<T>(
src,
n,
dst);
// Cover the remainder up to n, stepping by sizeof(T). The step size
// indicates that i and n are measured in bytes.
890 for (;
i <
n;
i +=
sizeof(
T))
// 16-bit case: n << 1 doubles n before it is passed as bswapLoop's size
// argument (presumably turning an element count into a byte count — confirm).
901 return bswapLoop<quint16>(
src,
n << 1,
dst);
// 32-bit case: n << 2 multiplies n by 4 before it is passed as bswapLoop's
// size argument (presumably turning an element count into a byte count —
// confirm).
910 return bswapLoop<quint32>(
src,
n << 2,
dst);
// 64-bit case: n << 3 multiplies n by 8 before it is passed as bswapLoop's
// size argument (presumably turning an element count into a byte count —
// confirm).
919 return bswapLoop<quint64>(
src,
n << 3,
dst);
small capitals from c petite p scientific i
[1]
constexpr uint qCountTrailingZeroBits(quint32 v) noexcept
void * qbswap< 8 >(const void *source, qsizetype n, void *dest) noexcept
void * qbswap< 2 >(const void *source, qsizetype n, void *dest) noexcept
void * qbswap< 4 >(const void *source, qsizetype n, void *dest) noexcept
constexpr T qbswap(T source)
QT_BEGIN_INCLUDE_NAMESPACE typedef unsigned char uchar
GLint GLsizei GLsizei GLenum GLenum GLsizei void * data
GLsizei GLsizei GLchar * source