QtBase  v6.3.1
41 #include <qstringconverter.h>
42 #include <private/qstringconverter_p.h>
43 #include "qendian.h"
45 #include "private/qsimd_p.h"
46 #include "private/qstringiterator_p.h"
47 #include "private/qtools_p.h"
48 #include "qbytearraymatcher.h"
50 #ifdef Q_OS_WIN
51 #include <qt_windows.h>
53 #include <QtCore/qvarlengtharray.h>
54 #endif // !QT_BOOTSTRAPPED
55 #endif
57 #if __has_include(<bit>) && __cplusplus > 201703L
58 #include <bit>
59 #endif
// Indices into state->state_data as used by the UTF-16 and UTF-32 codecs in
// this file: slot Endian remembers the byte order between calls, slot Data
// holds the byte(s) or code unit of a character that was split across calls.
63 enum { Endian = 0, Data = 1 };
// The three bytes of the UTF-8 byte order mark (U+FEFF encoded as UTF-8).
// The UTF-8 encoder writes them out when a BOM is requested and the decoders
// compare the start of their input against them to skip a leading BOM.
65 static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
67 #if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
68  || defined(__ARM_NEON__)
69 static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
70 {
71 #if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
72  return std::bit_width(v) - 1;
73 #else
75  // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
76  // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
77  // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
78  result ^= sizeof(unsigned) * 8 - 1;
79  return result;
80 #endif
81 }
82 #endif
84 #if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
85 static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
86 {
87  // do sixteen characters at a time
88  for ( ; end - src >= 16; src += 16, dst += 16) {
89 # ifdef __AVX2__
90  __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
91  __m128i data1 = _mm256_castsi256_si128(data);
92  __m128i data2 = _mm256_extracti128_si256(data, 1);
93 # else
94  __m128i data1 = _mm_loadu_si128((const __m128i*)src);
95  __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
96 # endif
98  // check if everything is ASCII
99  // the highest ASCII value is U+007F
100  // Do the packing directly:
101  // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
102  // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
103  // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
104  // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
105  // "non-ASCII", but it's an acceptable compromise.
106  __m128i packed = _mm_packus_epi16(data1, data2);
107  __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
109  // store, even if there are non-ASCII characters here
110  _mm_storeu_si128((__m128i*)dst, packed);
112  // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
113  ushort n = ~_mm_movemask_epi8(nonAscii);
114  if (n) {
115  // find the next probable ASCII character
116  // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
117  // characters still coming
118  nextAscii = src + qBitScanReverse(n) + 1;
121  dst += n;
122  src += n;
123  return false;
124  }
125  }
127  if (end - src >= 8) {
128  // do eight characters at a time
129  __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
130  __m128i packed = _mm_packus_epi16(data, data);
131  __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
133  // store even non-ASCII
134  _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
136  uchar n = ~_mm_movemask_epi8(nonAscii);
137  if (n) {
138  nextAscii = src + qBitScanReverse(n) + 1;
140  dst += n;
141  src += n;
142  return false;
143  }
144  }
146  return src == end;
147 }
149 static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
150 {
151  // do sixteen characters at a time
152  for ( ; end - src >= 16; src += 16, dst += 16) {
153  __m128i data = _mm_loadu_si128((const __m128i*)src);
155 #ifdef __AVX2__
156  const int BitSpacing = 2;
157  // load and zero extend to an YMM register
158  const __m256i extended = _mm256_cvtepu8_epi16(data);
160  uint n = _mm256_movemask_epi8(extended);
161  if (!n) {
162  // store
163  _mm256_storeu_si256((__m256i*)dst, extended);
164  continue;
165  }
166 #else
167  const int BitSpacing = 1;
169  // check if everything is ASCII
170  // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
171  uint n = _mm_movemask_epi8(data);
172  if (!n) {
173  // unpack
174  _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
175  _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
176  continue;
177  }
178 #endif
180  // copy the front part that is still ASCII
181  while (!(n & 1)) {
182  *dst++ = *src++;
183  n >>= BitSpacing;
184  }
186  // find the next probable ASCII character
187  // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
188  // characters still coming
189  n = qBitScanReverse(n);
190  nextAscii = src + (n / BitSpacing) + 1;
191  return false;
193  }
195  if (end - src >= 8) {
196  __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
197  uint n = _mm_movemask_epi8(data) & 0xff;
198  if (!n) {
199  // unpack and store
200  _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
201  } else {
202  while (!(n & 1)) {
203  *dst++ = *src++;
204  n >>= 1;
205  }
207  n = qBitScanReverse(n);
208  nextAscii = src + n + 1;
209  return false;
210  }
211  }
213  return src == end;
214 }
216 static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
217 {
218 #ifdef __AVX2__
219  // do 32 characters at a time
220  // (this is similar to simdTestMask in qstring.cpp)
221  const __m256i mask = _mm256_set1_epi8(0x80);
222  for ( ; end - src >= 32; src += 32) {
223  __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
224  if (_mm256_testz_si256(mask, data))
225  continue;
227  uint n = _mm256_movemask_epi8(data);
228  Q_ASSUME(n);
230  // find the next probable ASCII character
231  // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
232  // characters still coming
233  nextAscii = src + qBitScanReverse(n) + 1;
235  // return the non-ASCII character
236  return src + qCountTrailingZeroBits(n);
237  }
238 #endif
240  // do sixteen characters at a time
241  for ( ; end - src >= 16; src += 16) {
242  __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
244  // check if everything is ASCII
245  // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
246  uint n = _mm_movemask_epi8(data);
247  if (!n)
248  continue;
250  // find the next probable ASCII character
251  // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
252  // characters still coming
253  nextAscii = src + qBitScanReverse(n) + 1;
255  // return the non-ASCII character
256  return src + qCountTrailingZeroBits(n);
257  }
259  // do four characters at a time
260  for ( ; end - src >= 4; src += 4) {
261  quint32 data = qFromUnaligned<quint32>(src);
262  data &= 0x80808080U;
263  if (!data)
264  continue;
266  // We don't try to guess which of the three bytes is ASCII and which
267  // one isn't. The chance that at least two of them are non-ASCII is
268  // better than 75%.
269  nextAscii = src;
270  return src;
271  }
272  nextAscii = end;
273  return src;
274 }
276 // Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
277 // and advance src8 and src16 to the first character that could not be compared
278 static void simdCompareAscii(const char8_t *&src8, const char8_t *end8, const char16_t *&src16, const char16_t *end16)
279 {
280  int bitSpacing = 1;
281  qptrdiff len = qMin(end8 - src8, end16 - src16);
282  qptrdiff offset = 0;
283  uint mask = 0;
285  // do sixteen characters at a time
286  for ( ; offset + 16 < len; offset += 16) {
287  __m128i data8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src8 + offset));
288 #ifdef __AVX2__
289  // AVX2 version, use 256-bit registers and VPMOVXZBW
290  __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
292  // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
293  __m256i datax8 = _mm256_cvtepu8_epi16(data8);
294  mask = _mm256_movemask_epi8(datax8);
295  if (mask)
296  break;
298  // compare Latin1 to UTF-16
299  __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
300  mask = ~_mm256_movemask_epi8(latin1cmp);
301  if (mask)
302  break;
303 #else
304  // non-AVX2 code
305  __m128i datalo16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
306  __m128i datahi16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset) + 1);
308  // expand US-ASCII as if it were Latin1, we'll confirm later
309  __m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
310  __m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());
312  // compare Latin1 to UTF-16
313  __m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
314  __m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
315  mask = _mm_movemask_epi8(latin1cmphi) << 16;
316  mask |= ushort(_mm_movemask_epi8(latin1cmplo));
317  mask = ~mask;
318  if (mask)
319  break;
321  // confirm it was US-ASCII
322  mask = _mm_movemask_epi8(data8);
323  if (mask) {
324  bitSpacing = 0;
325  break;
326  }
327 #endif
328  }
330  // helper for comparing 4 or 8 characters
331  auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
332  // n = 4 -> sizemask = 0xff
333  // n = 8 -> sizemask = 0xffff
334  unsigned sizemask = (1U << (2 * n)) - 1;
336  // expand as if Latin1
337  data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
339  // compare and confirm it's US-ASCII
340  __m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
341  mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
342  mask |= _mm_movemask_epi8(data8);
343  if (mask == 0)
344  offset += n;
345  };
347  // do eight characters at a time
348  if (mask == 0 && offset + 8 < len) {
349  __m128i data8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src8 + offset));
350  __m128i data16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
351  cmp_lt_16(8, data8, data16);
352  }
354  // do four characters
355  if (mask == 0 && offset + 4 < len) {
356  __m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
357  __m128i data16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src16 + offset));
358  cmp_lt_16(4, data8, data16);
359  }
361  // correct the source pointers to point to the first character we couldn't deal with
362  if (mask)
363  offset += qCountTrailingZeroBits(mask) >> bitSpacing;
364  src8 += offset;
365  src16 += offset;
366 }
367 #elif defined(__ARM_NEON__)
368 static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
369 {
370  uint16x8_t maxAscii = vdupq_n_u16(0x7f);
371  uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
372  uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
374  // do sixteen characters at a time
375  for ( ; end - src >= 16; src += 16, dst += 16) {
376  // load 2 lanes (or: "load interleaved")
377  uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
379  // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
380  // add those together into a scalar, and merge the scalars.
381  uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
382  | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
384  // merge the two lanes by shifting the values of the second by 8 and inserting them
385  uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
387  // store, even if there are non-ASCII characters here
388  vst1q_u8(dst, vreinterpretq_u8_u16(out));
390  if (nonAscii) {
391  // find the next probable ASCII character
392  // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
393  // characters still coming
394  nextAscii = src + qBitScanReverse(nonAscii) + 1;
396  nonAscii = qCountTrailingZeroBits(nonAscii);
397  dst += nonAscii;
398  src += nonAscii;
399  return false;
400  }
401  }
402  return src == end;
403 }
405 static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
406 {
407  // do eight characters at a time
408  uint8x8_t msb_mask = vdup_n_u8(0x80);
409  uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
410  for ( ; end - src >= 8; src += 8, dst += 8) {
411  uint8x8_t c = vld1_u8(src);
412  uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
413  if (!n) {
414  // store
415  vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
416  continue;
417  }
419  // copy the front part that is still ASCII
420  while (!(n & 1)) {
421  *dst++ = *src++;
422  n >>= 1;
423  }
425  // find the next probable ASCII character
426  // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
427  // characters still coming
428  n = qBitScanReverse(n);
429  nextAscii = src + n + 1;
430  return false;
432  }
433  return src == end;
434 }
436 static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
437 {
438  // The SIMD code below is untested, so just force an early return until
439  // we've had the time to verify it works.
440  nextAscii = end;
441  return src;
443  // do eight characters at a time
444  uint8x8_t msb_mask = vdup_n_u8(0x80);
445  uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
446  for ( ; end - src >= 8; src += 8) {
447  uint8x8_t c = vld1_u8(src);
448  uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
449  if (!n)
450  continue;
452  // find the next probable ASCII character
453  // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
454  // characters still coming
455  nextAscii = src + qBitScanReverse(n) + 1;
457  // return the non-ASCII character
458  return src + qCountTrailingZeroBits(n);
459  }
460  nextAscii = end;
461  return src;
462 }
464 static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
465 {
466 }
467 #else
468 static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
469 {
470  return false;
471 }
473 static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
474 {
475  return false;
476 }
478 static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
479 {
480  nextAscii = end;
481  return src;
482 }
484 static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
485 {
486 }
487 #endif
// Bit kept in state->internalState: set by the converters in this file once
// the BOM/header of a stream has been dealt with (written on output or
// examined on input), so that it is not handled a second time.
489 enum { HeaderDone = 1 };
492 {
493  qsizetype len = in.size();
495  // create a QByteArray with the worst case scenario size
497  uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
498  const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
499  const char16_t *const end = src + len;
501  while (src != end) {
502  const char16_t *nextAscii = end;
503  if (simdEncodeAscii(dst, nextAscii, src, end))
504  break;
506  do {
507  char16_t u = *src++;
508  int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
509  if (res < 0) {
510  // encoding error - append '?'
511  *dst++ = '?';
512  }
513  } while (src < nextAscii);
514  }
516  result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
517  return result;
518 }
521 {
522  QByteArray ba(3*in.size() +3, Qt::Uninitialized);
523  char *end = convertFromUnicode(ba.data(), in, state);
524  ba.truncate(end - ba.data());
525  return ba;
526 }
529 {
530  Q_ASSERT(state);
531  const QChar *uc = in.data();
532  qsizetype len = in.length();
533  if (!len)
534  return out;
536  auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
538  *cursor++ = 0;
539  } else {
540  // QChar::replacement encoded in utf8
541  *cursor++ = 0xef;
542  *cursor++ = 0xbf;
543  *cursor++ = 0xbd;
544  }
545  return cursor;
546  };
548  uchar *cursor = reinterpret_cast<uchar *>(out);
549  const char16_t *src = reinterpret_cast<const char16_t *>(uc);
550  const char16_t *const end = src + len;
552  if (!(state->flags & QStringDecoder::Flag::Stateless)) {
553  if (state->remainingChars) {
554  int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
555  if (res < 0)
556  cursor = appendReplacementChar(cursor);
557  state->state_data[0] = 0;
558  state->remainingChars = 0;
559  } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
560  // append UTF-8 BOM
561  *cursor++ = utf8bom[0];
562  *cursor++ = utf8bom[1];
563  *cursor++ = utf8bom[2];
564  state->internalState |= HeaderDone;
565  }
566  }
568  while (src != end) {
569  const char16_t *nextAscii = end;
570  if (simdEncodeAscii(cursor, nextAscii, src, end))
571  break;
573  do {
574  char16_t uc = *src++;
575  int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
576  if (Q_LIKELY(res >= 0))
577  continue;
579  if (res == QUtf8BaseTraits::Error) {
580  // encoding error
581  ++state->invalidChars;
582  cursor = appendReplacementChar(cursor);
583  } else if (res == QUtf8BaseTraits::EndOfString) {
585  ++state->invalidChars;
586  cursor = appendReplacementChar(cursor);
587  } else {
588  state->remainingChars = 1;
589  state->state_data[0] = uc;
590  }
591  return reinterpret_cast<char *>(cursor);
592  }
593  } while (src < nextAscii);
594  }
596  return reinterpret_cast<char *>(cursor);
597 }
600 {
601  // UTF-8 to UTF-16 always needs the exact same number of words or less:
602  // UTF-8 UTF-16
603  // 1 byte 1 word
604  // 2 bytes 1 word
605  // 3 bytes 1 word
606  // 4 bytes 2 words (one surrogate pair)
607  // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
608  // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
609  // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
610  //
611  // The table holds for invalid sequences too: we'll insert one replacement char
612  // per invalid byte.
614  QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
615  const QChar *end = convertToUnicode(data, in);
616  result.truncate(end - data);
617  return result;
618 }
638 {
639  char16_t *dst = reinterpret_cast<char16_t *>(buffer);
640  const uchar *const start = reinterpret_cast<const uchar *>(in.data());
641  const uchar *src = start;
642  const uchar *end = src + in.size();
644  // attempt to do a full decoding in SIMD
645  const uchar *nextAscii = end;
646  if (!simdDecodeAscii(dst, nextAscii, src, end)) {
647  // at least one non-ASCII entry
648  // check if we failed to decode the UTF-8 BOM; if so, skip it
649  if (Q_UNLIKELY(src == start)
650  && end - src >= 3
651  && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
652  src += 3;
653  }
655  while (src < end) {
656  nextAscii = end;
657  if (simdDecodeAscii(dst, nextAscii, src, end))
658  break;
660  do {
661  uchar b = *src++;
662  int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
663  if (res < 0) {
664  // decoding error
666  }
667  } while (src < nextAscii);
668  }
669  }
671  return reinterpret_cast<QChar *>(dst);
672 }
675 {
676  // See above for buffer requirements for stateless decoding. However, that
677  // fails if the state is not empty. The following situations can add to the
678  // requirements:
679  // state contains chars starts with requirement
680  // 1 of 2 bytes valid continuation 0
681  // 2 of 3 bytes same 0
682  // 3 bytes of 4 same +1 (need to insert surrogate pair)
683  // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
684  // 2 of 3 bytes same +1 (same)
685  // 3 of 4 bytes same +1 (same)
686  QString result(in.size() + 1, Qt::Uninitialized);
687  QChar *end = convertToUnicode(result.data(), in, state);
688  result.truncate(end - result.constData());
689  return result;
690 }
693 {
694  qsizetype len = in.size();
696  Q_ASSERT(state);
697  if (!len)
698  return out;
701  char16_t replacement = QChar::ReplacementCharacter;
703  replacement = QChar::Null;
705  int res;
706  uchar ch = 0;
708  char16_t *dst = reinterpret_cast<char16_t *>(out);
709  const uchar *src = reinterpret_cast<const uchar *>(in.data());
710  const uchar *end = src + len;
712  if (!(state->flags & QStringConverter::Flag::Stateless)) {
713  bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
714  if (state->remainingChars || !headerdone) {
715  // handle incoming state first
716  uchar remainingCharsData[4]; // longest UTF-8 sequence possible
717  qsizetype remainingCharsCount = state->remainingChars;
718  qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
720  memset(remainingCharsData, 0, sizeof(remainingCharsData));
721  memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
722  memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
724  const uchar *begin = &remainingCharsData[1];
725  res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
726  static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
727  if (res == QUtf8BaseTraits::Error) {
728  ++state->invalidChars;
729  *dst++ = replacement;
730  ++src;
731  } else if (res == QUtf8BaseTraits::EndOfString) {
732  // if we got EndOfString again, then there were too few bytes in src;
733  // copy to our state and return
734  state->remainingChars = remainingCharsCount + newCharsToCopy;
735  memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
736  return out;
737  } else if (!headerdone) {
738  // eat the UTF-8 BOM
739  if (dst[-1] == 0xfeff)
740  --dst;
741  }
742  state->internalState |= HeaderDone;
744  // adjust src now that we have maybe consumed a few chars
745  if (res >= 0) {
746  Q_ASSERT(res > remainingCharsCount);
747  src += res - remainingCharsCount;
748  }
749  }
750  } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
751  // stateless, remove initial BOM
752  if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
753  // skip BOM
754  src += 3;
755  }
757  // main body, stateless decoding
758  res = 0;
759  const uchar *nextAscii = src;
760  while (res >= 0 && src < end) {
761  if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
762  break;
764  ch = *src++;
765  res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
766  if (res == QUtf8BaseTraits::Error) {
767  res = 0;
768  ++state->invalidChars;
769  *dst++ = replacement;
770  }
771  }
774  // unterminated UTF sequence
777  ++state->invalidChars;
778  while (src++ < end) {
780  ++state->invalidChars;
781  }
782  state->remainingChars = 0;
783  } else {
784  --src; // unread the byte in ch
785  state->remainingChars = end - src;
786  memcpy(&state->state_data[0], src, end - src);
787  }
788  } else {
789  state->remainingChars = 0;
790  }
792  return reinterpret_cast<QChar *>(dst);
793 }
796 {
797  struct NoOutput {};
798  static void appendUtf16(const NoOutput &, char16_t) {}
799  static void appendUcs4(const NoOutput &, char32_t) {}
800 };
803 {
804  const uchar *src = reinterpret_cast<const uchar *>(in.data());
805  const uchar *end = src + in.size();
806  const uchar *nextAscii = src;
807  bool isValidAscii = true;
809  while (src < end) {
810  if (src >= nextAscii)
811  src = simdFindNonAscii(src, end, nextAscii);
812  if (src == end)
813  break;
815  do {
816  uchar b = *src++;
817  if ((b & 0x80) == 0)
818  continue;
820  isValidAscii = false;
822  int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
823  if (res < 0) {
824  // decoding error
825  return { false, false };
826  }
827  } while (src < nextAscii);
828  }
830  return { true, isValidAscii };
831 }
834 {
835  auto src1 = reinterpret_cast<const char8_t *>(utf8.data());
836  auto end1 = src1 + utf8.size();
837  auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
838  auto end2 = src2 + utf16.size();
840  do {
841  simdCompareAscii(src1, end1, src2, end2);
843  if (src1 < end1 && src2 < end2) {
844  char32_t uc1 = *src1++;
845  char32_t uc2 = *src2++;
847  if (uc1 >= 0x80) {
848  char32_t *output = &uc1;
849  int res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
850  if (res < 0) {
851  // decoding error
853  }
855  // Only decode the UTF-16 surrogate pair if the UTF-8 code point
856  // wasn't US-ASCII (a surrogate cannot match US-ASCII).
857  if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
858  uc2 = QChar::surrogateToUcs4(uc2, *src2++);
859  }
861  if (uc1 != uc2)
862  return int(uc1) - int(uc2);
863  }
864  } while (src1 < end1 && src2 < end2);
866  // the shorter string sorts first
867  return (end1 > src1) - int(end2 > src2);
868 }
871 {
872  char32_t uc1 = QChar::Null;
873  auto src1 = reinterpret_cast<const uchar *>(utf8.data());
874  auto end1 = src1 + utf8.size();
875  auto src2 = reinterpret_cast<const uchar *>(s.latin1());
876  auto end2 = src2 + s.size();
878  while (src1 < end1 && src2 < end2) {
879  uchar b = *src1++;
880  char32_t *output = &uc1;
881  int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
882  if (res < 0) {
883  // decoding error
885  }
887  char32_t uc2 = *src2++;
888  if (uc1 != uc2)
889  return int(uc1) - int(uc2);
890  }
892  // the shorter string sorts first
893  return (end1 > src1) - (end2 > src2);
894 }
897 {
898  bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
899  qsizetype length = 2 * in.size();
900  if (writeBom)
901  length += 2;
904  char *end = convertFromUnicode(d.data(), in, state, endian);
905  Q_ASSERT(end - d.constData() == d.length());
906  Q_UNUSED(end);
907  return d;
908 }
911 {
912  Q_ASSERT(state);
913  bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
915  if (endian == DetectEndianness)
918  if (writeBom) {
919  // set them up the BOM
921  if (endian == BigEndianness)
922  qToBigEndian(bom.unicode(), out);
923  else
924  qToLittleEndian(bom.unicode(), out);
925  out += 2;
926  }
927  if (endian == BigEndianness)
928  qToBigEndian<char16_t>(in.data(), in.length(), out);
929  else
930  qToLittleEndian<char16_t>(in.data(), in.length(), out);
932  state->remainingChars = 0;
933  state->internalState |= HeaderDone;
934  return out + 2*in.length();
935 }
938 {
939  QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
940  QChar *qch = convertToUnicode(result.data(), in, state, endian);
941  result.truncate(qch - result.constData());
942  return result;
943 }
946 {
947  qsizetype len = in.size();
948  const char *chars = in.data();
950  Q_ASSERT(state);
952  if (endian == DetectEndianness)
953  endian = (DataEndianness)state->state_data[Endian];
955  const char *end = chars + len;
957  // make sure we can decode at least one char
958  if (state->remainingChars + len < 2) {
959  if (len) {
960  Q_ASSERT(state->remainingChars == 0 && len == 1);
961  state->remainingChars = 1;
962  state->state_data[Data] = *chars;
963  }
964  return out;
965  }
967  bool headerdone = state && state->internalState & HeaderDone;
969  headerdone = true;
971  if (!headerdone || state->remainingChars) {
972  uchar buf;
973  if (state->remainingChars)
974  buf = state->state_data[Data];
975  else
976  buf = *chars++;
978  // detect BOM, set endianness
979  state->internalState |= HeaderDone;
980  QChar ch(buf, *chars++);
981  if (endian == DetectEndianness) {
982  // someone set us up the BOM
983  if (ch == QChar::ByteOrderSwapped) {
984  endian = BigEndianness;
985  } else if (ch == QChar::ByteOrderMark) {
986  endian = LittleEndianness;
987  } else {
989  endian = BigEndianness;
990  } else {
991  endian = LittleEndianness;
992  }
993  }
994  }
995  if (endian == BigEndianness)
996  ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
997  if (headerdone || ch != QChar::ByteOrderMark)
998  *out++ = ch;
999  } else if (endian == DetectEndianness) {
1001  }
1003  qsizetype nPairs = (end - chars) >> 1;
1004  if (endian == BigEndianness)
1005  qFromBigEndian<char16_t>(chars, nPairs, out);
1006  else
1007  qFromLittleEndian<char16_t>(chars, nPairs, out);
1008  out += nPairs;
1010  state->state_data[Endian] = endian;
1011  state->remainingChars = 0;
1012  if ((end - chars) & 1) {
1015  } else {
1016  state->remainingChars = 1;
1017  state->state_data[Data] = *(end - 1);
1018  }
1019  } else {
1020  state->state_data[Data] = 0;
1021  }
1023  return out;
1024 }
1027 {
1028  bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1029  qsizetype length = 4*in.size();
1030  if (writeBom)
1031  length += 4;
1033  char *end = convertFromUnicode(ba.data(), in, state, endian);
1034  ba.truncate(end - ba.constData());
1035  return ba;
1036 }
1039 {
1040  Q_ASSERT(state);
1042  bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1043  if (endian == DetectEndianness)
1046  if (writeBom) {
1047  // set them up the BOM
1048  if (endian == BigEndianness) {
1049  out[0] = 0;
1050  out[1] = 0;
1051  out[2] = (char)0xfe;
1052  out[3] = (char)0xff;
1053  } else {
1054  out[0] = (char)0xff;
1055  out[1] = (char)0xfe;
1056  out[2] = 0;
1057  out[3] = 0;
1058  }
1059  out += 4;
1060  state->internalState |= HeaderDone;
1061  }
1063  const QChar *uc = in.data();
1064  const QChar *end = in.data() + in.length();
1065  QChar ch;
1066  char32_t ucs4;
1067  if (state->remainingChars == 1) {
1068  auto character = state->state_data[Data];
1069  Q_ASSERT(character <= 0xFFFF);
1070  ch = QChar(character);
1071  // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1072  state->remainingChars = 0;
1073  goto decode_surrogate;
1074  }
1076  while (uc < end) {
1077  ch = *uc++;
1078  if (Q_LIKELY(!ch.isSurrogate())) {
1079  ucs4 = ch.unicode();
1080  } else if (Q_LIKELY(ch.isHighSurrogate())) {
1081 decode_surrogate:
1082  if (uc == end) {
1085  } else {
1086  state->remainingChars = 1;
1087  state->state_data[Data] = ch.unicode();
1088  return out;
1089  }
1090  } else if (uc->isLowSurrogate()) {
1091  ucs4 = QChar::surrogateToUcs4(ch, *uc++);
1092  } else {
1094  }
1095  } else {
1097  }
1098  if (endian == BigEndianness)
1099  qToBigEndian(ucs4, out);
1100  else
1101  qToLittleEndian(ucs4, out);
1102  out += 4;
1103  }
1105  return out;
1106 }
1109 {
1110  QString result;
1111  result.resize((in.size() + 7) >> 1); // worst case
1112  QChar *end = convertToUnicode(result.data(), in, state, endian);
1113  result.truncate(end - result.constData());
1114  return result;
1115 }
1118 {
 // Streaming decoder: consumes the bytes of `in` in groups of four, writes the decoded
 // UTF-16 units through `out` and returns the advanced output pointer. An incomplete
 // trailing group is carried to the next call in state->state_data[Data] together with
 // state->remainingChars.
 // NOTE(review): the signature and several interior lines are not visible in this
 // excerpt; the comments below describe only the statements that are shown.
1119  qsizetype len = in.size();
1120  const char *chars = in.data();
1122  Q_ASSERT(state);
 // When the caller leaves the byte order open, start from whatever the state recorded.
1123  if (endian == DetectEndianness)
1124  endian = (DataEndianness)state->state_data[Endian];
1126  const char *end = chars + len;
 // Reload the bytes of a partial group saved by a previous call.
1128  uchar tuple[4];
1129  memcpy(tuple, &state->state_data[Data], 4);
1131  // make sure we can decode at least one char
1132  if (state->remainingChars + len < 4) {
1133  if (len) {
 // Still fewer than four bytes in total: stash the new bytes and wait for more input.
1134  while (chars < end) {
1135  tuple[state->remainingChars] = *chars;
1136  ++state->remainingChars;
1137  ++chars;
1138  }
1139  Q_ASSERT(state->remainingChars < 4);
1140  memcpy(&state->state_data[Data], tuple, 4);
1141  }
1142  return out;
1143  }
1145  bool headerdone = state->internalState & HeaderDone;
 // NOTE(review): the condition guarding the next assignment is not visible here.
1147  headerdone = true;
 // `num` counts how many bytes of the current group are already in `tuple`.
1149  qsizetype num = state->remainingChars;
1150  state->remainingChars = 0;
 // The first group gets special treatment: it may be a byte order mark and/or may
 // have to be completed from bytes carried over in the state.
1152  if (!headerdone || endian == DetectEndianness || num) {
1153  while (num < 4)
1154  tuple[num++] = *chars++;
1155  if (endian == DetectEndianness) {
1156  // someone set us up the BOM?
1157  if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1158  endian = LittleEndianness;
1159  } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1160  endian = BigEndianness;
 // No BOM found: fall back to the byte order of the host.
1161  } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1162  endian = BigEndianness;
1163  } else {
1164  endian = LittleEndianness;
1165  }
1166  }
1167  char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
 // NOTE(review): part of the branch body that follows is not visible in this excerpt.
1168  if (headerdone || code != QChar::ByteOrderMark) {
1172  } else {
1173  *out++ = QChar(code);
1174  }
1175  }
1176  num = 0;
 // NOTE(review): the body of this branch is not visible in this excerpt.
1177  } else if (endian == DetectEndianness) {
1179  }
 // Remember the settled byte order and that the header has been handled.
1180  state->state_data[Endian] = endian;
1181  state->internalState |= HeaderDone;
 // Main loop: gather four bytes, decode them with the settled byte order and emit
 // every UTF-16 unit that QChar::fromUcs4() yields for the code point.
1183  while (chars < end) {
1184  tuple[num++] = *chars++;
1185  if (num == 4) {
1186  char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1187  for (char16_t c : QChar::fromUcs4(code))
1188  *out++ = c;
1189  num = 0;
1190  }
1191  }
 // Input ended in the middle of a group.
1193  if (num) {
 // NOTE(review): the Stateless branch body is not visible in this excerpt.
1194  if (state->flags & QStringDecoder::Flag::Stateless) {
1196  } else {
 // Save the partial group so the next call can complete it.
1197  state->state_data[Endian] = endian;
1198  state->remainingChars = num;
1199  memcpy(&state->state_data[Data], tuple, 4);
1200  }
1201  }
1203  return out;
1204 }
1206 #if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1207 static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
1208 {
1209  qsizetype length = in.size();
1210  const char *chars = in.data();
1212  Q_ASSERT(state);
1213  if (state->flags & QStringConverter::Flag::Stateless) // temporary
1214  state = nullptr;
1216  if (!chars || !length)
1217  return QString();
1219  qsizetype copyLocation = 0;
1220  qsizetype extra = 2;
1221  if (state && state->remainingChars) {
1222  copyLocation = state->remainingChars;
1223  extra += copyLocation;
1224  }
1225  qsizetype newLength = length + extra;
1226  char *mbcs = new char[newLength];
1227  //ensure that we have a NULL terminated string
1228  mbcs[newLength-1] = 0;
1229  mbcs[newLength-2] = 0;
1230  memcpy(&(mbcs[copyLocation]), chars, length);
1231  if (copyLocation) {
1232  //copy the last character from the state
1233  mbcs[0] = (char)state->state_data[0];
1234  state->remainingChars = 0;
1235  }
1236  const char *mb = mbcs;
1237  const char *next = 0;
1238  QString s;
1239  while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
1240  wchar_t wc[2] ={0};
1241  int charlength = next - mb;
1242  int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
1243  if (len>0) {
1244  s.append(QChar(wc[0]));
1245  } else {
1246  int r = GetLastError();
1247  //check if the character being dropped is the last character
1248  if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
1249  state->remainingChars = 1;
1250  state->state_data[0] = (char)*mb;
1251  }
1252  }
1253  mb = next;
1254  }
1255  delete [] mbcs;
1256  return s;
1257 }
1261 {
 // Decodes `in` from the system ANSI code page (CP_ACP) with MultiByteToWideChar and
 // returns the result as a QString. A trailing byte that cannot be converted yet is
 // kept in `state` and glued to the front of the next call's input.
 // NOTE(review): the signature, the declaration of the `wc` buffer and the `if` that
 // opens the first error branch of the retry loop are not visible in this excerpt.
1262  qsizetype length = in.size();
1264  Q_ASSERT(length < INT_MAX); // ### FIXME
1265  const char *mb = in.data();
 // The Win32 API takes an int length, hence the assertion above.
1266  int mblen = length;
1268  if (!mb || !mblen)
1269  return QString();
1272  int len;
 // `sp` holds the character produced from the pending byte, if any.
1273  QString sp;
1274  bool prepend = false;
1275  char state_data = 0;
1276  int remainingChars = 0;
1278  //save the current state information
1279  if (state) {
1280  state_data = (char)state->state_data[0];
1281  remainingChars = state->remainingChars;
1282  }
1284  //convert the pending character (if available)
1285  if (state && remainingChars) {
 // Pair the saved byte with the first new byte and convert the two on their own.
1286  char prev[3] = {0};
1287  prev[0] = state_data;
1288  prev[1] = mb[0];
1289  remainingChars = 0;
1290  len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1291  prev, 2, wc.data(), wc.length());
1292  if (len) {
1293  sp.append(QChar(wc[0]));
 // The single input byte was consumed by the pending character: nothing left to do.
1294  if (mblen == 1) {
1295  state->remainingChars = 0;
1296  return sp;
1297  }
 // Skip the byte that was just consumed and remember to prepend `sp` to the result.
1298  prepend = true;
1299  mb++;
1300  mblen--;
1301  wc[0] = 0;
1302  }
1303  }
 // Convert the remaining input in a single call; on failure look at the error,
 // adjust, and retry.
1305  while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
1306  mb, mblen, wc.data(), wc.length()))) {
1307  int r = GetLastError();
 // Query the required length (null output buffer) and grow `wc` to that size.
1309  const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1310  mb, mblen, 0, 0);
1311  wc.resize(wclen);
1312  } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1313  //find the last non NULL character
1314  while (mblen > 1 && !(mb[mblen-1]))
1315  mblen--;
1316  //check whether, we hit an invalid character in the middle
 // A byte has already been set aside (or nothing is left to trim): give up on the
 // bulk conversion and decode character by character instead.
1317  if ((mblen <= 1) || (remainingChars && state_data))
1318  return convertToUnicodeCharByChar(in, state);
1319  //Remove the last character and try again...
1320  state_data = mb[mblen-1];
1321  remainingChars = 1;
1322  mblen--;
1323  } else {
1324  // Fail.
1325  qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1326  break;
1327  }
1328  }
1330  if (len <= 0)
1331  return QString();
1333  if (wc[len-1] == 0) // len - 1: we don't want terminator
1334  --len;
1336  //save the new state information
1337  if (state) {
1338  state->state_data[0] = (char)state_data;
1339  state->remainingChars = remainingChars;
1340  }
1341  QString s((QChar*)wc.data(), len);
1342  if (prepend) {
1343  return sp+s;
1344  }
1345  return s;
1346 }
1349 {
 // Encodes the UTF-16 text in `in` to the system ANSI code page (CP_ACP) with
 // WideCharToMultiByte and returns the bytes.
 // NOTE(review): the signature and the `if` that opens the first error branch of the
 // retry loop are not visible in this excerpt.
1350  const QChar *ch = in.data();
1351  qsizetype uclen = in.size();
 // The Win32 API takes an int length.
1353  Q_ASSERT(uclen < INT_MAX); // ### FIXME
1354  Q_ASSERT(state);
1355  Q_UNUSED(state); // ### Fixme
1356  if (state->flags & QStringConverter::Flag::Stateless) // temporary
1357  state = nullptr;
 // A view without data yields a default-constructed array; a view with data but zero
 // length yields QByteArray("").
1359  if (!ch)
1360  return QByteArray();
1361  if (uclen == 0)
1362  return QByteArray("");
1363  BOOL used_def;
 // Start with a 4096-byte zero-filled buffer; it is resized below if the call fails.
1364  QByteArray mb(4096, 0);
1365  int len;
 // The loop runs only while the conversion reports failure (returns 0).
1366  while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
1367  mb.data(), mb.size()-1, 0, &used_def)))
1368  {
1369  int r = GetLastError();
 // Ask for the required size (null output buffer) and resize to that plus one byte.
1371  mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
1372  (const wchar_t*)ch, uclen,
1373  0, 0, 0, &used_def));
1374  // and try again...
1375  } else {
1376  // Fail. Probably can't happen in fact (dwFlags is 0).
1377 #ifndef QT_NO_DEBUG
1378  // Can't use qWarning(), as it'll recurse to handle %ls
1379  fprintf(stderr,
1380  "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1381  r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
1382 #endif
1383  break;
1384  }
1385  }
 // Shrink the buffer to the number of bytes reported by the conversion.
1386  mb.resize(len);
1387  return mb;
1388 }
1389 #endif
1392 {
 // Resets the conversion state. The state payload is wiped by the installed clearFn
 // hook when there is one, otherwise the four state_data slots are zeroed directly;
 // the counters and internal flags are reset in both cases.
 // NOTE(review): the signature of this function is not visible in this excerpt.
1393  if (clearFn)
1394  clearFn(this);
1395  else
1396  state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1397  remainingChars = 0;
1398  invalidChars = 0;
1399  internalState = 0;
1400 }
1403 {
1405 }
1407 static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
1408 {
1410 }
1412 static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1413 {
1415 }
1417 static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
1418 {
1420 }
1422 static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1423 {
1425 }
1427 static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
1428 {
1430 }
1433 {
1435 }
1437 static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
1438 {
1440 }
1442 static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1443 {
1445 }
1447 static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
1448 {
1450 }
1452 static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1453 {
1455 }
1457 static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
1458 {
1460 }
1462 void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
1465 {
 // Converts the in.size() bytes of `in` into `out` through qt_from_latin1() and
 // returns the position one past the last character, i.e. out + in.size().
 // The state is required by the interface but not used here.
 // NOTE(review): the signature of this function is not visible in this excerpt.
1466  Q_ASSERT(state);
1467  Q_UNUSED(state);
1469  qt_from_latin1(reinterpret_cast<char16_t *>(out), in.data(), size_t(in.size()));
1470  return out + in.size();
1471 }
1474 static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
1475 {
1476  Q_ASSERT(state);
1477  if (state->flags & QStringConverter::Flag::Stateless) // temporary
1478  state = nullptr;
1480  const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1481  qsizetype invalid = 0;
1482  for (qsizetype i = 0; i < in.length(); ++i) {
1483  if (in[i] > QChar(0xff)) {
1484  *out = replacement;
1485  ++invalid;
1486  } else {
1487  *out = (char)in[i].cell();
1488  }
1489  ++out;
1490  }
1491  if (state)
1492  state->invalidChars += invalid;
1493  return out;
1494 }
// Interface adapter for the "Locale" entry of the encoding table: copies the
// characters of `s` into `out` and returns the position one past the last one.
// NOTE(review): the line that produces `s` from `in` and `state` is not visible in
// this excerpt.
1496 static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
1497 {
 // s.length() counts QChars, so scale by sizeof(QChar) for the byte count.
1499  memcpy(out, s.constData(), s.length()*sizeof(QChar));
1500  return out + s.length();
1501 }
// Interface adapter for the "Locale" entry of the encoding table: copies the bytes
// of `s` into `out` and returns the position one past the last one.
// NOTE(review): the line that produces `s` from `in` and `state` is not visible in
// this excerpt.
1503 static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
1504 {
1506  memcpy(out, s.constData(), s.length());
1507  return out + s.length();
1508 }
1511 static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1512 static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1514 static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1515 static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1517 static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1518 static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1520 static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1521 static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
// Dispatch table with one row per supported encoding. Each row lists, in order: the
// encoding name, the decoder, the decoder's output-size estimator, the encoder and the
// encoder's output-size estimator.
// Rows are looked up by the integer value of QStringConverter::Encoding elsewhere in
// this file, so the row order must not be changed independently of that enum.
1650 const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
1651 {
1652  { "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
1653  { "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
1654  { "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
1655  { "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
1656  { "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
1657  { "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
1658  { "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
1659  { "ISO-8859-1", fromLatin1, fromLatin1Len, toLatin1, toLatin1Len },
 // The locale row reuses the UTF-8 size estimators.
1660  { "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
1661 };
1663 // match names case insensitive and skipping '-' and '_'
// Returns true when both names are fully consumed by the loop below.
// Both pointers are dereferenced unconditionally, so neither may be null.
// NOTE(review): the comparison that guards `return false` is not visible in this
// excerpt.
1664 static bool nameMatch(const char *a, const char *b)
1665 {
1666  while (*a && *b) {
 // Separators are skipped on either side without consuming a character of the other.
1667  if (*a == '-' || *a == '_') {
1668  ++a;
1669  continue;
1670  }
1671  if (*b == '-' || *b == '_') {
1672  ++b;
1673  continue;
1674  }
1676  return false;
1677  ++a;
1678  ++b;
1679  }
 // The loop stops as soon as either string ends; it is a match only if both ended.
1680  return !*a && !*b;
1681 }
1698  : iface(nullptr), state(f)
1699 {
 // Resolve the encoding by name. When the name is not recognised, iface keeps the
 // nullptr it was initialised with.
 // NOTE(review): the signature of this constructor is not visible in this excerpt.
1700  auto e = encodingForName(name);
1701  if (e)
 // The interface table is indexed by the integer value of the encoding.
1702  iface = encodingInterfaces + int(e.value());
1703 }
1743 std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name)
1744 {
1745  for (int i = 0; i < LastEncoding + 1; ++i) {
1746  if (nameMatch(encodingInterfaces[i].name, name))
1747  return QStringConverter::Encoding(i);
1748  }
1749  if (nameMatch(name, "latin1"))
1750  return QStringConverter::Latin1;
1751  return std::nullopt;
1752 }
// Tries to detect the encoding of `data` from its first bytes: 4-byte units are
// examined first, then the 3-byte UTF-8 byte order mark, then 2-byte units.
// Returns std::nullopt when nothing matches.
// NOTE(review): most lines that return a detected encoding are not visible in this
// excerpt; only the UTF-8 result is shown.
1761 std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter)
1762 {
1763  // someone set us up the BOM?
1764  qsizetype arraySize = data.size();
 // At least four bytes: compare the first 32-bit unit with the byte order mark in
 // both byte orders.
1765  if (arraySize > 3) {
1766  char32_t uc = qFromUnaligned<char32_t>(data.data());
1767  if (uc == qToBigEndian(char32_t(QChar::ByteOrderMark)))
1769  if (uc == qToLittleEndian(char32_t(QChar::ByteOrderMark)))
 // A non-zero expectedFirstCharacter enables a second heuristic.
1771  if (expectedFirstCharacter) {
1772  // catch also anything starting with the expected character
1773  if (qToLittleEndian(uc) == expectedFirstCharacter)
1775  else if (qToBigEndian(uc) == expectedFirstCharacter)
1777  }
1778  }
 // At least three bytes: check for the UTF-8 byte order mark.
1780  if (arraySize > 2) {
1781  if (memcmp(data.data(), utf8bom, sizeof(utf8bom)) == 0)
1782  return QStringConverter::Utf8;
1783  }
 // At least two bytes: repeat the checks with a 16-bit unit.
1785  if (arraySize > 1) {
1786  char16_t uc = qFromUnaligned<char16_t>(data.data());
1787  if (uc == qToBigEndian(char16_t(QChar::ByteOrderMark)))
1789  if (uc == qToLittleEndian(char16_t(QChar::ByteOrderMark)))
1791  if (expectedFirstCharacter) {
1792  // catch also anything starting with the expected character
1793  if (qToLittleEndian(uc) == expectedFirstCharacter)
1795  else if (qToBigEndian(uc) == expectedFirstCharacter)
1797  }
1798  }
1799  return std::nullopt;
1800 }
1808 std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
1809 {
1810  // determine charset
1811  auto encoding = encodingForData(data);
1812  if (encoding)
1813  // trust the initial BOM
1814  return encoding;
1816  static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
1817  static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
1819  QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
1820  qsizetype pos = metaSearcher.indexIn(header);
1821  if (pos != -1) {
1822  pos = charsetSearcher.indexIn(header, pos);
1823  if (pos != -1) {
1824  pos += int(qstrlen("charset="));
1825  if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
1826  ++pos;
1828  qsizetype pos2 = pos;
1829  // The attribute can be closed with either """, "'", ">" or "/",
1830  // none of which are valid charset characters.
1831  while (++pos2 < header.size()) {
1832  char ch = header.at(pos2);
1833  if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
1834  QByteArray name = header.mid(pos, pos2 - pos);
1835  qsizetype colon = name.indexOf(':');
1836  if (colon > 0)
1837  name = name.left(colon);
1838  name = name.simplified();
1839  if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1840  name = QByteArrayLiteral("UTF-8");
1841  if (!name.isEmpty())
1842  return encodingForName(name);
1843  }
1844  }
1845  }
1846  }
1847  return Utf8;
1848 }
1854 {
 // The interface table is indexed by the integer value of the encoding `e`; return
 // the name stored in that row.
 // NOTE(review): the signature of this function is not visible in this excerpt.
1855  return encodingInterfaces[int(e)].name;
1856 }
