QtBase  v6.3.1
qstringconverter.cpp
Go to the documentation of this file.
1 /****************************************************************************
2 **
3 ** Copyright (C) 2020 The Qt Company Ltd.
4 ** Copyright (C) 2020 Intel Corporation.
5 ** Contact: https://www.qt.io/licensing/
6 **
7 ** This file is part of the QtCore module of the Qt Toolkit.
8 **
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** Commercial License Usage
11 ** Licensees holding valid commercial Qt licenses may use this file in
12 ** accordance with the commercial license agreement provided with the
13 ** Software or, alternatively, in accordance with the terms contained in
14 ** a written agreement between you and The Qt Company. For licensing terms
15 ** and conditions see https://www.qt.io/terms-conditions. For further
16 ** information use the contact form at https://www.qt.io/contact-us.
17 **
18 ** GNU Lesser General Public License Usage
19 ** Alternatively, this file may be used under the terms of the GNU Lesser
20 ** General Public License version 3 as published by the Free Software
21 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
22 ** packaging of this file. Please review the following information to
23 ** ensure the GNU Lesser General Public License version 3 requirements
24 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25 **
26 ** GNU General Public License Usage
27 ** Alternatively, this file may be used under the terms of the GNU
28 ** General Public License version 2.0 or (at your option) the GNU General
29 ** Public license version 3 or any later version approved by the KDE Free
30 ** Qt Foundation. The licenses are as published by the Free Software
31 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32 ** included in the packaging of this file. Please review the following
33 ** information to ensure the GNU General Public License requirements will
34 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35 ** https://www.gnu.org/licenses/gpl-3.0.html.
36 **
37 ** $QT_END_LICENSE$
38 **
39 ****************************************************************************/
40 
41 #include <qstringconverter.h>
42 #include <private/qstringconverter_p.h>
43 #include "qendian.h"
44 
45 #include "private/qsimd_p.h"
46 #include "private/qstringiterator_p.h"
47 #include "private/qtools_p.h"
48 #include "qbytearraymatcher.h"
49 
50 #ifdef Q_OS_WIN
51 #include <qt_windows.h>
52 #ifndef QT_BOOTSTRAPPED
53 #include <QtCore/qvarlengtharray.h>
54 #endif // !QT_BOOTSTRAPPED
55 #endif
56 
57 #if __has_include(<bit>) && __cplusplus > 201703L
58 #include <bit>
59 #endif
60 
62 
// Slot indices into the converter state's state_data array as used by the
// UTF-16/UTF-32 code in this file: Endian holds the byte order in effect,
// Data holds a pending byte or code unit carried over between calls.
enum { Endian = 0, Data = 1 };
64 
// The three bytes of the UTF-8 byte order mark; written by the encoder when a
// BOM is requested and compared against the input by the decoders.
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
66 
67 #if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
68  || defined(__ARM_NEON__)
69 static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
70 {
71 #if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
72  return std::bit_width(v) - 1;
73 #else
75  // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
76  // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
77  // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
78  result ^= sizeof(unsigned) * 8 - 1;
79  return result;
80 #endif
81 }
82 #endif
83 
84 #if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
85 static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
86 {
87  // do sixteen characters at a time
88  for ( ; end - src >= 16; src += 16, dst += 16) {
89 # ifdef __AVX2__
90  __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
91  __m128i data1 = _mm256_castsi256_si128(data);
92  __m128i data2 = _mm256_extracti128_si256(data, 1);
93 # else
94  __m128i data1 = _mm_loadu_si128((const __m128i*)src);
95  __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
96 # endif
97 
98  // check if everything is ASCII
99  // the highest ASCII value is U+007F
100  // Do the packing directly:
101  // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
102  // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
103  // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
104  // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
105  // "non-ASCII", but it's an acceptable compromise.
106  __m128i packed = _mm_packus_epi16(data1, data2);
107  __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
108 
109  // store, even if there are non-ASCII characters here
110  _mm_storeu_si128((__m128i*)dst, packed);
111 
112  // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
113  ushort n = ~_mm_movemask_epi8(nonAscii);
114  if (n) {
115  // find the next probable ASCII character
116  // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
117  // characters still coming
118  nextAscii = src + qBitScanReverse(n) + 1;
119 
121  dst += n;
122  src += n;
123  return false;
124  }
125  }
126 
127  if (end - src >= 8) {
128  // do eight characters at a time
129  __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
130  __m128i packed = _mm_packus_epi16(data, data);
131  __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
132 
133  // store even non-ASCII
134  _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
135 
136  uchar n = ~_mm_movemask_epi8(nonAscii);
137  if (n) {
138  nextAscii = src + qBitScanReverse(n) + 1;
140  dst += n;
141  src += n;
142  return false;
143  }
144  }
145 
146  return src == end;
147 }
148 
// Widens the leading US-ASCII run of the byte range [src, end) to UTF-16 code
// units at dst (SSE2 / AVX2 implementation).
//
// Returns true only when the whole input has been consumed (src == end).
// Otherwise returns false. When a chunk contained a byte with its high bit
// set, the ASCII bytes in front of it have been copied, src and dst point at
// the first such byte / its output slot, and nextAscii is set to just past the
// last high-bit byte found in the chunk. nextAscii is left untouched when no
// such byte was found.
static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
{
    // do sixteen characters at a time
    for ( ; end - src >= 16; src += 16, dst += 16) {
        __m128i data = _mm_loadu_si128((const __m128i*)src);

#ifdef __AVX2__
        // after zero-extension each character occupies two bytes, hence two mask bits
        const int BitSpacing = 2;
        // load and zero extend to a YMM register
        const __m256i extended = _mm256_cvtepu8_epi16(data);

        uint n = _mm256_movemask_epi8(extended);
        if (!n) {
            // store
            _mm256_storeu_si256((__m256i*)dst, extended);
            continue;
        }
#else
        // one mask bit per input byte
        const int BitSpacing = 1;

        // check if everything is ASCII
        // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
        uint n = _mm_movemask_epi8(data);
        if (!n) {
            // unpack
            _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
            _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
            continue;
        }
#endif

        // copy the front part that is still ASCII
        // (n is non-zero here, so this loop stops at the first high-bit byte)
        while (!(n & 1)) {
            *dst++ = *src++;
            n >>= BitSpacing;
        }

        // find the next probable ASCII character
        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        // (n was shifted along with src, so the bit index is relative to the new src)
        n = qBitScanReverse(n);
        nextAscii = src + (n / BitSpacing) + 1;
        return false;

    }

    if (end - src >= 8) {
        // do eight characters at a time
        __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
        // only the low eight bytes were loaded; keep just their mask bits
        uint n = _mm_movemask_epi8(data) & 0xff;
        if (!n) {
            // unpack and store
            // (src and dst are not advanced on this path)
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
        } else {
            // copy the front part that is still ASCII
            while (!(n & 1)) {
                *dst++ = *src++;
                n >>= 1;
            }

            n = qBitScanReverse(n);
            nextAscii = src + n + 1;
            return false;
        }
    }

    return src == end;
}
215 
216 static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
217 {
218 #ifdef __AVX2__
219  // do 32 characters at a time
220  // (this is similar to simdTestMask in qstring.cpp)
221  const __m256i mask = _mm256_set1_epi8(0x80);
222  for ( ; end - src >= 32; src += 32) {
223  __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
224  if (_mm256_testz_si256(mask, data))
225  continue;
226 
227  uint n = _mm256_movemask_epi8(data);
228  Q_ASSUME(n);
229 
230  // find the next probable ASCII character
231  // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
232  // characters still coming
233  nextAscii = src + qBitScanReverse(n) + 1;
234 
235  // return the non-ASCII character
236  return src + qCountTrailingZeroBits(n);
237  }
238 #endif
239 
240  // do sixteen characters at a time
241  for ( ; end - src >= 16; src += 16) {
242  __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
243 
244  // check if everything is ASCII
245  // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
246  uint n = _mm_movemask_epi8(data);
247  if (!n)
248  continue;
249 
250  // find the next probable ASCII character
251  // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
252  // characters still coming
253  nextAscii = src + qBitScanReverse(n) + 1;
254 
255  // return the non-ASCII character
256  return src + qCountTrailingZeroBits(n);
257  }
258 
259  // do four characters at a time
260  for ( ; end - src >= 4; src += 4) {
261  quint32 data = qFromUnaligned<quint32>(src);
262  data &= 0x80808080U;
263  if (!data)
264  continue;
265 
266  // We don't try to guess which of the three bytes is ASCII and which
267  // one isn't. The chance that at least two of them are non-ASCII is
268  // better than 75%.
269  nextAscii = src;
270  return src;
271  }
272  nextAscii = end;
273  return src;
274 }
275 
// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
// and advance src8 and src16 to the first character that could not be compared
// (SSE2 / AVX2 implementation).
//
// Both pointers are advanced by the same number of characters. Comparison
// stops at the first mismatch, at the first 8-bit character with its high bit
// set, or when too few characters remain for another vector step.
static void simdCompareAscii(const char8_t *&src8, const char8_t *end8, const char16_t *&src16, const char16_t *end16)
{
    // log2 of the number of mask bits per character: masks derived from 16-bit
    // lanes carry two bits per character, so the bit index is shifted right by 1
    int bitSpacing = 1;
    qptrdiff len = qMin(end8 - src8, end16 - src16);
    qptrdiff offset = 0;
    // non-zero once a difference or a non-ASCII character has been found
    uint mask = 0;

    // do sixteen characters at a time
    // (strict '<': a chunk is only processed while more than 16 characters remain)
    for ( ; offset + 16 < len; offset += 16) {
        __m128i data8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src8 + offset));
#ifdef __AVX2__
        // AVX2 version, use 256-bit registers and VPMOVZXBW
        __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));

        // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
        __m256i datax8 = _mm256_cvtepu8_epi16(data8);
        mask = _mm256_movemask_epi8(datax8);
        if (mask)
            break;

        // compare Latin1 to UTF-16
        __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
        mask = ~_mm256_movemask_epi8(latin1cmp);
        if (mask)
            break;
#else
        // non-AVX2 code
        __m128i datalo16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
        __m128i datahi16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset) + 1);

        // expand US-ASCII as if it were Latin1, we'll confirm later
        __m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
        __m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());

        // compare Latin1 to UTF-16
        __m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
        __m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
        // combine both halves into one 32-bit mask, two bits per character
        mask = _mm_movemask_epi8(latin1cmphi) << 16;
        mask |= ushort(_mm_movemask_epi8(latin1cmplo));
        mask = ~mask;
        if (mask)
            break;

        // confirm it was US-ASCII
        mask = _mm_movemask_epi8(data8);
        if (mask) {
            // this mask comes from the 8-bit data: one bit per character
            bitSpacing = 0;
            break;
        }
#endif
    }

    // helper for comparing 4 or 8 characters
    // sets 'mask' (two bits per character) and advances 'offset' by n when all n match
    auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
        // n = 4 -> sizemask = 0xff
        // n = 8 -> sizemask = 0xffff
        unsigned sizemask = (1U << (2 * n)) - 1;

        // expand as if Latin1
        data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());

        // compare and confirm it's US-ASCII
        __m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
        mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
        mask |= _mm_movemask_epi8(data8);
        if (mask == 0)
            offset += n;
    };

    // do eight characters at a time
    if (mask == 0 && offset + 8 < len) {
        __m128i data8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src8 + offset));
        __m128i data16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
        cmp_lt_16(8, data8, data16);
    }

    // do four characters
    if (mask == 0 && offset + 4 < len) {
        __m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
        __m128i data16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src16 + offset));
        cmp_lt_16(4, data8, data16);
    }

    // correct the source pointers to point to the first character we couldn't deal with
    // (lowest set mask bit, converted from a bit index to a character index)
    if (mask)
        offset += qCountTrailingZeroBits(mask) >> bitSpacing;
    src8 += offset;
    src16 += offset;
}
367 #elif defined(__ARM_NEON__)
// Narrows the leading US-ASCII run of the UTF-16 range [src, end) to single
// bytes at dst (ARM NEON implementation).
//
// Returns true only when the whole input has been consumed (src == end).
// When a 16-character chunk contains a character above 0x7f, returns false
// with src and dst advanced past the ASCII prefix of that chunk and nextAscii
// set to just past the last such character in the chunk. Chunks are stored to
// dst before they are checked.
static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
{
    uint16x8_t maxAscii = vdupq_n_u16(0x7f);
    // one result bit per character: even bit positions for the first lane,
    // odd bit positions (mask1 shifted by one) for the second lane
    uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
    uint16x8_t mask2 = vshlq_n_u16(mask1, 1);

    // do sixteen characters at a time
    for ( ; end - src >= 16; src += 16, dst += 16) {
        // load 2 lanes (or: "load interleaved")
        uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));

        // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
        // add those together into a scalar, and merge the scalars.
        uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
                | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));

        // merge the two lanes by shifting the values of the second by 8 and inserting them
        uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);

        // store, even if there are non-ASCII characters here
        vst1q_u8(dst, vreinterpretq_u8_u16(out));

        if (nonAscii) {
            // find the next probable ASCII character
            // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
            // characters still coming
            nextAscii = src + qBitScanReverse(nonAscii) + 1;

            // turn the bitmask into the count of leading ASCII characters
            nonAscii = qCountTrailingZeroBits(nonAscii);
            dst += nonAscii;
            src += nonAscii;
            return false;
        }
    }
    return src == end;
}
404 
// Widens the leading US-ASCII run of the byte range [src, end) to UTF-16 code
// units at dst (ARM NEON implementation).
//
// Returns true only when the whole input has been consumed (src == end).
// When an 8-byte chunk contains a byte >= 0x80, the ASCII bytes in front of it
// are copied, src and dst are left at that byte / its output slot, nextAscii
// is set to just past the last such byte in the chunk, and false is returned.
static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
{
    // do eight characters at a time
    uint8x8_t msb_mask = vdup_n_u8(0x80);
    // one result bit per byte position
    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
    for ( ; end - src >= 8; src += 8, dst += 8) {
        uint8x8_t c = vld1_u8(src);
        // bit i of n is set when byte i of the chunk is >= 0x80
        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
        if (!n) {
            // store
            vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
            continue;
        }

        // copy the front part that is still ASCII
        while (!(n & 1)) {
            *dst++ = *src++;
            n >>= 1;
        }

        // find the next probable ASCII character
        // we don't want to load this chunk again in this loop if we know there are non-ASCII
        // characters still coming
        // (n was shifted along with src, so the bit index is relative to the new src)
        n = qBitScanReverse(n);
        nextAscii = src + n + 1;
        return false;

    }
    return src == end;
}
435 
// ARM NEON variant of the non-ASCII scanner.
//
// As written this performs no scanning: it sets nextAscii to end and returns
// src unchanged. Everything after the first return statement is unreachable
// and kept only until the SIMD code has been verified (see the comment below).
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
    // The SIMD code below is untested, so just force an early return until
    // we've had the time to verify it works.
    nextAscii = end;
    return src;

    // --- unreachable from here on ---

    // do eight characters at a time
    uint8x8_t msb_mask = vdup_n_u8(0x80);
    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
    for ( ; end - src >= 8; src += 8) {
        uint8x8_t c = vld1_u8(src);
        // bit i of n is set when byte i of the chunk is >= 0x80
        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
        if (!n)
            continue;

        // find the next probable ASCII character
        // we don't want to load this chunk again in this loop if we know there are non-ASCII
        // characters still coming
        nextAscii = src + qBitScanReverse(n) + 1;

        // return the non-ASCII character
        return src + qCountTrailingZeroBits(n);
    }
    nextAscii = end;
    return src;
}
463 
// ARM NEON build: no vectorised comparison is provided. This is a no-op that
// leaves both source pointers where they are.
static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
{
}
467 #else
// Fallback for builds without SSE2 or NEON: does nothing and always returns
// false. The pointers are taken by value here, so the caller's variables are
// never modified.
static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
{
    return false;
}
472 
// Fallback for builds without SSE2 or NEON: does nothing and always returns
// false. The pointers are taken by value here, so the caller's variables are
// never modified.
static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
{
    return false;
}
477 
// Fallback for builds without SSE2 or NEON: performs no scanning. Sets
// nextAscii to end and returns src unchanged.
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
    nextAscii = end;
    return src;
}
483 
// Fallback for builds without SSE2 or NEON: a no-op that leaves both source
// pointers where they are.
static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
{
}
487 #endif
488 
// Flag bit OR-ed into the converter state's internalState once the byte order
// mark / header has been written or processed, so it is only handled once.
enum { HeaderDone = 1 };
490 
492 {
493  qsizetype len = in.size();
494 
495  // create a QByteArray with the worst case scenario size
497  uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
498  const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
499  const char16_t *const end = src + len;
500 
501  while (src != end) {
502  const char16_t *nextAscii = end;
503  if (simdEncodeAscii(dst, nextAscii, src, end))
504  break;
505 
506  do {
507  char16_t u = *src++;
508  int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
509  if (res < 0) {
510  // encoding error - append '?'
511  *dst++ = '?';
512  }
513  } while (src < nextAscii);
514  }
515 
516  result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
517  return result;
518 }
519 
521 {
522  QByteArray ba(3*in.size() +3, Qt::Uninitialized);
523  char *end = convertFromUnicode(ba.data(), in, state);
524  ba.truncate(end - ba.data());
525  return ba;
526 }
527 
529 {
530  Q_ASSERT(state);
531  const QChar *uc = in.data();
532  qsizetype len = in.length();
533  if (!len)
534  return out;
535 
536  auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
538  *cursor++ = 0;
539  } else {
540  // QChar::replacement encoded in utf8
541  *cursor++ = 0xef;
542  *cursor++ = 0xbf;
543  *cursor++ = 0xbd;
544  }
545  return cursor;
546  };
547 
548  uchar *cursor = reinterpret_cast<uchar *>(out);
549  const char16_t *src = reinterpret_cast<const char16_t *>(uc);
550  const char16_t *const end = src + len;
551 
552  if (!(state->flags & QStringDecoder::Flag::Stateless)) {
553  if (state->remainingChars) {
554  int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
555  if (res < 0)
556  cursor = appendReplacementChar(cursor);
557  state->state_data[0] = 0;
558  state->remainingChars = 0;
559  } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
560  // append UTF-8 BOM
561  *cursor++ = utf8bom[0];
562  *cursor++ = utf8bom[1];
563  *cursor++ = utf8bom[2];
564  state->internalState |= HeaderDone;
565  }
566  }
567 
568  while (src != end) {
569  const char16_t *nextAscii = end;
570  if (simdEncodeAscii(cursor, nextAscii, src, end))
571  break;
572 
573  do {
574  char16_t uc = *src++;
575  int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
576  if (Q_LIKELY(res >= 0))
577  continue;
578 
579  if (res == QUtf8BaseTraits::Error) {
580  // encoding error
581  ++state->invalidChars;
582  cursor = appendReplacementChar(cursor);
583  } else if (res == QUtf8BaseTraits::EndOfString) {
585  ++state->invalidChars;
586  cursor = appendReplacementChar(cursor);
587  } else {
588  state->remainingChars = 1;
589  state->state_data[0] = uc;
590  }
591  return reinterpret_cast<char *>(cursor);
592  }
593  } while (src < nextAscii);
594  }
595 
596  return reinterpret_cast<char *>(cursor);
597 }
598 
600 {
601  // UTF-8 to UTF-16 always needs the exact same number of words or less:
602  // UTF-8 UTF-16
603  // 1 byte 1 word
604  // 2 bytes 1 word
605  // 3 bytes 1 word
606  // 4 bytes 2 words (one surrogate pair)
607  // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
608  // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
609  // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
610  //
611  // The table holds for invalid sequences too: we'll insert one replacement char
612  // per invalid byte.
614  QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
615  const QChar *end = convertToUnicode(data, in);
616  result.truncate(end - data);
617  return result;
618 }
619 
638 {
639  char16_t *dst = reinterpret_cast<char16_t *>(buffer);
640  const uchar *const start = reinterpret_cast<const uchar *>(in.data());
641  const uchar *src = start;
642  const uchar *end = src + in.size();
643 
644  // attempt to do a full decoding in SIMD
645  const uchar *nextAscii = end;
646  if (!simdDecodeAscii(dst, nextAscii, src, end)) {
647  // at least one non-ASCII entry
648  // check if we failed to decode the UTF-8 BOM; if so, skip it
649  if (Q_UNLIKELY(src == start)
650  && end - src >= 3
651  && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
652  src += 3;
653  }
654 
655  while (src < end) {
656  nextAscii = end;
657  if (simdDecodeAscii(dst, nextAscii, src, end))
658  break;
659 
660  do {
661  uchar b = *src++;
662  int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
663  if (res < 0) {
664  // decoding error
666  }
667  } while (src < nextAscii);
668  }
669  }
670 
671  return reinterpret_cast<QChar *>(dst);
672 }
673 
675 {
676  // See above for buffer requirements for stateless decoding. However, that
677  // fails if the state is not empty. The following situations can add to the
678  // requirements:
679  // state contains chars starts with requirement
680  // 1 of 2 bytes valid continuation 0
681  // 2 of 3 bytes same 0
682  // 3 bytes of 4 same +1 (need to insert surrogate pair)
683  // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
684  // 2 of 3 bytes same +1 (same)
685  // 3 of 4 bytes same +1 (same)
686  QString result(in.size() + 1, Qt::Uninitialized);
687  QChar *end = convertToUnicode(result.data(), in, state);
688  result.truncate(end - result.constData());
689  return result;
690 }
691 
693 {
694  qsizetype len = in.size();
695 
696  Q_ASSERT(state);
697  if (!len)
698  return out;
699 
700 
701  char16_t replacement = QChar::ReplacementCharacter;
703  replacement = QChar::Null;
704 
705  int res;
706  uchar ch = 0;
707 
708  char16_t *dst = reinterpret_cast<char16_t *>(out);
709  const uchar *src = reinterpret_cast<const uchar *>(in.data());
710  const uchar *end = src + len;
711 
712  if (!(state->flags & QStringConverter::Flag::Stateless)) {
713  bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
714  if (state->remainingChars || !headerdone) {
715  // handle incoming state first
716  uchar remainingCharsData[4]; // longest UTF-8 sequence possible
717  qsizetype remainingCharsCount = state->remainingChars;
718  qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
719 
720  memset(remainingCharsData, 0, sizeof(remainingCharsData));
721  memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
722  memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
723 
724  const uchar *begin = &remainingCharsData[1];
725  res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
726  static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
727  if (res == QUtf8BaseTraits::Error) {
728  ++state->invalidChars;
729  *dst++ = replacement;
730  ++src;
731  } else if (res == QUtf8BaseTraits::EndOfString) {
732  // if we got EndOfString again, then there were too few bytes in src;
733  // copy to our state and return
734  state->remainingChars = remainingCharsCount + newCharsToCopy;
735  memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
736  return out;
737  } else if (!headerdone) {
738  // eat the UTF-8 BOM
739  if (dst[-1] == 0xfeff)
740  --dst;
741  }
742  state->internalState |= HeaderDone;
743 
744  // adjust src now that we have maybe consumed a few chars
745  if (res >= 0) {
746  Q_ASSERT(res > remainingCharsCount);
747  src += res - remainingCharsCount;
748  }
749  }
750  } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
751  // stateless, remove initial BOM
752  if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
753  // skip BOM
754  src += 3;
755  }
756 
757  // main body, stateless decoding
758  res = 0;
759  const uchar *nextAscii = src;
760  while (res >= 0 && src < end) {
761  if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
762  break;
763 
764  ch = *src++;
765  res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
766  if (res == QUtf8BaseTraits::Error) {
767  res = 0;
768  ++state->invalidChars;
769  *dst++ = replacement;
770  }
771  }
772 
774  // unterminated UTF sequence
777  ++state->invalidChars;
778  while (src++ < end) {
780  ++state->invalidChars;
781  }
782  state->remainingChars = 0;
783  } else {
784  --src; // unread the byte in ch
785  state->remainingChars = end - src;
786  memcpy(&state->state_data[0], src, end - src);
787  }
788  } else {
789  state->remainingChars = 0;
790  }
791 
792  return reinterpret_cast<QChar *>(dst);
793 }
794 
796 {
797  struct NoOutput {};
798  static void appendUtf16(const NoOutput &, char16_t) {}
799  static void appendUcs4(const NoOutput &, char32_t) {}
800 };
801 
803 {
804  const uchar *src = reinterpret_cast<const uchar *>(in.data());
805  const uchar *end = src + in.size();
806  const uchar *nextAscii = src;
807  bool isValidAscii = true;
808 
809  while (src < end) {
810  if (src >= nextAscii)
811  src = simdFindNonAscii(src, end, nextAscii);
812  if (src == end)
813  break;
814 
815  do {
816  uchar b = *src++;
817  if ((b & 0x80) == 0)
818  continue;
819 
820  isValidAscii = false;
822  int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
823  if (res < 0) {
824  // decoding error
825  return { false, false };
826  }
827  } while (src < nextAscii);
828  }
829 
830  return { true, isValidAscii };
831 }
832 
834 {
835  auto src1 = reinterpret_cast<const char8_t *>(utf8.data());
836  auto end1 = src1 + utf8.size();
837  auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
838  auto end2 = src2 + utf16.size();
839 
840  do {
841  simdCompareAscii(src1, end1, src2, end2);
842 
843  if (src1 < end1 && src2 < end2) {
844  char32_t uc1 = *src1++;
845  char32_t uc2 = *src2++;
846 
847  if (uc1 >= 0x80) {
848  char32_t *output = &uc1;
849  int res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
850  if (res < 0) {
851  // decoding error
853  }
854 
855  // Only decode the UTF-16 surrogate pair if the UTF-8 code point
856  // wasn't US-ASCII (a surrogate cannot match US-ASCII).
857  if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
858  uc2 = QChar::surrogateToUcs4(uc2, *src2++);
859  }
860 
861  if (uc1 != uc2)
862  return int(uc1) - int(uc2);
863  }
864  } while (src1 < end1 && src2 < end2);
865 
866  // the shorter string sorts first
867  return (end1 > src1) - int(end2 > src2);
868 }
869 
871 {
872  char32_t uc1 = QChar::Null;
873  auto src1 = reinterpret_cast<const uchar *>(utf8.data());
874  auto end1 = src1 + utf8.size();
875  auto src2 = reinterpret_cast<const uchar *>(s.latin1());
876  auto end2 = src2 + s.size();
877 
878  while (src1 < end1 && src2 < end2) {
879  uchar b = *src1++;
880  char32_t *output = &uc1;
881  int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
882  if (res < 0) {
883  // decoding error
885  }
886 
887  char32_t uc2 = *src2++;
888  if (uc1 != uc2)
889  return int(uc1) - int(uc2);
890  }
891 
892  // the shorter string sorts first
893  return (end1 > src1) - (end2 > src2);
894 }
895 
897 {
898  bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
899  qsizetype length = 2 * in.size();
900  if (writeBom)
901  length += 2;
902 
904  char *end = convertFromUnicode(d.data(), in, state, endian);
905  Q_ASSERT(end - d.constData() == d.length());
906  Q_UNUSED(end);
907  return d;
908 }
909 
911 {
912  Q_ASSERT(state);
913  bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
914 
915  if (endian == DetectEndianness)
917 
918  if (writeBom) {
919  // set them up the BOM
921  if (endian == BigEndianness)
922  qToBigEndian(bom.unicode(), out);
923  else
924  qToLittleEndian(bom.unicode(), out);
925  out += 2;
926  }
927  if (endian == BigEndianness)
928  qToBigEndian<char16_t>(in.data(), in.length(), out);
929  else
930  qToLittleEndian<char16_t>(in.data(), in.length(), out);
931 
932  state->remainingChars = 0;
933  state->internalState |= HeaderDone;
934  return out + 2*in.length();
935 }
936 
938 {
939  QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
940  QChar *qch = convertToUnicode(result.data(), in, state, endian);
941  result.truncate(qch - result.constData());
942  return result;
943 }
944 
946 {
947  qsizetype len = in.size();
948  const char *chars = in.data();
949 
950  Q_ASSERT(state);
951 
952  if (endian == DetectEndianness)
953  endian = (DataEndianness)state->state_data[Endian];
954 
955  const char *end = chars + len;
956 
957  // make sure we can decode at least one char
958  if (state->remainingChars + len < 2) {
959  if (len) {
960  Q_ASSERT(state->remainingChars == 0 && len == 1);
961  state->remainingChars = 1;
962  state->state_data[Data] = *chars;
963  }
964  return out;
965  }
966 
967  bool headerdone = state && state->internalState & HeaderDone;
969  headerdone = true;
970 
971  if (!headerdone || state->remainingChars) {
972  uchar buf;
973  if (state->remainingChars)
974  buf = state->state_data[Data];
975  else
976  buf = *chars++;
977 
978  // detect BOM, set endianness
979  state->internalState |= HeaderDone;
980  QChar ch(buf, *chars++);
981  if (endian == DetectEndianness) {
982  // someone set us up the BOM
983  if (ch == QChar::ByteOrderSwapped) {
984  endian = BigEndianness;
985  } else if (ch == QChar::ByteOrderMark) {
986  endian = LittleEndianness;
987  } else {
989  endian = BigEndianness;
990  } else {
991  endian = LittleEndianness;
992  }
993  }
994  }
995  if (endian == BigEndianness)
996  ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
997  if (headerdone || ch != QChar::ByteOrderMark)
998  *out++ = ch;
999  } else if (endian == DetectEndianness) {
1001  }
1002 
1003  qsizetype nPairs = (end - chars) >> 1;
1004  if (endian == BigEndianness)
1005  qFromBigEndian<char16_t>(chars, nPairs, out);
1006  else
1007  qFromLittleEndian<char16_t>(chars, nPairs, out);
1008  out += nPairs;
1009 
1010  state->state_data[Endian] = endian;
1011  state->remainingChars = 0;
1012  if ((end - chars) & 1) {
1015  } else {
1016  state->remainingChars = 1;
1017  state->state_data[Data] = *(end - 1);
1018  }
1019  } else {
1020  state->state_data[Data] = 0;
1021  }
1022 
1023  return out;
1024 }
1025 
1027 {
1028  bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1029  qsizetype length = 4*in.size();
1030  if (writeBom)
1031  length += 4;
1033  char *end = convertFromUnicode(ba.data(), in, state, endian);
1034  ba.truncate(end - ba.constData());
1035  return ba;
1036 }
1037 
1039 {
1040  Q_ASSERT(state);
1041 
1042  bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1043  if (endian == DetectEndianness)
1045 
1046  if (writeBom) {
1047  // set them up the BOM
1048  if (endian == BigEndianness) {
1049  out[0] = 0;
1050  out[1] = 0;
1051  out[2] = (char)0xfe;
1052  out[3] = (char)0xff;
1053  } else {
1054  out[0] = (char)0xff;
1055  out[1] = (char)0xfe;
1056  out[2] = 0;
1057  out[3] = 0;
1058  }
1059  out += 4;
1060  state->internalState |= HeaderDone;
1061  }
1062 
1063  const QChar *uc = in.data();
1064  const QChar *end = in.data() + in.length();
1065  QChar ch;
1066  char32_t ucs4;
1067  if (state->remainingChars == 1) {
1068  auto character = state->state_data[Data];
1069  Q_ASSERT(character <= 0xFFFF);
1070  ch = QChar(character);
1071  // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1072  state->remainingChars = 0;
1073  goto decode_surrogate;
1074  }
1075 
1076  while (uc < end) {
1077  ch = *uc++;
1078  if (Q_LIKELY(!ch.isSurrogate())) {
1079  ucs4 = ch.unicode();
1080  } else if (Q_LIKELY(ch.isHighSurrogate())) {
1081 decode_surrogate:
1082  if (uc == end) {
1085  } else {
1086  state->remainingChars = 1;
1087  state->state_data[Data] = ch.unicode();
1088  return out;
1089  }
1090  } else if (uc->isLowSurrogate()) {
1091  ucs4 = QChar::surrogateToUcs4(ch, *uc++);
1092  } else {
1094  }
1095  } else {
1097  }
1098  if (endian == BigEndianness)
1099  qToBigEndian(ucs4, out);
1100  else
1101  qToLittleEndian(ucs4, out);
1102  out += 4;
1103  }
1104 
1105  return out;
1106 }
1107 
1109 {
1110  QString result;
1111  result.resize((in.size() + 7) >> 1); // worst case
1112  QChar *end = convertToUnicode(result.data(), in, state, endian);
1113  result.truncate(end - result.constData());
1114  return result;
1115 }
1116 
1118 {
1119  qsizetype len = in.size();
1120  const char *chars = in.data();
1121 
1122  Q_ASSERT(state);
1123  if (endian == DetectEndianness)
1124  endian = (DataEndianness)state->state_data[Endian];
1125 
1126  const char *end = chars + len;
1127 
1128  uchar tuple[4];
1129  memcpy(tuple, &state->state_data[Data], 4);
1130 
1131  // make sure we can decode at least one char
1132  if (state->remainingChars + len < 4) {
1133  if (len) {
1134  while (chars < end) {
1135  tuple[state->remainingChars] = *chars;
1136  ++state->remainingChars;
1137  ++chars;
1138  }
1139  Q_ASSERT(state->remainingChars < 4);
1140  memcpy(&state->state_data[Data], tuple, 4);
1141  }
1142  return out;
1143  }
1144 
1145  bool headerdone = state->internalState & HeaderDone;
1147  headerdone = true;
1148 
1149  qsizetype num = state->remainingChars;
1150  state->remainingChars = 0;
1151 
1152  if (!headerdone || endian == DetectEndianness || num) {
1153  while (num < 4)
1154  tuple[num++] = *chars++;
1155  if (endian == DetectEndianness) {
1156  // someone set us up the BOM?
1157  if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1158  endian = LittleEndianness;
1159  } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1160  endian = BigEndianness;
1161  } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1162  endian = BigEndianness;
1163  } else {
1164  endian = LittleEndianness;
1165  }
1166  }
1167  char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1168  if (headerdone || code != QChar::ByteOrderMark) {
1172  } else {
1173  *out++ = QChar(code);
1174  }
1175  }
1176  num = 0;
1177  } else if (endian == DetectEndianness) {
1179  }
1180  state->state_data[Endian] = endian;
1181  state->internalState |= HeaderDone;
1182 
1183  while (chars < end) {
1184  tuple[num++] = *chars++;
1185  if (num == 4) {
1186  char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1187  for (char16_t c : QChar::fromUcs4(code))
1188  *out++ = c;
1189  num = 0;
1190  }
1191  }
1192 
1193  if (num) {
1194  if (state->flags & QStringDecoder::Flag::Stateless) {
1196  } else {
1197  state->state_data[Endian] = endian;
1198  state->remainingChars = num;
1199  memcpy(&state->state_data[Data], tuple, 4);
1200  }
1201  }
1202 
1203  return out;
1204 }
1205 
1206 #if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1207 static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
1208 {
1209  qsizetype length = in.size();
1210  const char *chars = in.data();
1211 
1212  Q_ASSERT(state);
1213  if (state->flags & QStringConverter::Flag::Stateless) // temporary
1214  state = nullptr;
1215 
1216  if (!chars || !length)
1217  return QString();
1218 
1219  qsizetype copyLocation = 0;
1220  qsizetype extra = 2;
1221  if (state && state->remainingChars) {
1222  copyLocation = state->remainingChars;
1223  extra += copyLocation;
1224  }
1225  qsizetype newLength = length + extra;
1226  char *mbcs = new char[newLength];
1227  //ensure that we have a NULL terminated string
1228  mbcs[newLength-1] = 0;
1229  mbcs[newLength-2] = 0;
1230  memcpy(&(mbcs[copyLocation]), chars, length);
1231  if (copyLocation) {
1232  //copy the last character from the state
1233  mbcs[0] = (char)state->state_data[0];
1234  state->remainingChars = 0;
1235  }
1236  const char *mb = mbcs;
1237  const char *next = 0;
1238  QString s;
1239  while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
1240  wchar_t wc[2] ={0};
1241  int charlength = next - mb;
1242  int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
1243  if (len>0) {
1244  s.append(QChar(wc[0]));
1245  } else {
1246  int r = GetLastError();
1247  //check if the character being dropped is the last character
1248  if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
1249  state->remainingChars = 1;
1250  state->state_data[0] = (char)*mb;
1251  }
1252  }
1253  mb = next;
1254  }
1255  delete [] mbcs;
1256  return s;
1257 }
1258 
1259 
1261 {
1262  qsizetype length = in.size();
1263 
1264  Q_ASSERT(length < INT_MAX); // ### FIXME
1265  const char *mb = in.data();
1266  int mblen = length;
1267 
1268  if (!mb || !mblen)
1269  return QString();
1270 
1272  int len;
1273  QString sp;
1274  bool prepend = false;
1275  char state_data = 0;
1276  int remainingChars = 0;
1277 
1278  //save the current state information
1279  if (state) {
1280  state_data = (char)state->state_data[0];
1281  remainingChars = state->remainingChars;
1282  }
1283 
1284  //convert the pending character (if available)
1285  if (state && remainingChars) {
1286  char prev[3] = {0};
1287  prev[0] = state_data;
1288  prev[1] = mb[0];
1289  remainingChars = 0;
1290  len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1291  prev, 2, wc.data(), wc.length());
1292  if (len) {
1293  sp.append(QChar(wc[0]));
1294  if (mblen == 1) {
1295  state->remainingChars = 0;
1296  return sp;
1297  }
1298  prepend = true;
1299  mb++;
1300  mblen--;
1301  wc[0] = 0;
1302  }
1303  }
1304 
1305  while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
1306  mb, mblen, wc.data(), wc.length()))) {
1307  int r = GetLastError();
1308  if (r == ERROR_INSUFFICIENT_BUFFER) {
1309  const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1310  mb, mblen, 0, 0);
1311  wc.resize(wclen);
1312  } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1313  //find the last non NULL character
1314  while (mblen > 1 && !(mb[mblen-1]))
1315  mblen--;
1316  //check whether, we hit an invalid character in the middle
1317  if ((mblen <= 1) || (remainingChars && state_data))
1318  return convertToUnicodeCharByChar(in, state);
1319  //Remove the last character and try again...
1320  state_data = mb[mblen-1];
1321  remainingChars = 1;
1322  mblen--;
1323  } else {
1324  // Fail.
1325  qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1326  break;
1327  }
1328  }
1329 
1330  if (len <= 0)
1331  return QString();
1332 
1333  if (wc[len-1] == 0) // len - 1: we don't want terminator
1334  --len;
1335 
1336  //save the new state information
1337  if (state) {
1338  state->state_data[0] = (char)state_data;
1339  state->remainingChars = remainingChars;
1340  }
1341  QString s((QChar*)wc.data(), len);
1342  if (prepend) {
1343  return sp+s;
1344  }
1345  return s;
1346 }
1347 
1349 {
1350  const QChar *ch = in.data();
1351  qsizetype uclen = in.size();
1352 
1353  Q_ASSERT(uclen < INT_MAX); // ### FIXME
1354  Q_ASSERT(state);
1355  Q_UNUSED(state); // ### Fixme
1356  if (state->flags & QStringConverter::Flag::Stateless) // temporary
1357  state = nullptr;
1358 
1359  if (!ch)
1360  return QByteArray();
1361  if (uclen == 0)
1362  return QByteArray("");
1363  BOOL used_def;
1364  QByteArray mb(4096, 0);
1365  int len;
1366  while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
1367  mb.data(), mb.size()-1, 0, &used_def)))
1368  {
1369  int r = GetLastError();
1370  if (r == ERROR_INSUFFICIENT_BUFFER) {
1371  mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
1372  (const wchar_t*)ch, uclen,
1373  0, 0, 0, &used_def));
1374  // and try again...
1375  } else {
1376  // Fail. Probably can't happen in fact (dwFlags is 0).
1377 #ifndef QT_NO_DEBUG
1378  // Can't use qWarning(), as it'll recurse to handle %ls
1379  fprintf(stderr,
1380  "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1381  r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
1382 #endif
1383  break;
1384  }
1385  }
1386  mb.resize(len);
1387  return mb;
1388 }
1389 #endif
1390 
1392 {
1393  if (clearFn)
1394  clearFn(this);
1395  else
1396  state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1397  remainingChars = 0;
1398  invalidChars = 0;
1399  internalState = 0;
1400 }
1401 
1403 {
1405 }
1406 
1407 static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
1408 {
1410 }
1411 
1412 static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1413 {
1415 }
1416 
1417 static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
1418 {
1420 }
1421 
1422 static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1423 {
1425 }
1426 
1427 static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
1428 {
1430 }
1431 
1433 {
1435 }
1436 
1437 static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
1438 {
1440 }
1441 
1442 static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1443 {
1445 }
1446 
1447 static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
1448 {
1450 }
1451 
1452 static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1453 {
1455 }
1456 
1457 static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
1458 {
1460 }
1461 
1462 void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
1463 
1465 {
1466  Q_ASSERT(state);
1467  Q_UNUSED(state);
1468 
1469  qt_from_latin1(reinterpret_cast<char16_t *>(out), in.data(), size_t(in.size()));
1470  return out + in.size();
1471 }
1472 
1473 
1474 static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
1475 {
1476  Q_ASSERT(state);
1477  if (state->flags & QStringConverter::Flag::Stateless) // temporary
1478  state = nullptr;
1479 
1480  const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1481  qsizetype invalid = 0;
1482  for (qsizetype i = 0; i < in.length(); ++i) {
1483  if (in[i] > QChar(0xff)) {
1484  *out = replacement;
1485  ++invalid;
1486  } else {
1487  *out = (char)in[i].cell();
1488  }
1489  ++out;
1490  }
1491  if (state)
1492  state->invalidChars += invalid;
1493  return out;
1494 }
1495 
1496 static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
1497 {
1499  memcpy(out, s.constData(), s.length()*sizeof(QChar));
1500  return out + s.length();
1501 }
1502 
1503 static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
1504 {
1506  memcpy(out, s.constData(), s.length());
1507  return out + s.length();
1508 }
1509 
1510 
1511 static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1512 static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1513 
1514 static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1515 static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1516 
1517 static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1518 static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1519 
1520 static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1521 static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
1522 
1523 
1524 
1650 const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
1651 {
1652  { "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
1653  { "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
1654  { "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
1655  { "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
1656  { "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
1657  { "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
1658  { "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
1659  { "ISO-8859-1", fromLatin1, fromLatin1Len, toLatin1, toLatin1Len },
1660  { "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
1661 };
1662 
// Returns true when the encoding names \a a and \a b are equal, comparing
// ASCII letters case-insensitively and ignoring every '-' and '_' in either
// name (leading, embedded or trailing). A null pointer never matches.
static bool nameMatch(const char *a, const char *b)
{
    if (!a || !b)
        return false;

    const auto isSeparator = [](char c) { return c == '-' || c == '_'; };
    // ASCII-only lowering keeps the comparison independent of the C locale
    const auto lower = [](char c) {
        return (c >= 'A' && c <= 'Z') ? char(c - 'A' + 'a') : c;
    };

    for (;;) {
        // skip separators on both sides before looking at the next character,
        // so that trailing separators do not prevent a match
        while (isSeparator(*a))
            ++a;
        while (isSeparator(*b))
            ++b;
        if (!*a || !*b)
            return !*a && !*b; // match only if both names are exhausted
        if (lower(*a) != lower(*b))
            return false;
        ++a;
        ++b;
    }
}
1682 
1683 
1698  : iface(nullptr), state(f)
1699 {
1700  auto e = encodingForName(name);
1701  if (e)
1702  iface = encodingInterfaces + int(e.value());
1703 }
1704 
1743 std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name)
1744 {
1745  for (int i = 0; i < LastEncoding + 1; ++i) {
1746  if (nameMatch(encodingInterfaces[i].name, name))
1747  return QStringConverter::Encoding(i);
1748  }
1749  if (nameMatch(name, "latin1"))
1750  return QStringConverter::Latin1;
1751  return std::nullopt;
1752 }
1753 
1761 std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter)
1762 {
1763  // someone set us up the BOM?
1764  qsizetype arraySize = data.size();
1765  if (arraySize > 3) {
1766  char32_t uc = qFromUnaligned<char32_t>(data.data());
1767  if (uc == qToBigEndian(char32_t(QChar::ByteOrderMark)))
1769  if (uc == qToLittleEndian(char32_t(QChar::ByteOrderMark)))
1771  if (expectedFirstCharacter) {
1772  // catch also anything starting with the expected character
1773  if (qToLittleEndian(uc) == expectedFirstCharacter)
1775  else if (qToBigEndian(uc) == expectedFirstCharacter)
1777  }
1778  }
1779 
1780  if (arraySize > 2) {
1781  if (memcmp(data.data(), utf8bom, sizeof(utf8bom)) == 0)
1782  return QStringConverter::Utf8;
1783  }
1784 
1785  if (arraySize > 1) {
1786  char16_t uc = qFromUnaligned<char16_t>(data.data());
1787  if (uc == qToBigEndian(char16_t(QChar::ByteOrderMark)))
1789  if (uc == qToLittleEndian(char16_t(QChar::ByteOrderMark)))
1791  if (expectedFirstCharacter) {
1792  // catch also anything starting with the expected character
1793  if (qToLittleEndian(uc) == expectedFirstCharacter)
1795  else if (qToBigEndian(uc) == expectedFirstCharacter)
1797  }
1798  }
1799  return std::nullopt;
1800 }
1801 
1808 std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
1809 {
1810  // determine charset
1811  auto encoding = encodingForData(data);
1812  if (encoding)
1813  // trust the initial BOM
1814  return encoding;
1815 
1816  static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
1817  static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
1818 
1819  QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
1820  qsizetype pos = metaSearcher.indexIn(header);
1821  if (pos != -1) {
1822  pos = charsetSearcher.indexIn(header, pos);
1823  if (pos != -1) {
1824  pos += int(qstrlen("charset="));
1825  if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
1826  ++pos;
1827 
1828  qsizetype pos2 = pos;
1829  // The attribute can be closed with either """, "'", ">" or "/",
1830  // none of which are valid charset characters.
1831  while (++pos2 < header.size()) {
1832  char ch = header.at(pos2);
1833  if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
1834  QByteArray name = header.mid(pos, pos2 - pos);
1835  qsizetype colon = name.indexOf(':');
1836  if (colon > 0)
1837  name = name.left(colon);
1838  name = name.simplified();
1839  if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1840  name = QByteArrayLiteral("UTF-8");
1841  if (!name.isEmpty())
1842  return encodingForName(name);
1843  }
1844  }
1845  }
1846  }
1847  return Utf8;
1848 }
1849 
1854 {
1855  return encodingInterfaces[int(e)].name;
1856 }
1857 
small capitals from c petite p scientific f u
Definition: afcover.h:88
small capitals from c petite p scientific i
[1]
Definition: afcover.h:80
The QByteArray class provides an array of bytes.
Definition: qbytearray.h:85
char * data()
Definition: qbytearray.h:516
qsizetype size() const noexcept
Definition: qbytearray.h:470
const char * constData() const noexcept
Definition: qbytearray.h:144
void truncate(qsizetype pos)
constexpr qsizetype size() const noexcept
constexpr const_pointer data() const noexcept
The QChar class provides a 16-bit Unicode character.
Definition: qchar.h:84
static constexpr QChar fromUcs2(char16_t c) noexcept
Definition: qchar.h:134
static constexpr char32_t surrogateToUcs4(char16_t high, char16_t low) noexcept
Definition: qchar.h:539
@ ReplacementCharacter
Definition: qchar.h:95
@ ByteOrderSwapped
Definition: qchar.h:98
@ ByteOrderMark
Definition: qchar.h:97
@ Null
Definition: qchar.h:87
static constexpr bool requiresSurrogates(char32_t ucs4) noexcept
Definition: qchar.h:535
static constexpr auto fromUcs4(char32_t c) noexcept
Definition: qstringview.h:485
static constexpr char16_t highSurrogate(char32_t ucs4) noexcept
Definition: qchar.h:549
constexpr bool isLowSurrogate() const noexcept
Definition: qchar.h:511
static constexpr char16_t lowSurrogate(char32_t ucs4) noexcept
Definition: qchar.h:553
constexpr bool isHighSurrogate() const noexcept
Definition: qchar.h:510
The QLatin1String class provides a thin wrapper around an US-ASCII/Latin-1 encoded string literal.
Definition: qstring.h:84
static Q_CORE_EXPORT std::optional< Encoding > encodingForHtml(QByteArrayView data)
static Q_CORE_EXPORT const char * nameForEncoding(Encoding e)
static Q_CORE_EXPORT std::optional< Encoding > encodingForName(const char *name)
QSTRINGCONVERTER_CONSTEXPR QStringConverter()
const Interface * iface
static Q_CORE_EXPORT std::optional< Encoding > encodingForData(QByteArrayView data, char16_t expectedFirstCharacter=0)
The QString class provides a Unicode character string.
Definition: qstring.h:388
qsizetype size() const
Definition: qstring.h:413
The QStringView class provides a unified view on UTF-16 strings with a read-only subset of the QStrin...
Definition: qstringview.h:122
@ BigEndian
Definition: qsysinfo.h:61
@ ByteOrder
Definition: qsysinfo.h:66
QString str
[2]
QCursor cursor
double e
else opt state
[0]
short next
Definition: keywords.cpp:454
QTextStream & bom(QTextStream &stream)
constexpr Initialization Uninitialized
Definition: qnamespace.h:1613
constexpr char toAsciiLower(char ch) noexcept
Definition: qtools_p.h:88
#define QString()
Definition: parse-defines.h:51
int BOOL
set set set set set set set macro pixldst1 abits if abits op else op endif endm macro pixldst2 abits if abits op else op endif endm macro pixldst4 abits if abits op else op endif endm macro pixldst0 abits op endm macro pixldst3 mem_operand op endm macro pixldst30 mem_operand op endm macro pixldst abits if abits elseif abits elseif abits elseif abits elseif abits pixldst0 abits else pixldst0 abits pixldst0 abits pixldst0 abits pixldst0 abits endif elseif abits else pixldst0 abits pixldst0 abits endif elseif abits else error unsupported bpp *numpix else pixst endif endm macro vuzp8 reg2 vuzp d d &reg2 endm macro vzip8 reg2 vzip d d &reg2 endm macro pixdeinterleave basereg basereg basereg basereg basereg endif endm macro pixinterleave basereg basereg basereg basereg basereg endif endm macro PF boost_increment endif if endif PF tst PF addne PF subne PF cmp ORIG_W if endif if endif if endif PF subge ORIG_W PF subges if endif if endif if endif endif endm macro cache_preload_simple endif if dst_r_bpp pld[DST_R, #(PREFETCH_DISTANCE_SIMPLE *dst_r_bpp/8)] endif if mask_bpp pld if[MASK, #(PREFETCH_DISTANCE_SIMPLE *mask_bpp/8)] endif endif endm macro ensure_destination_ptr_alignment process_pixblock_tail_head if beq irp skip1(dst_w_bpp<=(lowbit *8)) &&((lowbit *8)<(pixblock_size *dst_w_bpp)) .if lowbit< 16 tst DST_R
[3]
QT_POPCOUNT_RELAXED_CONSTEXPR uint qCountLeadingZeroBits(quint32 v) noexcept
Definition: qalgorithms.h:411
constexpr uint qCountTrailingZeroBits(quint32 v) noexcept
Definition: qalgorithms.h:362
#define QByteArrayLiteral(str)
Definition: qbytearray.h:80
size_t qstrlen(const char *str)
constexpr QStaticByteArrayMatcher< N > qMakeStaticByteArrayMatcher(const char(&pattern)[N]) noexcept
#define Q_UNLIKELY(x)
#define Q_LIKELY(x)
#define Q_ASSUME(Expr)
typedef QByteArray(EGLAPIENTRYP PFNQGSGETDISPLAYSPROC)()
constexpr T qToBigEndian(T source)
Definition: qendian.h:187
constexpr T qToLittleEndian(T source)
Definition: qendian.h:191
Flags
unsigned int quint32
Definition: qglobal.h:288
QT_BEGIN_INCLUDE_NAMESPACE typedef unsigned char uchar
Definition: qglobal.h:332
ptrdiff_t qptrdiff
Definition: qglobal.h:307
ptrdiff_t qsizetype
Definition: qglobal.h:308
unsigned int uint
Definition: qglobal.h:334
unsigned short ushort
Definition: qglobal.h:333
#define qWarning
Definition: qlogging.h:179
GLenum GLuint GLenum GLsizei length
Definition: qopengl.h:270
GLboolean GLboolean GLboolean b
GLsizei const GLfloat * v
[13]
GLboolean r
[2]
GLboolean GLboolean GLboolean GLboolean a
[7]
GLenum GLuint GLintptr GLsizeiptr size
[1]
GLuint GLuint end
GLfloat GLfloat f
GLenum src
GLenum GLuint buffer
GLenum GLenum dst
GLenum GLuint GLenum GLsizei const GLchar * buf
GLint GLsizei GLsizei GLenum GLenum GLsizei void * data
GLuint start
GLenum GLuint GLintptr offset
GLuint name
GLint GLint GLint GLint GLint GLint GLint GLbitfield mask
GLfloat n
GLuint res
Definition: qopenglext.h:8867
const GLubyte * c
Definition: qopenglext.h:12701
GLenum GLsizei len
Definition: qopenglext.h:3292
GLuint in
Definition: qopenglext.h:8870
GLuint64EXT * result
[6]
Definition: qopenglext.h:10932
GLdouble s
[6]
Definition: qopenglext.h:235
GLuint num
Definition: qopenglext.h:5654
GLuint GLenum GLsizei GLsizei GLint GLint GLboolean packed
Definition: qopenglext.h:9781
#define Q_ASSERT(cond)
Definition: qrandom.cpp:84
QtPrivate::QRegularExpressionMatchIteratorRangeBasedForIterator begin(const QRegularExpressionMatchIterator &iterator)
void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept
Definition: qstring.cpp:783
@ Endian
@ Data
@ HeaderDone
DataEndianness
@ LittleEndianness
@ DetectEndianness
@ BigEndianness
#define sp
Q_UNUSED(salary)
[21]
QByteArray ba
[0]
QTextStream out(stdout)
[7]
QObject::connect nullptr
list prepend("one")
QHttpRequestHeader header("GET", QUrl::toPercentEncoding("/index.html"))
[1]
static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
static Q_CORE_EXPORT QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness=DetectEndianness)
static Q_CORE_EXPORT QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness=DetectEndianness)
static Q_CORE_EXPORT QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness=DetectEndianness)
static QChar * convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian)
static const int Error
static const int EndOfString
static Q_CORE_EXPORT QByteArray convertFromUnicode(QStringView in)
static ValidUtf8Result isValidUtf8(QByteArrayView in)
static Q_CORE_EXPORT QChar * convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
static int compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
static void appendUtf16(const NoOutput &, char16_t)
static void appendUcs4(const NoOutput &, char32_t)
Definition: inftrees.h:24