QtBase  v6.3.1
qunicodetools.cpp
Go to the documentation of this file.
1 /****************************************************************************
2 **
3 ** Copyright (C) 2020 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 
40 #include "qunicodetools_p.h"
41 
42 #include "qunicodetables_p.h"
43 #include "qvarlengtharray.h"
44 #if QT_CONFIG(library)
45 #include "qlibrary.h"
46 #endif
47 
48 #include <limits.h>
49 
50 #define FLAG(x) (1 << (x))
51 
53 
55 
56 namespace QUnicodeTools {
57 
58 // -----------------------------------------------------------------------------------------------------
59 //
60 // The text boundaries determination algorithm.
61 // See https://www.unicode.org/reports/tr29/tr29-37.html
62 //
63 // -----------------------------------------------------------------------------------------------------
64 
65 namespace GB {
66 
67 // This table is indexed by the grapheme break classes of two
68 // (adjacent) code points.
69 // The class of the first code point selects an entry.
70 // If the entry's bit at position second_cp_class is set
71 // (in other words: if entry & (1u << second_cp_class) is non-zero)
72 // then there is NO grapheme break between the two code points.
73 
75 
76 // Check that we have enough bits in the table (in case
77 // NumGraphemeBreakClasses grows too much).
78 static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
79  "Internal error: increase the size in bits of GBTableEntryType");
80 
81 // GB9, GB9a
82 static const GBTableEntryType Extend_SpacingMark_ZWJ =
86 
87 static const GBTableEntryType HardBreak = 0u;
88 
90  Extend_SpacingMark_ZWJ, // Any
92  HardBreak, // LF
93  HardBreak, // Control
94  Extend_SpacingMark_ZWJ, // Extend
95  Extend_SpacingMark_ZWJ, // ZWJ
96  Extend_SpacingMark_ZWJ, // RegionalIndicator
97  (Extend_SpacingMark_ZWJ
107  ), // Prepend
108  Extend_SpacingMark_ZWJ, // SpacingMark
109  (Extend_SpacingMark_ZWJ
114  ), // L
115  (Extend_SpacingMark_ZWJ
118  ), // V
119  (Extend_SpacingMark_ZWJ
121  ), // T
122  (Extend_SpacingMark_ZWJ
125  ), // LV
126  (Extend_SpacingMark_ZWJ
128  ), // LVT
129  Extend_SpacingMark_ZWJ // Extended_Pictographic
130 };
131 
// Returns true when the pair table permits a grapheme break between a code
// point of class `first` and a directly following code point of class `second`,
// i.e. when the bit for `second` is NOT set in breakTable[first].
132 static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
134 {
135  return (breakTable[first] & FLAG(second)) == 0;
136 }
137 
138 // Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
139 // so we need to store some local state.
140 enum class State : uchar {
141  Normal,
142  GB11_ExtPicExt, // saw an Extend after an Extended_Pictographic
143  GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
144  GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
145 };
146 
147 } // namespace GB
148 
// Marks grapheme cluster boundaries (UAX #29) in attributes[] for the UTF-16
// text string[0..len).
// A valid surrogate pair is decoded into one code point; a boundary before a
// code point is recorded at the index of its first code unit (pos).
// attributes[len] is written as well (GB2), so the array needs len + 1 entries.
149 static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
150 {
153  for (qsizetype i = 0; i != len; ++i) {
 // pos keeps the index of the first code unit; i may advance onto the low surrogate below
154  qsizetype pos = i;
155  char32_t ucs4 = string[i];
156  if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
157  ushort low = string[i + 1];
158  if (QChar::isLowSurrogate(low)) {
159  ucs4 = QChar::surrogateToUcs4(ucs4, low);
160  ++i;
161  }
162  }
163 
166 
 // table-driven default decision; the state handling below may override it
167  bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
168  bool handled = false;
169 
170  switch (state) {
171  case GB::State::Normal:
172  break; // will deal with it below
173 
177  // keep going in the current state
178  Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
179  handled = true;
180  } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
182  Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
183  handled = true;
184  } else {
186  }
187  break;
188 
192  shouldBreak = false;
193  handled = true;
194  }
195 
197  break;
198 
202  shouldBreak = false;
203  handled = true;
204  }
205 
207  break;
208  }
209 
 // the state-specific handling above did not claim this code point
210  if (!handled) {
215  Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
216  } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
218  Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
219  }
220  } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
222  }
223  }
224 
225  if (shouldBreak)
226  attributes[pos].graphemeBoundary = true;
227 
 // the current class becomes the left-hand class of the next pair
228  lcls = cls;
229  }
230 
231  attributes[len].graphemeBoundary = true; // GB2
232 }
233 
234 
235 namespace WB {
236 
237 enum Action {
241  LookupW
242 };
243 
245 // Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
246  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
247  { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
248  { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
249  { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
250  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
251  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
252  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
253  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
254  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
257  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
258  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
259  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
260  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
261  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
263  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ExtendNumLet
264  { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
265 };
266 
267 } // namespace WB
268 
// Marks word boundaries (UAX #29) and word start/end positions in attributes[]
// for the UTF-16 text string[0..len).
// For every break, wordBreak is set at the index of the first code unit of the
// code point that follows the break (pos). wordEnd is set there too when a word
// was open, and wordStart when the class of the code point at pos starts a word.
// attributes[len] is written as well (WB2), so the array needs len + 1 entries.
269 static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
270 {
 // kind of word currently open; consulted at the next break to emit wordEnd
271  enum WordType {
272  WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
273  } currentWordType = WordTypeNone;
274 
276  auto real_cls = cls; // Unaffected by WB4
277 
278  for (qsizetype i = 0; i != len; ++i) {
 // pos keeps the index of the first code unit; i may advance onto the low surrogate below
279  qsizetype pos = i;
280  char32_t ucs4 = string[i];
281  if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
282  ushort low = string[i + 1];
283  if (QChar::isLowSurrogate(low)) {
284  ucs4 = QChar::surrogateToUcs4(ucs4, low);
285  ++i;
286  }
287  }
288 
291 #ifdef QT_BUILD_INTERNAL
293  // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
294  // which caused "hi.there" to be treated like if it were just a single word;
295  // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
296  // and this code is needed to pass the coverage tests; remove once the issue is fixed.
297  if (ucs4 == 0x002E) // FULL STOP
299  else if (ucs4 == 0x003A) // COLON
301  }
302 #endif
303 
 // pair-table decision for (previous class, current class); the cases below refine it
304  uchar action = WB::breakTable[cls][ncls];
305  switch (action) {
306  case WB::Break:
308  && prop->graphemeBreakClass
310  // WB3c: ZWJ × \p{Extended_Pictographic}
312  }
313  break;
314  case WB::NoBreak:
316  // WB4: X(Extend|Format)* -> X
317  real_cls = ncls;
318  continue;
319  }
321  // WB15/WB16: break between pairs of Regional indicator
323  }
325  && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
326  // WB3d should not be affected by WB4
327  action = WB::Break;
328  }
329  break;
330  case WB::Lookup:
331  case WB::LookupW:
 // the table cannot decide from the pair alone: scan the following code points
332  for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
333  ucs4 = string[lookahead];
334  if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
335  ushort low = string[lookahead + 1];
336  if (QChar::isLowSurrogate(low)) {
337  ucs4 = QChar::surrogateToUcs4(ucs4, low);
338  ++lookahead;
339  }
340  }
341 
342  prop = QUnicodeTables::properties(ucs4);
344 
346  // WB4: X(Extend|Format)* -> X
347  continue;
348  }
349 
350  if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
351  || tcls == QUnicodeTables::WordBreak_ALetter)))) {
 // match found: the main loop resumes from the look-ahead position
352  i = lookahead;
353  ncls = tcls;
355  }
356  break;
357  }
358  if (action != WB::NoBreak) {
359  action = WB::Break;
361  action = WB::NoBreak; // WB7a
362  }
363  break;
364  }
365 
366  cls = ncls;
367  real_cls = ncls;
368 
 // a break at pos closes the open word (if any); cls, just set to ncls,
 // decides below whether a new word starts at pos
369  if (action == WB::Break) {
370  attributes[pos].wordBreak = true;
371  if (currentWordType != WordTypeNone)
372  attributes[pos].wordEnd = true;
373  switch (cls) {
375  currentWordType = WordTypeHiraganaKatakana;
376  attributes[pos].wordStart = true;
377  break;
381  currentWordType = WordTypeAlphaNumeric;
382  attributes[pos].wordStart = true;
383  break;
384  default:
385  currentWordType = WordTypeNone;
386  break;
387  }
388  }
389  }
390 
 // a word still open at the end of the text ends at index len
391  if (currentWordType != WordTypeNone)
392  attributes[len].wordEnd = true;
393  attributes[len].wordBreak = true; // WB2
394 }
395 
396 
397 namespace SB {
398 
399 enum State {
413  Lookup
414 };
415 
416 static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
417 // Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
421 
422  { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
423  { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
424  { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
425  { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
426 
427  { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
428  { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
429  { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
430  { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
431  { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
432 };
433 
434 } // namespace SB
435 
// Marks sentence boundaries (UAX #29) in attributes[] for the UTF-16 text
// string[0..len), driven by the SB::breakTable state machine.
// A boundary before a code point is recorded at the index of its first code
// unit (pos). attributes[len] is written as well (SB2), so the array needs
// len + 1 entries.
436 static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
437 {
438  uchar state = SB::BAfter; // to meet SB1
439  for (qsizetype i = 0; i != len; ++i) {
 // pos keeps the index of the first code unit; i may advance onto the low surrogate below
440  qsizetype pos = i;
441  char32_t ucs4 = string[i];
442  if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
443  ushort low = string[i + 1];
444  if (QChar::isLowSurrogate(low)) {
445  ucs4 = QChar::surrogateToUcs4(ucs4, low);
446  ++i;
447  }
448  }
449 
452 
 // advance the state machine with the class of the current code point
454  state = SB::breakTable[state][ncls];
455  if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
 // assume a break unless the look-ahead below switches the state back to Initial
456  state = SB::Break;
457  for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
458  ucs4 = string[lookahead];
459  if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
460  ushort low = string[lookahead + 1];
461  if (QChar::isLowSurrogate(low)) {
462  ucs4 = QChar::surrogateToUcs4(ucs4, low);
463  ++lookahead;
464  }
465  }
466 
467  prop = QUnicodeTables::properties(ucs4);
469  switch (tcls) {
476  continue;
478  i = lookahead;
479  state = SB::Initial;
480  break;
481  default:
482  break;
483  }
484  break;
485  }
486  }
 // record the boundary at pos and restart from Initial with the current class
487  if (Q_UNLIKELY(state == SB::Break)) {
488  attributes[pos].sentenceBoundary = true;
489  state = SB::breakTable[SB::Initial][ncls];
490  }
491  }
492 
493  attributes[len].sentenceBoundary = true; // SB2
494 }
495 
496 
497 // -----------------------------------------------------------------------------------------------------
498 //
499 // The line breaking algorithm.
500 // See http://www.unicode.org/reports/tr14/tr14-39.html
501 //
502 // -----------------------------------------------------------------------------------------------------
503 
504 namespace LB {
505 
506 namespace NS { // Number Sequence
507 
508 // LB25 recommends to not break lines inside numbers of the form
509 // described by the following regular expression:
510 // (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
511 
512 enum Action {
516  Break
517 };
518 
519 enum Class {
520  XX,
523  NU,
525  CLCP
526 };
527 
// Transition table of the LB25 number-sequence matcher, indexed as
// actionTable[class of the previous element][class of the current element]:
// the row label is at the end of each line, the column labels are in the
// header comment below.
528 static const uchar actionTable[CLCP + 1][CLCP + 1] = {
529 // XX PRPO OPHY NU SYIS CLCP
530  { None , Start , Start , Start , None , None }, // XX
531  { None , Start , Continue, Continue, None , None }, // PRPO
532  { None , Start , Start , Continue, None , None }, // OPHY
533  { Break , Break , Break , Continue, Continue, Continue }, // NU
534  { Break , Break , Break , Continue, Continue, Continue }, // SYIS
535  { Break , Continue, Break , Break , Break , Break }, // CLCP
536 };
537 
539 {
540  switch (lbc) {
541  case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
542  // resolve AI math symbols in numerical context to IS
544  return SYIS;
545  break;
547  return PRPO;
549  return OPHY;
551  return NU;
553  return SYIS;
555  return CLCP;
556  default:
557  break;
558  }
559  return XX;
560 }
561 
562 } // namespace NS
563 
564 /* In order to support the tailored implementation of LB25 properly
565  the following changes were made in the pair table to allow breaks
566  where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
567  (CL)(PO) from IB to DB
568  (CP)(PO) from IB to DB
569  (CL)(PR) from IB to DB
570  (CP)(PR) from IB to DB
571  (PO)(OP) from IB to DB
572  (PR)(OP) from IB to DB
573  (IS)(NU) from IB to DB
574  (SY)(NU) from IB to DB
575 */
576 
577 /* In order to implement LB21a properly, a special rule HH has been introduced and
578  the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
579  (HL)(HY|BA) from IB to CI
580  (HY|BA)(!CB) from DB to HH
581 */
582 
583 enum Action {
591 };
592 
594 /* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/
595 /* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
596 /* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
597 /* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
598 /* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
599 /* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
600 /* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
601 /* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
602 /* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
603 /* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
604 /* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
605 /* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
606 /* NU */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
607 /* AL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
608 /* HL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
609 /* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
610 /* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
611 /* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
612 /* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
613 /* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
614 /* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
615 /* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
616 /* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
617 /* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
618 /* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
619 /* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
620 /* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
621 /* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
622 /* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
623 /* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
624 /* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
625 /* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
626 /* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
627 };
628 
629 // The following line break classes are not treated by the pair table
630 // and must be resolved outside:
631 // AI, BK, CB, CJ, CR, LF, NL, ZWJ, SA, SG, SP, XX
632 
633 } // namespace LB
634 
// Marks line break opportunities (UAX #14) in attributes[] for the UTF-16 text
// string[0..len).
// lineBreak is set at the index of the first code unit (pos) of a code point
// before which the line may be broken; mandatoryBreak is set in addition where
// LB4/LB5 require a break.
// attributes[0] is always cleared (LB2) and attributes[len] always set (LB3),
// so the array needs len + 1 entries.
635 static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
636 {
 // LB25 number-sequence matcher state: index at which the current candidate
 // started and the class of its most recent element
637  qsizetype nestart = 0;
638  LB::NS::Class nelast = LB::NS::XX;
639 
643 
644  for (qsizetype i = 0; i != len; ++i) {
 // pos keeps the index of the first code unit; i may advance onto the low surrogate below
645  qsizetype pos = i;
646  char32_t ucs4 = string[i];
647  if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
648  ushort low = string[i + 1];
649  if (QChar::isLowSurrogate(low)) {
650  ucs4 = QChar::surrogateToUcs4(ucs4, low);
651  ++i;
652  }
653  }
654 
658 
661  && ncls <= QUnicodeTables::LineBreak_JT)
662  || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
663  ) {
664  // LB27: use SPACE for line breaking
665  // "When Korean uses SPACE for line breaking, the classes in rule LB26,
666  // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
667  // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
669  } else {
671  // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
673  if (FLAG(prop->category) & test)
675  }
677  // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
680  }
681  }
682  }
683 
685  // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
687  if (FLAG(prop->category) & test)
689  }
690 
692  // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
694  attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
697  goto next_no_cls_update;
698  }
699  goto next;
700  }
701 
703  if (ncls > QUnicodeTables::LineBreak_SP)
704  goto next; // LB6: x(BK|CR|LF|NL)
705  goto next_no_cls_update; // LB7: xSP
706  }
707 
709  // LB9: treat CM that doesn't follow SP, BK, CR, LF, NL, or ZW as X
711  // don't update anything
712  goto next_no_cls_update;
713  }
714 
716  // LB8a: ZWJ x
717  goto next;
718  }
719 
720  // LB25: do not break lines inside numbers
721  {
723  switch (LB::NS::actionTable[nelast][necur]) {
724  case LB::NS::Break:
725  // do not change breaks before and after the expression
726  for (qsizetype j = nestart + 1; j < pos; ++j)
727  attributes[j].lineBreak = false;
728  Q_FALLTHROUGH();
729  case LB::NS::None:
730  nelast = LB::NS::XX; // reset state
731  break;
732  case LB::NS::Start:
733  nestart = i;
734  Q_FALLTHROUGH();
735  default:
736  nelast = necur;
737  break;
738  }
739  }
740 
742  // LB30a
744  goto next;
745  }
746 
748  && lastProp->category == QChar::Other_NotAssigned
749  && lastProp->graphemeBreakClass
751  // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
752  goto next;
753  }
754 
755  // for South East Asian chars that require a complex analysis, the Unicode
756  // standard recommends to treat them as AL. tailorings that do dictionary analysis can override
759 
760  tcls = cls;
762  // LB10
 // classes at or beyond LineBreak_ZWJ have no column in the pair table and are looked up as AL
764  switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) {
765  case LB::DirectBreak:
766  attributes[pos].lineBreak = true;
767  break;
768  case LB::IndirectBreak:
769  if (lcls == QUnicodeTables::LineBreak_SP)
770  attributes[pos].lineBreak = true;
771  break;
773  if (lcls != QUnicodeTables::LineBreak_SP)
774  goto next_no_cls_update;
775  attributes[pos].lineBreak = true;
776  break;
778  if (lcls != QUnicodeTables::LineBreak_SP)
779  goto next_no_cls_update;
780  break;
782  if (lcls != QUnicodeTables::LineBreak_HL)
783  attributes[pos].lineBreak = true;
784  break;
786  switch (static_cast<QUnicodeTables::EastAsianWidth>(prop->eastAsianWidth)) {
787  default:
788  if (lcls != QUnicodeTables::LineBreak_SP)
789  break;
790  Q_FALLTHROUGH();
794  attributes[pos].lineBreak = true;
795  break;
796  }
797  break;
798  case LB::ProhibitedBreak:
799  // nothing to do
800  default:
801  break;
802  }
803 
 // next: the current code point becomes the reference (cls, lastProp) for the following pair;
 // next_no_cls_update: only lcls, the class of the immediately preceding code point, is updated
804  next:
805  cls = ncls;
806  lastProp = prop;
807  next_no_cls_update:
808  lcls = ncls;
809  }
810 
 // the text ended while a number sequence was pending: treat the end like a following XX class
811  if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
812  // LB25: do not break lines inside numbers
813  for (qsizetype j = nestart + 1; j < len; ++j)
814  attributes[j].lineBreak = false;
815  }
816 
817  attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
818  attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
819 }
820 
821 
822 static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
823 {
824  for (qsizetype i = 0; i != len; ++i) {
825  uint ucs4 = string[i];
826  if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
827  ushort low = string[i + 1];
828  if (QChar::isLowSurrogate(low)) {
829  ucs4 = QChar::surrogateToUcs4(ucs4, low);
830  ++i;
831  }
832  }
833 
834  if (Q_UNLIKELY(QChar::isSpace(ucs4)))
835  attributes[i].whiteSpace = true;
836  }
837 }
838 
839 namespace Tailored {
840 
841 using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
842 
843 
844 enum Form {
845  Invalid = 0x0,
856  Other
857 };
858 
859 static const unsigned char indicForms[0xe00-0x900] = {
860  // Devanagari
865 
870 
875 
879  Nukta, Other, Matra, Matra,
880 
881  Matra, Matra, Matra, Matra,
882  Matra, Matra, Matra, Matra,
883  Matra, Matra, Matra, Matra,
885 
890 
892  Other, Other, Other, Other,
893  Other, Other, Other, Other,
894  Other, Other, Other, Other,
895 
896  Other, Other, Other, Other,
897  Other, Other, Other, Other,
899  Consonant, Consonant /* ??? */, Consonant, Consonant,
900 
901  // Bengali
906 
911 
916 
920  Nukta, Other, Matra, Matra,
921 
922  Matra, Matra, Matra, Matra,
926 
931 
933  Other, Other, Other, Other,
934  Other, Other, Other, Other,
935  Other, Other, Other, Other,
936 
938  Other, Other, Other, Other,
939  Other, Other, Other, Other,
940  Other, Other, Other, Other,
941 
942  // Gurmukhi
947 
952 
957 
961  Nukta, Other, Matra, Matra,
962 
967 
972 
974  Other, Other, Other, Other,
975  Other, Other, Other, Other,
976  Other, Other, Other, Other,
977 
979  Other, Other, Other, Other,
980  Other, Other, Other, Other,
981  Other, Other, Other, Other,
982 
983  // Gujarati
988 
993 
998 
1002  Nukta, Other, Matra, Matra,
1003 
1004  Matra, Matra, Matra, Matra,
1005  Matra, Matra, Invalid, Matra,
1006  Matra, Matra, Invalid, Matra,
1008 
1013 
1015  Other, Other, Other, Other,
1016  Other, Other, Other, Other,
1017  Other, Other, Other, Other,
1018 
1019  Other, Other, Other, Other,
1020  Other, Other, Other, Other,
1021  Other, Other, Other, Other,
1022  Other, Other, Other, Other,
1023 
1024  // Oriya
1029 
1034 
1039 
1043  Nukta, Other, Matra, Matra,
1044 
1045  Matra, Matra, Matra, Matra,
1049 
1054 
1057  Other, Other, Other, Other,
1058  Other, Other, Other, Other,
1059 
1061  Other, Other, Other, Other,
1062  Other, Other, Other, Other,
1063  Other, Other, Other, Other,
1064 
1065  //Tamil
1070 
1075 
1080 
1085 
1086  Matra, Matra, Matra, Invalid,
1088  Matra, Invalid, Matra, Matra,
1090 
1095 
1098  Other, Other, Other, Other,
1099  Other, Other, Other, Other,
1100 
1101  Other, Other, Other, Other,
1102  Other, Other, Other, Other,
1103  Other, Other, Other, Other,
1104  Other, Other, Other, Other,
1105 
1106  // Telugu
1111 
1116 
1121 
1126 
1127  Matra, Matra, Matra, Matra,
1128  Matra, Invalid, Matra, Matra,
1129  Matra, Invalid, Matra, Matra,
1131 
1136 
1139  Other, Other, Other, Other,
1140  Other, Other, Other, Other,
1141 
1142  Other, Other, Other, Other,
1143  Other, Other, Other, Other,
1144  Other, Other, Other, Other,
1145  Other, Other, Other, Other,
1146 
1147  // Kannada
1152 
1157 
1162 
1166  Nukta, Other, Matra, Matra,
1167 
1168  Matra, Matra, Matra, Matra,
1169  Matra, Invalid, Matra, Matra,
1170  Matra, Invalid, Matra, Matra,
1172 
1177 
1180  Other, Other, Other, Other,
1181  Other, Other, Other, Other,
1182 
1183  Other, Other, Other, Other,
1184  Other, Other, Other, Other,
1185  Other, Other, Other, Other,
1186  Other, Other, Other, Other,
1187 
1188  // Malayalam
1193 
1198 
1203 
1208 
1209  Matra, Matra, Matra, Matra,
1211  Matra, Invalid, Matra, Matra,
1213 
1218 
1221  Other, Other, Other, Other,
1222  Other, Other, Other, Other,
1223 
1224  Other, Other, Other, Other,
1225  Other, Other, Other, Other,
1226  Other, Other, Other, Other,
1227  Other, Other, Other, Other,
1228 
1229  // Sinhala
1234 
1239 
1244 
1249 
1254 
1255  Matra, Matra, Matra, Matra,
1257  Matra, Matra, Matra, Matra,
1258  Matra, Matra, Matra, Matra,
1259 
1264 
1266  Other, Other, Other, Other,
1267  Other, Other, Other, Other,
1268  Other, Other, Other, Other,
1269 };
1270 
1271 static inline Form form(unsigned short uc) {
1272  if (uc < 0x900 || uc > 0xdff) {
1273  if (uc == 0x25cc)
1274  return Consonant;
1275  if (uc == 0x200c || uc == 0x200d)
1276  return Control;
1277  return Other;
1278  }
1279  return (Form)indicForms[uc-0x900];
1280 }
1281 
1282 // #define INDIC_DEBUG
1283 #ifdef INDIC_DEBUG
1284 #define IDEBUG qDebug
1285 #else
1286 #define IDEBUG if constexpr (1) ; else qDebug
1287 #endif
1288 
1289 /* syllables are of the form:
1290 
1291  (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1292  (Consonant Nukta? Halant)* Consonant Halant
1293  IndependentVowel VowelMark? StressMark?
1294 
1295  We return syllable boundaries on invalid combinations as well
1296 */
1297 static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1298 {
1299  *invalid = false;
1300  IDEBUG("indic_nextSyllableBoundary: start=%d, end=%d", int(start), int(end));
1301  const char16_t *uc = s+start;
1302 
1303  qsizetype pos = 0;
1304  Form state = form(uc[pos]);
1305  IDEBUG("state[%d]=%d (uc=%4x)", int(pos), state, uc[pos]);
1306  pos++;
1307 
1308  if (state != Consonant && state != IndependentVowel) {
1309  if (state != Other)
1310  *invalid = true;
1311  goto finish;
1312  }
1313 
1314  while (pos < end - start) {
1315  Form newState = form(uc[pos]);
1316  IDEBUG("state[%d]=%d (uc=%4x)", int(pos), newState, uc[pos]);
1317  switch (newState) {
1318  case Control:
1319  newState = state;
1320  if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1321  break;
1322  // the control character should be the last char in the item
1323  if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1324  break;
1325  if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1326  break;
1327  // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1328  ++pos;
1329  goto finish;
1330  case Consonant:
1331  if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1332  break;
1333  goto finish;
1334  case Halant:
1335  if (state == Nukta || state == Consonant)
1336  break;
1337  // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1338  if (script == QChar::Script_Bengali && pos == 1 &&
1339  (uc[0] == 0x0985 || uc[0] == 0x098f))
1340  break;
1341  // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1342  if (script == QChar::Script_Sinhala && state == Matra) {
1343  ++pos;
1344  continue;
1345  }
1346  if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1347  ++pos;
1348  continue;
1349  }
1350  goto finish;
1351  case Nukta:
1352  if (state == Consonant)
1353  break;
1354  goto finish;
1355  case StressMark:
1356  if (state == VowelMark)
1357  break;
1358  Q_FALLTHROUGH();
1359  case VowelMark:
1360  if (state == Matra || state == LengthMark || state == IndependentVowel)
1361  break;
1362  Q_FALLTHROUGH();
1363  case Matra:
1364  if (state == Consonant || state == Nukta)
1365  break;
1366  if (state == Matra) {
1367  // ### needs proper testing for correct two/three part matras
1368  break;
1369  }
1370  // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1371  // it work for all Indic languages?
1372  // the combination Independent_A + Vowel Sign AA is allowed.
1373  if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1374  break;
1375  if (script == QChar::Script_Tamil && state == Matra) {
1376  if (uc[pos-1] == 0x0bc6 &&
1377  (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1378  break;
1379  if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1380  break;
1381  }
1382  goto finish;
1383 
1384  case LengthMark:
1385  if (state == Matra) {
1386  // ### needs proper testing for correct two/three part matras
1387  break;
1388  }
1389  case IndependentVowel:
1390  case Invalid:
1391  case Other:
1392  goto finish;
1393  }
1394  state = newState;
1395  pos++;
1396  }
1397  finish:
1398  return pos+start;
1399 }
1400 
1401 static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1402 {
1403  qsizetype end = from + len;
1404  attributes += from;
1405  qsizetype i = 0;
1406  while (i < len) {
1407  bool invalid;
1408  qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1409  attributes[i].graphemeBoundary = true;
1410 
1411  if (boundary > len-1) boundary = len;
1412  i++;
1413  while (i < boundary) {
1414  attributes[i].graphemeBoundary = false;
1415  ++i;
1416  }
1417  assert(i == boundary);
1418  }
1419 
1420 
1421 }
1422 
// Version number passed to QLibrary::resolve() when looking up the "thai"
// library in init_libthai().
1423 #define LIBTHAI_MAJOR 0
1424 
1425 /*
1426  * if libthai changed please update these codes too.
1427  */
// Cell description filled in by th_next_cell(); presumably a local copy of
// libthai's own struct, so the layout must stay in sync with that library.
1428 struct thcell_t {
1429  unsigned char base;
1430  unsigned char hilo;
1431  unsigned char top;
1432 };
// Signatures of the two functions resolved at run time under the symbol
// names "th_brk" and "th_next_cell".
1433 typedef int (*th_brk_def) (const unsigned char*, int*, size_t);
1434 typedef size_t (*th_next_cell_def) (const unsigned char *, size_t, struct thcell_t *, int);
1435 
1436 /* libthai related function handles */
// Both stay nullptr until init_libthai() has resolved them.
1437 static th_brk_def th_brk = nullptr;
1438 static th_next_cell_def th_next_cell = nullptr;
1439 
1440 static int init_libthai() {
1441 #if QT_CONFIG(library)
1442  static bool initialized = false;
1443  if (!initialized && (!th_brk || !th_next_cell)) {
1444  th_brk = reinterpret_cast<th_brk_def>(QLibrary::resolve(QLatin1String("thai"), static_cast<int>(LIBTHAI_MAJOR), "th_brk"));
1445  th_next_cell = (th_next_cell_def)QLibrary::resolve(QLatin1String("thai"), LIBTHAI_MAJOR, "th_next_cell");
1446  initialized = true;
1447  }
1448  if (th_brk && th_next_cell)
1449  return 1;
1450  else
1451 #endif
1452  return 0;
1453 }
1454 
1455 static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
1456 {
1457  qsizetype i;
1458  unsigned char *result = reinterpret_cast<unsigned char *>(cstr);
1459 
1460  for (i = 0; i < len; ++i) {
1461  if (string[i] <= 0xa0)
1462  result[i] = static_cast<unsigned char>(string[i]);
1463  else if (string[i] >= 0xe01 && string[i] <= 0xe5b)
1464  result[i] = static_cast<unsigned char>(string[i] - 0xe00 + 0xa0);
1465  else
1466  result[i] = static_cast<unsigned char>(~0); // Same encoding as libthai uses for invalid chars
1467  }
1468 
1469  result[len] = 0;
1470 }
1471 
1472 /*
1473  * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1474  */
1475 static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1476 {
1477  char s[128];
1478  char *cstr = s;
1479  int *break_positions = nullptr;
1480  int brp[128];
1481  int brp_size = 0;
1482  qsizetype numbreaks, i, j, cell_length;
1483  struct thcell_t tis_cell;
1484 
1485  if (!init_libthai())
1486  return ;
1487 
1488  if (len >= 128)
1489  cstr = static_cast<char *>(malloc (len * sizeof(char) + 1));
1490 
1491  to_tis620(string, len, cstr);
1492 
1493  for (i = 0; i < len; ++i) {
1494  attributes[i].wordBreak = false;
1495  attributes[i].wordStart = false;
1496  attributes[i].wordEnd = false;
1497  attributes[i].lineBreak = false;
1498  }
1499 
1500  if (len > 128) {
1501  break_positions = static_cast<int *>(malloc (sizeof(int) * len));
1502  memset (break_positions, 0, sizeof(int) * len);
1503  brp_size = len;
1504  }
1505  else {
1506  break_positions = brp;
1507  brp_size = 128;
1508  }
1509 
1510  if (break_positions) {
1511  attributes[0].wordBreak = true;
1512  attributes[0].wordStart = true;
1513  attributes[0].wordEnd = false;
1514  numbreaks = th_brk(reinterpret_cast<const unsigned char *>(cstr), break_positions, brp_size);
1515  for (i = 0; i < numbreaks; ++i) {
1516  attributes[break_positions[i]].wordBreak = true;
1517  attributes[break_positions[i]].wordStart = true;
1518  attributes[break_positions[i]].wordEnd = true;
1519  attributes[break_positions[i]].lineBreak = true;
1520  }
1521  if (numbreaks > 0)
1522  attributes[break_positions[numbreaks - 1]].wordStart = false;
1523 
1524  if (break_positions != brp)
1525  free(break_positions);
1526  }
1527 
1528  /* manage grapheme boundaries */
1529  i = 0;
1530  while (i < len) {
1531  cell_length = static_cast<uint>(th_next_cell(reinterpret_cast<const unsigned char *>(cstr) + i, len - i, &tis_cell, true));
1532 
1533 
1534  attributes[i].graphemeBoundary = true;
1535  for (j = 1; j < cell_length; j++)
1536  attributes[i + j].graphemeBoundary = false;
1537 
1538  i += cell_length;
1539  }
1540 
1541  if (len >= 128)
1542  free(cstr);
1543 }
1544 
// Entry point for Thai runs with the same signature as the other
// *Attributes functions in this file: offsets text and attributes by from
// and forwards the run to thaiAssignAttributes().
1545 static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1546 {
1548  const char16_t *uc = text + from;
1549  attributes += from;
1550  Q_UNUSED(script);
1551  thaiAssignAttributes(uc, len, attributes);
1552 }
1553 
1554 /*
1555  tibetan syllables are of the form:
1556  head position consonant
1557  first sub-joined consonant
1558  ....intermediate sub-joined consonants (if any)
1559  last sub-joined consonant
1560  sub-joined vowel (a-chung U+0F71)
1561  standard or compound vowel sign (or 'virama' for devanagari transliteration)
1562 */
1563 
1564 typedef enum {
1569  TibetanVowel
1571 
1572 /* this table starts at U+0f40 */
1573 static const unsigned char tibetanForm[0x80] = {
1578 
1583 
1588 
1593 
1598 
1603 
1608 
1613 };
1614 
// Looks up the TibetanForm of code unit c: values in [0x0f40, 0x0fc0) index
// into tibetanForm, everything else is TibetanOther. The argument appears
// three times in the expansion, so it must not have side effects.
1615 #define tibetan_form(c) \
1616  ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1617 
1618 static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1619 {
1620  const char16_t *uc = s + start;
1621 
1622  qsizetype pos = 0;
1624 
1625 /* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1626  pos++;
1627 
1628  if (state != TibetanHeadConsonant) {
1629  if (state != TibetanOther)
1630  *invalid = true;
1631  goto finish;
1632  }
1633 
1634  while (pos < end - start) {
1636  switch (newState) {
1638  case TibetanSubjoinedVowel:
1639  if (state != TibetanHeadConsonant &&
1641  goto finish;
1642  state = newState;
1643  break;
1644  case TibetanVowel:
1645  if (state != TibetanHeadConsonant &&
1648  goto finish;
1649  break;
1650  case TibetanOther:
1651  case TibetanHeadConsonant:
1652  goto finish;
1653  }
1654  pos++;
1655  }
1656 
1657 finish:
1658  *invalid = false;
1659  return start+pos;
1660 }
1661 
1662 static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1663 {
1664  qsizetype end = from + len;
1665  qsizetype i = 0;
1666  Q_UNUSED(script);
1667  attributes += from;
1668  while (i < len) {
1669  bool invalid;
1670  qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1671 
1672  attributes[i].graphemeBoundary = true;
1673 
1674  if (boundary > len-1) boundary = len;
1675  i++;
1676  while (i < boundary) {
1677  attributes[i].graphemeBoundary = false;
1678  ++i;
1679  }
1680  assert(i == boundary);
1681  }
1682 }
1683 
1686  Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
1687  Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
1688  Mymr_CC_NGA = 3, /* Consonant NGA */
1689  Mymr_CC_YA = 4, /* Consonant YA */
1690  Mymr_CC_RA = 5, /* Consonant RA */
1691  Mymr_CC_WA = 6, /* Consonant WA */
1692  Mymr_CC_HA = 7, /* Consonant HA */
1693  Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
1694  Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
1695  Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
1696  Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */
1697  Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, prebase (Vowel u, uu) */
1698  Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, prebase (Vowel i, ii, ai) */
1699  Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, prebase (Vowel aa) */
1703  Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
1704  Mymr_CC_COUNT = 19 /* This is the number of character classes */
1705 };
1706 
1708  Mymr_CF_CLASS_MASK = 0x0000FFFF,
1709 
1710  Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
1711  Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
1712  Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
1713  Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
1714  Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
1715  first in a syllable */
1716  Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */
1717 
1718  /* position flags */
1719  Mymr_CF_POS_BEFORE = 0x00080000,
1720  Mymr_CF_POS_BELOW = 0x00040000,
1721  Mymr_CF_POS_ABOVE = 0x00020000,
1722  Mymr_CF_POS_AFTER = 0x00010000,
1723  Mymr_CF_POS_MASK = 0x000f0000,
1724 
1725  Mymr_CF_AFTER_KINZI = 0x00100000
1726 };
1727 
1729 
1730 /* Characters that get referred to by name */
1732 {
1736  Mymr_C_RA = 0x101B,
1737  Mymr_C_YA = 0x101A,
1738  Mymr_C_NGA = 0x1004,
1739  Mymr_C_VOWEL_E = 0x1031,
1740  Mymr_C_VIRAMA = 0x1039
1741 };
1742 
1743 enum
1744 {
1762 };
1763 
1764 
1765 typedef int MymrCharClass;
1766 
1767 
1768 static const MymrCharClass mymrCharClasses[] =
1769 {
1771  Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, /* 1000 - 100F */
1773  Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, /* 1010 - 101F */
1775  Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, /* 1020 - 102F */
1777  Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1030 - 103F */
1779  Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1040 - 104F */
1781  Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1050 - 105F */
1782 };
1783 
1784 static MymrCharClass
1785 getMyanmarCharClass (ushort ch)
1786 {
1787  if (ch == Mymr_C_SIGN_ZWJ)
1789 
1790  if (ch == Mymr_C_SIGN_ZWNJ)
1792 
1793  if (ch < 0x1000 || ch > 0x105f)
1794  return Mymr_CC_RESERVED;
1795 
1796  return mymrCharClasses[ch - 0x1000];
1797 }
1798 
// Transition table for myanmar_nextSyllableBoundary(): the row is the
// current state, the column is the character class (the low bits selected by
// Mymr_CF_CLASS_MASK), and the entry is the next state. A negative entry
// ends the syllable; for -2 the scanner additionally steps back one
// character (see the note at the end of the table).
1799 static const signed char mymrStateTable[][Mymr_CC_COUNT] =
1800 {
1801 /* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
1802  { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
1803  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
1804  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
1805  {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
1806  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
1807  {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
1808  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
1809  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
1810  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
1811  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
1812  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
1813  {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
1814  {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
1815  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
1816  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
1817  {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
1818  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
1819  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
1820  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
1821  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
1822  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
1823  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
1824  {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
1825  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
1826  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
1827  {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
1828  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
1829  {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
1830 /* exit state -2 is for invalid order of medials and combination of invalids
1831  with virama where virama should treat as start of next syllable
1832  */
1833 };
1834 
/*#define MYANMAR_DEBUG */
#ifdef MYANMAR_DEBUG
#define MMDEBUG qDebug
#else
// Disabled form: the qDebug call is still compiled but sits in a branch that
// is never taken. Unlike a bare "if (0) printf", this form supplies its own
// else, so MMDEBUG(...) cannot capture the else of an enclosing unbraced if.
// It is the same pattern IDEBUG uses.
#define MMDEBUG if constexpr (1) ; else qDebug
#endif
1843 
1844 /*
1845 // Given an input string of characters and a location in which to start looking
1846 // calculate, using the state table, which one is the last character of the syllable
1847 // that starts in the starting position.
1848 */
1849 static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1850 {
1851  const char16_t *uc = s + start;
1852  int state = 0;
1853  qsizetype pos = start;
1854  *invalid = false;
1855 
1856  while (pos < end) {
1857  MymrCharClass charClass = getMyanmarCharClass(*uc);
1858  state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
1859  if (pos == start)
1860  *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
1861 
1862  MMDEBUG("state[%d]=%d class=%8x (uc=%4x)", int(pos - start), state, charClass, *uc);
1863 
1864  if (state < 0) {
1865  if (state < -1)
1866  --pos;
1867  break;
1868  }
1869  ++uc;
1870  ++pos;
1871  }
1872  return pos;
1873 }
1874 
1875 static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1876 {
1877  qsizetype end = from + len;
1878  qsizetype i = 0;
1879  Q_UNUSED(script);
1880  attributes += from;
1881  while (i < len) {
1882  bool invalid;
1883  qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1884 
1885  attributes[i].graphemeBoundary = true;
1886  attributes[i].lineBreak = true;
1887 
1888  if (boundary > len-1)
1889  boundary = len;
1890  i++;
1891  while (i < boundary) {
1892  attributes[i].graphemeBoundary = false;
1893  ++i;
1894  }
1895  assert(i == boundary);
1896  }
1897 }
1898 
1899 /*
1900 // Vocabulary
1901 // Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
1902 // center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
1903 // split vowels, signs... but there is only one base in a syllable, it has to be coded as
1904 // the first character of the syllable.
1905 // split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
1906 // Khmer language has five of them. Khmer split vowels either have one part before the
1907 // base and one after the base or they have a part before the base and a part above the base.
1908 // The first part of all Khmer split vowels is the same character, identical to
1909 // the glyph of Khmer dependent vowel SRA EI
1910 // coeng --> modifier used in Khmer to construct coeng (subscript) consonants
1911 // Differently than indian languages, the coeng modifies the consonant that follows it,
1912 // not the one preceding it Each consonant has two forms, the base form and the subscript form
1913 // the base form is the normal one (using the consonants code-point), the subscript form is
1914 // displayed when the combination coeng + consonant is encountered.
1915 // Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
1916 // Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
1917 // Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
1918 // Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
1919 // if it is attached to a consonant of the first series or a consonant of the second series
1920 // Most consonants have an equivalent in the other series, but some of them exist only in
1921 // one series (for example SA). If we want to use the consonant SA with a vowel sound that
1922 // can only be produced by a vowel accompanying a consonant
1923 // of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
1924 // (x17CA and x17C9). TRIISAP changes a first series consonant to second series sound and
1925 // MUSIKATOAN a second series consonant to have a first series vowel sound.
1926 // The consonant shifters are both normally superscript marks, but, when they are followed by a
1927 // superscript, they change shape and take the form of subscript dependent vowel SRA U.
1928 // If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
1929 // should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
1930 // be placed after the coeng consonant.
1931 // Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
1932 // Each vowel has its own position. Only one vowel per syllable is allowed.
1933 // Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
1934 // Allowed in a syllable.
1935 //
1936 //
1937 // order is important here! This order must be the same that is found in each horizontal
1938 // line in the statetable for Khmer (see khmerStateTable) .
1939 */
1942  CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
1943  CC_CONSONANT2 = 2, /* Consonant of type 2 */
1944  CC_CONSONANT3 = 3, /* Consonant of type 3 */
1945  CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
1947  CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
1948  CC_COENG = 7, /* Subscript consonant combining character */
1952  CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
1953  CC_COUNT = 12 /* This is the number of character classes */
1954 };
1955 
1956 
1958  CF_CLASS_MASK = 0x0000FFFF,
1959 
1960  CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
1961  CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
1962  CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
1963  CF_COENG = 0x08000000, /* flag to speed up comparing */
1964  CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
1965  CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */
1966 
1967  /* position flags */
1968  CF_POS_BEFORE = 0x00080000,
1969  CF_POS_BELOW = 0x00040000,
1970  CF_POS_ABOVE = 0x00020000,
1971  CF_POS_AFTER = 0x00010000,
1972  CF_POS_MASK = 0x000f0000
1973 };
1974 
1976 
1977 /* Characters that get referred to by name */
1979  C_SIGN_ZWNJ = 0x200C,
1980  C_SIGN_ZWJ = 0x200D,
1981  C_RO = 0x179A,
1982  C_VOWEL_AA = 0x17B6,
1983  C_SIGN_NIKAHIT = 0x17C6,
1984  C_VOWEL_E = 0x17C1,
1985  C_COENG = 0x17D2
1986 };
1987 
1988 
1989 /*
1990 // simple classes, they are used in the statetable (in this file) to control the length of a syllable
1991 // they are also used to know where a character should be placed (location in reference to the base character)
1992 // and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
1993 // indicate error in syllable construction
1994 */
1995 enum {
2009 
2010  /* split vowel */
2012  _vr = _dr | CF_SPLIT_VOWEL
2013 };
2014 
2015 
2016 /*
2017 // Character class: a character class value
2018 // ORed with character class flags.
2019 */
2020 typedef unsigned long KhmerCharClass;
2021 
2022 
2023 /*
2024 // Character class tables
2025 // _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
2026 // _sa Sign placed above the base
2027 // _sp Sign placed after the base
2028 // _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2029 // _c2 Consonant of type 2 (only RO)
2030 // _c3 Consonant of type 3
2031 // _rb Khmer sign robat u17CC. combining mark for subscript consonants
2032 // _cs Consonant-shifter
2033 // _dl Dependent vowel placed before the base (left of the base)
2034 // _db Dependent vowel placed below the base
2035 // _da Dependent vowel placed above the base
2036 // _dr Dependent vowel placed behind the base (right of the base)
2037 // _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2038 // it to create a subscript consonant or independent vowel
2039 // _va Khmer split vowel in which the first part is before the base and the second one above the base
2040 // _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2041 */
// One entry per code point of the range 0x1780..0x17DF (six rows of sixteen).
// getKhmerCharClass() indexes this table with (uc - KhmerFirstChar).
2042 static const KhmerCharClass khmerCharClasses[] = {
2043  _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2044  _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2045  _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2046  _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2047  _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2048  _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2049 };
2050 
2051 /* this enum must reflect the range of khmerCharClasses */
2053  KhmerFirstChar = 0x1780,
2054  KhmerLastChar = 0x17df
2055 };
2056 
2057 /*
2058 // Below we define how a character in the input string is either in the khmerCharClasses table
2059 // (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2060 // within the syllable, but are not in the table) we also get their type back, or an unknown object
2061 // in which case we get _xx (CC_RESERVED) back
2062 */
2063 static KhmerCharClass getKhmerCharClass(ushort uc)
2064 {
2065  if (uc == C_SIGN_ZWJ) {
2066  return CC_ZERO_WIDTH_J_MARK;
2067  }
2068 
2069  if (uc == C_SIGN_ZWNJ) {
2070  return CC_ZERO_WIDTH_NJ_MARK;
2071  }
2072 
2073  if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2074  return CC_RESERVED;
2075  }
2076 
2077  return khmerCharClasses[uc - KhmerFirstChar];
2078 }
2079 
2080 
2081 /*
2082 // The stateTable is used to calculate the end (the length) of a well
2083 // formed Khmer Syllable.
2084 //
2085 // Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2086 // CharClassValues. This coincidence of values allows the follow up of the table.
2087 //
2088 // Each line corresponds to a state, which does not necessarily need to be a type
2089 // of component... for example, state 2 is a base, which is always a first character
2090 // in the syllable, but the state could be produced by a consonant of any type when
2091 // it is the first character that is analysed (in ground state).
2092 //
2093 // Differentiating 3 types of consonants is necessary in order to
2094 // forbid the use of certain combinations, such as having a second
2095 // coeng after a coeng RO,
2096 // The inexistent possibility of having a type 3 after another type 3 is permitted,
2097 // eliminating it would very much complicate the table, and it does not create typing
2098 // problems, as the case above.
2099 //
2100 // The table is quite complex, in order to limit the number of coeng consonants
2101 // to 2 (by means of the table).
2102 //
2103 // There is a peculiarity, as far as Unicode is concerned:
2104 // - The consonant-shifter is considered in two possible different
2105 // locations, the one considered in Unicode 3.0 and the one considered in
2106 // Unicode 4.0. (there is a backwards compatibility problem in this standard).
2107 //
2108 //
2109 // xx independent character, such as a number, punctuation sign or non-khmer char
2110 //
2111 // c1 Khmer consonant of type 1 or an independent vowel
2112 // that is, a letter in which the subscript form is only under the
2113 // base, not taking any space to the right or to the left
2114 //
2115 // c2 Khmer consonant of type 2, the coeng form takes space under
2116 // and to the left of the base (only RO is of this type)
2117 //
2118 // c3 Khmer consonant of type 3. Its subscript form takes space under
2119 // and to the right of the base.
2120 //
2121 // cs Khmer consonant shifter
2122 //
2123 // rb Khmer robat
2124 //
2125 // co coeng character (u17D2)
2126 //
2127 // dv dependent vowel (including split vowels, they are treated in the same way).
2128 // even if dv is not defined above, the component that is really tested for is
2129 // KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2130 //
2131 // zwj Zero Width joiner
2132 //
2133 // zwnj Zero width non joiner
2134 //
2135 // sa above sign
2136 //
2137 // sp post sign
2138 //
2139 // there are lines with equal content but for an easier understanding
2140 // (and maybe change in the future) we did not join them
2141 */
// Transition table for khmer_nextSyllableBoundary(): the row is the current
// state, the column is the character class (the low bits selected by
// CF_CLASS_MASK), and the entry is the next state. An entry of -1 ends the
// syllable in front of the character that was just examined.
2142 static const signed char khmerStateTable[][CC_COUNT] =
2143 {
2144  /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2145  { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2146  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2147  {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2148  {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel */
2149  {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2150  {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2151  {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2152  {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2153  {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2154  {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
2155  {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2156  {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2157  {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2158  {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2159  {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2160  {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2161  {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2162  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2163  {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2164  {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2165  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2166 };
2167 
2168 
/* #define KHMER_DEBUG */
#ifdef KHMER_DEBUG
#define KHDEBUG qDebug
#else
// Disabled form: the qDebug call is still compiled but sits in a branch that
// is never taken. Unlike a bare "if (0) printf", this form supplies its own
// else, so KHDEBUG(...) cannot capture the else of an enclosing unbraced if.
// It is the same pattern IDEBUG uses.
#define KHDEBUG if constexpr (1) ; else qDebug
#endif
2177 
2178 /*
2179 // Given an input string of characters and a location in which to start looking
2180 // calculate, using the state table, which one is the last character of the syllable
2181 // that starts in the starting position.
2182 */
2183 static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2184 {
2185  const char16_t *uc = s + start;
2186  int state = 0;
2187  qsizetype pos = start;
2188  *invalid = false;
2189 
2190  while (pos < end) {
2191  KhmerCharClass charClass = getKhmerCharClass(*uc);
2192  if (pos == start) {
2193  *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2194  }
2195  state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2196 
2197  KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", int(pos - start), state,
2198  charClass, *uc );
2199 
2200  if (state < 0) {
2201  break;
2202  }
2203  ++uc;
2204  ++pos;
2205  }
2206  return pos;
2207 }
2208 
2209 static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2210 {
2211  qsizetype end = from + len;
2212  qsizetype i = 0;
2213  Q_UNUSED(script);
2214  attributes += from;
2215  while ( i < len ) {
2216  bool invalid;
2217  qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2218 
2219  attributes[i].graphemeBoundary = true;
2220 
2221  if ( boundary > len-1 ) boundary = len;
2222  i++;
2223  while ( i < boundary ) {
2224  attributes[i].graphemeBoundary = false;
2225  ++i;
2226  }
2227  assert( i == boundary );
2228  }
2229 }
2230 
2231 
2233 // Script_Unknown,
2234  nullptr,
2235 // Script_Inherited,
2236  nullptr,
2237 // Script_Common,
2238  nullptr,
2239 // Script_Latin,
2240  nullptr,
2241 // Script_Greek,
2242  nullptr,
2243 // Script_Cyrillic,
2244  nullptr,
2245 // Script_Armenian,
2246  nullptr,
2247 // Script_Hebrew,
2248  nullptr,
2249 // Script_Arabic,
2250  nullptr,
2251 // Script_Syriac,
2252  nullptr,
2253 // Script_Thaana,
2254  nullptr,
2255 // Script_Devanagari,
2256  indicAttributes,
2257 // Script_Bengali,
2258  indicAttributes,
2259 // Script_Gurmukhi,
2260  indicAttributes,
2261 // Script_Gujarati,
2262  indicAttributes,
2263 // Script_Oriya,
2264  indicAttributes,
2265 // Script_Tamil,
2266  indicAttributes,
2267 // Script_Telugu,
2268  indicAttributes,
2269 // Script_Kannada,
2270  indicAttributes,
2271 // Script_Malayalam,
2272  indicAttributes,
2273 // Script_Sinhala,
2274  indicAttributes,
2275 // Script_Thai,
2276  thaiAttributes,
2277 // Script_Lao,
2278  nullptr,
2279 // Script_Tibetan,
2280  tibetanAttributes,
2281 // Script_Myanmar,
2282  myanmarAttributes,
2283 // Script_Georgian,
2284  nullptr,
2285 // Script_Hangul,
2286  nullptr,
2287 // Script_Ethiopic,
2288  nullptr,
2289 // Script_Cherokee,
2290  nullptr,
2291 // Script_CanadianAboriginal,
2292  nullptr,
2293 // Script_Ogham,
2294  nullptr,
2295 // Script_Runic,
2296  nullptr,
2297 // Script_Khmer,
2298  khmerAttributes
2299 };
2300 
2301 static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2302  const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2303  QCharAttributes *attributes)
2304 {
2305  if (stringLength == 0)
2306  return;
2307  for (qsizetype i = 0; i < numItems; ++i) {
2308  QChar::Script script = items[i].script;
2311  CharAttributeFunction attributeFunction = charAttributeFunction[script];
2312  if (!attributeFunction)
2313  continue;
2314  qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2315  attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2316  }
2317 }
2318 
2319 }
2320 
// Computes the character attributes of \a string into \a attributes.
//
// string:     text to analyse; nothing is done when it is empty.
// items:      script runs used for the tailored pass; may be null.
// numItems:   number of entries in items.
// attributes: output array. Unless DontClearAttributes is set it is zeroed for
//             string.size() + 1 entries, so it must be at least that large.
// options:    bit set selecting which passes run (GraphemeBreaks, WordBreaks,
//             SentenceBreaks, LineBreaks, WhiteSpaces) plus DontClearAttributes.
2321 Q_CORE_EXPORT void initCharAttributes(QStringView string,
2322  const ScriptItem *items, qsizetype numItems,
2323  QCharAttributes *attributes, CharAttributeOptions options)
2324 {
2325  if (string.size() <= 0)
2326  return;
2327 
// Start from a clean slate unless the caller asked to keep existing values.
2328  if (!(options & DontClearAttributes))
2329  ::memset(attributes, 0, (string.size() + 1) * sizeof(QCharAttributes));
2330 
// Default passes, each enabled by its own option bit.
2331  if (options & GraphemeBreaks)
2332  getGraphemeBreaks(string.utf16(), string.size(), attributes);
2333  if (options & WordBreaks)
2334  getWordBreaks(string.utf16(), string.size(), attributes);
2335  if (options & SentenceBreaks)
2336  getSentenceBreaks(string.utf16(), string.size(), attributes);
2337  if (options & LineBreaks)
2338  getLineBreaks(string.utf16(), string.size(), attributes, options);
2339  if (options & WhiteSpaces)
2340  getWhiteSpaces(string.utf16(), string.size(), attributes);
2341 
// NOTE(review): listing line 2342 is missing from this copy, and the closing
// brace at 2347 has no visible opener. Presumably a conditional wraps the
// tailored pass below - confirm against the upstream file before editing.
// The tailored pass is skipped when no script items were supplied.
2343  if (!items || numItems <= 0)
2344  return;
2345 
2346  Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2347  }
2348 }
2349 
2350 
2351 // ----------------------------------------------------------------------------
2352 //
2353 // The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2354 //
2355 // ----------------------------------------------------------------------------
2356 
// Splits \a string into runs and appends one ScriptItem{start, script} per run
// to \a scripts. The final run is always appended after the loop.
//
// NOTE(review): this copy is missing listing lines 2361, 2373, 2381, 2391,
// 2395 and 2403. The declarations of `script`, `prop` and `test`, and the
// opener of the block closed at 2386, are therefore not visible here - confirm
// against the upstream file before changing any logic.
2357 Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2358 {
// sor / eor: start and end index of the run currently being collected.
2359  qsizetype sor = 0;
2360  qsizetype eor = 0;
2362 
// eor is advanced to i after every iteration, including those ended by continue.
2363  for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
2364  char32_t ucs4 = string[i].unicode();
// Combine a valid surrogate pair into one code point and consume both units.
2365  if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2366  ushort low = string[i + 1].unicode();
2367  if (QChar::isLowSurrogate(low)) {
2368  ucs4 = QChar::surrogateToUcs4(ucs4, low);
2369  ++i;
2370  }
2371  }
2372 
2374 
2375  QChar::Script nscript = QChar::Script(prop->script);
2376 
// Same script as the current run, or a script value <= Script_Common:
// the character simply extends the current run.
2377  if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2378  continue;
2379 
2380  // inherit preceding Common-s
2382  // also covers a case where the base character of Common script followed
2383  // by one or more combining marks of non-Inherited, non-Common script
2384  script = nscript;
2385  continue;
2386  }
2387 
2388  // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2389  // Thus, a combining mark - whatever its script property value is - should inherit
2390  // the script property value of its base character.
2392  if (Q_UNLIKELY(FLAG(prop->category) & test))
2393  continue;
2394 
// Script change: close the run [sor, eor) and start a new one at eor.
2396  Q_ASSERT(sor < eor);
2397  scripts->append(ScriptItem{sor, script});
2398  sor = eor;
2399 
2400  script = nscript;
2401  }
2402 
// Flush the last run, which extends to the end of the string.
2404  Q_ASSERT(eor == string.size());
2405  scripts->append(ScriptItem{sor, script});
2406 }
2407 
2408 } // namespace QUnicodeTools
2409 
small capitals from c petite p scientific f u
Definition: afcover.h:88
small capitals from c petite p scientific i
[1]
Definition: afcover.h:80
xD9 x84 xD8 xAD xD9 x80 xF0 x90 xAC x9A xE0 xA7 xA6 xE0 xA7 xAA xF0 x91 x84 xA4 xF0 x91 x84 x89 xF0 x91 x84 x9B xF0 x90 x8A xAB xF0 x90 x8B x89 xE2 xB2 x9E xE2 xB2 x9F xD0 xBE xD0 x9E xF0 x90 x90 x84 xF0 x90 x90 xAC xE1 x83 x98 xE1 x83 x94 xE1 x83 x90 xE1 xB2 xBF xE2 xB0 x95 xE2 xB1 x85 xCE xBF xCE x9F xE0 xA8 xA0 xE0 xA8 xB0 xE0 xA9 xA6 Kayah xEA xA4 x8D xEA xA4 x80 Khmer xE1 xA7 xA1 xE1 xA7 xAA xE0 xBB x90 Latin Subscript xE2 x82 x92 xE2 x82 x80 xEA x93 xB3 xF0 x96 xB9 xA1 xF0 x96 xB9 x9B xF0 x96 xB9 xAF xE1 x80 x9D xE1 x80 x84 xE1 x80 x82 no script
Definition: afscript.h:271
static constexpr char32_t surrogateToUcs4(char16_t high, char16_t low) noexcept
Definition: qchar.h:539
Category
Definition: qchar.h:140
@ Mark_SpacingCombining
Definition: qchar.h:142
@ Symbol_Math
Definition: qchar.h:173
@ Mark_NonSpacing
Definition: qchar.h:141
@ Mark_Enclosing
Definition: qchar.h:143
@ Other_NotAssigned
Definition: qchar.h:157
constexpr bool isLowSurrogate() const noexcept
Definition: qchar.h:511
Script
Definition: qchar.h:180
@ Script_Tamil
Definition: qchar.h:198
@ Script_Thai
Definition: qchar.h:203
@ Script_Kannada
Definition: qchar.h:200
@ Script_Common
Definition: qchar.h:183
@ Script_Malayalam
Definition: qchar.h:201
@ Script_Bengali
Definition: qchar.h:194
@ Script_Khmer
Definition: qchar.h:214
@ Script_Sinhala
Definition: qchar.h:202
constexpr bool isSpace() const noexcept
Definition: qchar.h:497
constexpr bool isHighSurrogate() const noexcept
Definition: qchar.h:510
The QLatin1String class provides a thin wrapper around an US-ASCII/Latin-1 encoded string literal.
Definition: qstring.h:84
QFunctionPointer resolve(const char *symbol)
Definition: qlibrary.cpp:1024
The QStringView class provides a unified view on UTF-16 strings with a read-only subset of the QStrin...
Definition: qstringview.h:122
void append(const T &t)
switch(msgBox.exec())
const QLoggingCategory & category()
[1]
QString text
[meta data]
else opt state
[0]
void newState(QList< State > &states, const char *token, const char *lexem, bool pre)
short next
Definition: keywords.cpp:454
QHighDpiScaling::Point position(T, QHighDpiScaling::Point::Kind)
Q_DECL_CONST_FUNCTION Q_CORE_EXPORT const Properties *QT_FASTCALL properties(char32_t ucs4) noexcept
@ GraphemeBreak_Extended_Pictographic
Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
const CharAttributeFunction charAttributeFunction[]
int(* th_brk_def)(const unsigned char *, int *, size_t)
void(*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) CharAttributeFunction
size_t(* th_next_cell_def)(const unsigned char *, size_t, struct thcell_t *, int)
Q_CORE_EXPORT void initCharAttributes(QStringView string, const ScriptItem *items, qsizetype numItems, QCharAttributes *attributes, CharAttributeOptions options)
Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
typing.Tuple[int, int] test(str binary_directory, *debug=False)
action
Definition: devices.py:78
set set set set set set set macro pixldst1 abits if abits op else op endif endm macro pixldst2 abits if abits op else op endif endm macro pixldst4 abits if abits op else op endif endm macro pixldst0 abits op endm macro pixldst3 mem_operand op endm macro pixldst30 mem_operand op endm macro pixldst abits if abits elseif abits elseif abits elseif abits elseif abits pixldst0 abits else pixldst0 abits pixldst0 abits pixldst0 abits pixldst0 abits endif elseif abits else pixldst0 abits pixldst0 abits endif elseif abits else error unsupported bpp *numpix else pixst endif endm macro vuzp8 reg2 vuzp d d &reg2 endm macro vzip8 reg2 vzip d d &reg2 endm macro pixdeinterleave basereg basereg basereg basereg basereg endif endm macro pixinterleave basereg basereg basereg basereg basereg endif endm macro PF boost_increment endif if endif PF tst PF addne PF subne PF cmp ORIG_W if endif if endif if endif PF subge ORIG_W PF subges if endif if endif if endif endif endm macro cache_preload_simple endif if dst_r_bpp pld[DST_R, #(PREFETCH_DISTANCE_SIMPLE *dst_r_bpp/8)] endif if mask_bpp pld if[MASK, #(PREFETCH_DISTANCE_SIMPLE *mask_bpp/8)] endif endif endm macro ensure_destination_ptr_alignment process_pixblock_tail_head if beq irp skip1(dst_w_bpp<=(lowbit *8)) &&((lowbit *8)<(pixblock_size *dst_w_bpp)) .if lowbit< 16 tst DST_R
[3]
void
Definition: png.h:1080
#define assert
Definition: qcborcommon_p.h:63
#define Q_FALLTHROUGH()
#define Q_UNLIKELY(x)
#define Q_LIKELY(x)
#define Q_DECLARE_MIXED_ENUM_OPERATORS(Ret, Flags, Enum)
Definition: qflags.h:267
QT_BEGIN_INCLUDE_NAMESPACE typedef unsigned char uchar
Definition: qglobal.h:332
#define Q_AUTOTEST_EXPORT
Definition: qglobal.h:579
unsigned short quint16
Definition: qglobal.h:286
ptrdiff_t qsizetype
Definition: qglobal.h:308
unsigned int uint
Definition: qglobal.h:334
unsigned short ushort
Definition: qglobal.h:333
@ text
#define NS(x)
Definition: qmetatype.cpp:98
GLenum GLuint GLintptr GLsizeiptr size
[1]
GLuint GLuint end
GLuint start
GLint first
GLenum GLsizei len
Definition: qopenglext.h:3292
GLuint64EXT * result
[6]
Definition: qopenglext.h:10932
GLdouble s
[6]
Definition: qopenglext.h:235
#define Q_ASSERT(cond)
Definition: qrandom.cpp:84
#define LIBTHAI_MAJOR
#define FLAG(x)
#define KHDEBUG
#define IDEBUG
QT_BEGIN_NAMESPACE Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only
#define tibetan_form(c)
#define MMDEBUG
Q_UNUSED(salary)
[21]
QGraphicsWidget * form
QList< QTreeWidgetItem * > items