The Tor Browser: intl/lwbrk/src/nsJISx4051LineBreaker.cpp@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

     2 /* This Source Code Form is subject to the terms of the Mozilla Public

     3  * License, v. 2.0. If a copy of the MPL was not distributed with this

     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     8 #include "nsJISx4051LineBreaker.h"

    10 #include "jisx4051class.h"

    11 #include "nsComplexBreaker.h"

    12 #include "nsTArray.h"

    14 /*

    16    Simplification of Pair Table in JIS X 4051

    18    1. The Original Table - in 4.1.3

    20    In JIS X 4051, the pair table is defined as below

    22    Class of

    23    Leading    Class of Trailing Char Class

    24    Char

    26               1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20

    27                                                  *  #  *  #

    28         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E

    29         2        X  X  X  X  X                                               X

    30         3        X  X  X  X  X                                               X

    31         4        X  X  X  X  X                                               X

    32         5        X  X  X  X  X                                               X

    33         6        X  X  X  X  X                                               X

    34         7        X  X  X  X  X  X                                            X

    35         8        X  X  X  X  X                                X              E

    36         9        X  X  X  X  X                                               X

    37        10        X  X  X  X  X                                               X

    38        11        X  X  X  X  X                                               X

    39        12        X  X  X  X  X                                               X

    40        13        X  X  X  X  X                    X                          X

    41        14        X  X  X  X  X                          X                    X

    42        15        X  X  X  X  X        X                       X        X     X

    43        16        X  X  X  X  X                                   X     X     X

    44        17        X  X  X  X  X                                               E

    45        18        X  X  X  X  X                                X  X     X     X

    46        19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E

    47        20        X  X  X  X  X                                               E

    49    * Same Char

    50    # Other Char

    52    X Cannot Break

    54    The classes mean:

    55       1: Open parenthesis

    56       2: Close parenthesis

    57       3: Prohibit a line break before

    58       4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")

    59       5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)

    60       6: Full stop

    61       7: Non-breakable between same characters

    62       8: Prefix (e.g., "$", "NO.")

    63       9: Postfix (e.g., "%")

    64      10: Ideographic space

    65      11: Hiragana

    66      12: Japanese characters (except class 11)

    67      13: Subscript

    68      14: Ruby

    69      15: Numeric

    70      16: Alphabet

    71      17: Space for Western language

    72      18: Western characters (except class 17)

    73      19: Split line note (Warichu) begin quote

    74      20: Split line note (Warichu) end quote

    76    2. Simplified by removing the classes which we do not care about

    78    However, since we do not care about class 13 (Subscript), 14 (Ruby),

    79    16 (Alphabet), 19 (split line note begin quote), and 20 (split line note

    80    end quote), we can simplify this pair table into the following

    82    Class of

    83    Leading    Class of Trailing Char Class

    84    Char

    86               1  2  3  4  5  6  7  8  9 10 11 12 15 17 18

    88         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X

    89         2        X  X  X  X  X

    90         3        X  X  X  X  X

    91         4        X  X  X  X  X

    92         5        X  X  X  X  X

    93         6        X  X  X  X  X

    94         7        X  X  X  X  X  X

    95         8        X  X  X  X  X                    X

    96         9        X  X  X  X  X

    97        10        X  X  X  X  X

    98        11        X  X  X  X  X

    99        12        X  X  X  X  X

   100        15        X  X  X  X  X        X           X     X

   101        17        X  X  X  X  X

   102        18        X  X  X  X  X                    X     X

   104    3. Simplified by merging classes

   106    After the simplification in step 2, the pair table has some duplication:

   107    a. classes 2, 3, 4, 5, 6 are the same - we can merge them

   108    b. classes 10, 11, 12, 17 are the same - we can merge them

   111    Class of

   112    Leading    Class of Trailing Char Class

   113    Char

   115               1 [a] 7  8  9 [b]15 18

   117         1     X  X  X  X  X  X  X  X

   118       [a]        X

   119         7        X  X

   120         8        X              X

   121         9        X

   122       [b]        X

   123        15        X        X     X  X

   124        18        X              X  X

   127    4. We add COMPLEX characters and make them breakable with all other classes

   128       except after class 1 and before class [a]

   130    Class of

   131    Leading    Class of Trailing Char Class

   132    Char

   134               1 [a] 7  8  9 [b]15 18 COMPLEX

   136         1     X  X  X  X  X  X  X  X  X

   137       [a]        X

   138         7        X  X

   139         8        X              X

   140         9        X

   141       [b]        X

   142        15        X        X     X  X

   143        18        X              X  X

   144   COMPLEX        X                    T

   146      T : need special handling

   149    5. However, we need two special classes for some punctuation/parentheses

   150       whose breaking rules are like those of character class (18); see bug 389056.

   151       We also need them for punctuation-like characters that behave the same as 18

   152       but are not letters in any language (e.g., '_').

   153       [c]. Based on open parenthesis class (1), but it is not breakable after

   154            character class (18) or numeric class (15).

   155       [d]. Based on close parenthesis (or punctuation) class (2), but it is not

   156            breakable before character class (18) or numeric class (15).

   158    Class of

   159    Leading    Class of Trailing Char Class

   160    Char

   162               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d]

   164         1     X  X  X  X  X  X  X  X  X       X    X

   165       [a]        X                            X    X

   166         7        X  X

   167         8        X              X

   168         9        X

   169       [b]        X                                 X

   170        15        X        X     X  X          X    X

   171        18        X              X  X          X    X

   172   COMPLEX        X                    T

   173       [c]     X  X  X  X  X  X  X  X  X       X    X

   174       [d]        X              X  X               X

   177    6. And Unicode has "NON-BREAK" characters. Lines should not be broken around

   178       them. But JIS X 4051 has no such class; therefore, we create [e].

   180    Class of

   181    Leading    Class of Trailing Char Class

   182    Char

   184               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]

   186         1     X  X  X  X  X  X  X  X  X       X    X   X

   187       [a]        X                                 X   X

   188         7        X  X                                  X

   189         8        X              X                      X

   190         9        X                                     X

   191       [b]        X                                 X   X

   192        15        X        X     X  X          X    X   X

   193        18        X              X  X          X    X   X

   194   COMPLEX        X                    T                X

   195       [c]     X  X  X  X  X  X  X  X  X       X    X   X

   196       [d]        X              X  X               X   X

   197       [e]     X  X  X  X  X  X  X  X  X       X    X   X

   200    7. Now we use one bit to encode whether it is breakable, and use 2 bytes

   201       for one row, then the bit table will look like:

   203                  18    <-   1

   205        1  0000 1111 1111 1111  = 0x0FFF

   206       [a] 0000 1100 0000 0010  = 0x0C02

   207        7  0000 1000 0000 0110  = 0x0806

   208        8  0000 1000 0100 0010  = 0x0842

   209        9  0000 1000 0000 0010  = 0x0802

   210       [b] 0000 1100 0000 0010  = 0x0C02

   211       15  0000 1110 1101 0010  = 0x0ED2

   212       18  0000 1110 1100 0010  = 0x0EC2

   213  COMPLEX  0000 1001 0000 0010  = 0x0902

   214       [c] 0000 1111 1111 1111  = 0x0FFF

   215       [d] 0000 1100 1100 0010  = 0x0CC2

   216       [e] 0000 1111 1111 1111  = 0x0FFF

   217 */

   219 #define MAX_CLASSES 12

   221 static const uint16_t gPair[MAX_CLASSES] = {

   222   0x0FFF,

   223   0x0C02,

   224   0x0806,

   225   0x0842,

   226   0x0802,

   227   0x0C02,

   228   0x0ED2,

   229   0x0EC2,

   230   0x0902,

   231   0x0FFF,

   232   0x0CC2,

   233   0x0FFF

   234 };

   237 /*

   239    8. And if the character is not far enough from word start, word end and

   240       another break point, we should not break in non-CJK languages.

   241       I.e., Don't break around 15, 18, [c] and [d], but don't change

   242       that if they are related to [b].

   244    Class of

   245    Leading    Class of Trailing Char Class

   246    Char

   248               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]

   250         1     X  X  X  X  X  X  X  X  X       X    X   X

   251       [a]        X              X  X          X    X   X

   252         7        X  X           X  X          X    X   X

   253         8        X              X  X          X    X   X

   254         9        X              X  X          X    X   X

   255       [b]        X                                 X   X

   256        15     X  X  X  X  X     X  X  X       X    X   X

   257        18     X  X  X  X  X     X  X  X       X    X   X

   258   COMPLEX        X              X  X  T       X    X   X

   259       [c]     X  X  X  X  X  X  X  X  X       X    X   X

   260       [d]     X  X  X  X  X     X  X  X       X    X   X

   261       [e]     X  X  X  X  X  X  X  X  X       X    X   X

   263                  18    <-   1

   265        1  0000 1111 1111 1111  = 0x0FFF

   266       [a] 0000 1110 1100 0010  = 0x0EC2

   267        7  0000 1110 1100 0110  = 0x0EC6

   268        8  0000 1110 1100 0010  = 0x0EC2

   269        9  0000 1110 1100 0010  = 0x0EC2

   270       [b] 0000 1100 0000 0010  = 0x0C02

   271       15  0000 1111 1101 1111  = 0x0FDF

   272       18  0000 1111 1101 1111  = 0x0FDF

   273  COMPLEX  0000 1111 1100 0010  = 0x0FC2

   274       [c] 0000 1111 1111 1111  = 0x0FFF

   275       [d] 0000 1111 1101 1111  = 0x0FDF

   276       [e] 0000 1111 1111 1111  = 0x0FFF

   277 */

   279 static const uint16_t gPairConservative[MAX_CLASSES] = {

   280   0x0FFF,

   281   0x0EC2,

   282   0x0EC6,

   283   0x0EC2,

   284   0x0EC2,

   285   0x0C02,

   286   0x0FDF,

   287   0x0FDF,

   288   0x0FC2,

   289   0x0FFF,

   290   0x0FDF,

   291   0x0FFF

   292 };

   295 /*

   297    9. Now we map the class to number

   299       0: 1

   300       1: [a]- 2, 3, 4, 5, 6

   301       2: 7

   302       3: 8

   303       4: 9

   304       5: [b]- 10, 11, 12, 17

   305       6: 15

   306       7: 18

   307       8: COMPLEX

   308       9: [c]

   309       A: [d]

   310       B: [e]

   312     and they mean:

   313       0: Open parenthesis

   314       1: Punctuation that prohibits break before

   315       2: Non-breakable between same classes

   316       3: Prefix

   317       4: Postfix

   318       5: Breakable character (Spaces and Most Japanese characters)

   319       6: Numeric

   320       7: Characters

   321       8: Need special handling characters (E.g., Thai)

   322       9: Open parentheses like Character (See bug 389056)

   323       A: Close parentheses (or punctuation) like Character (See bug 389056)

   324       B: Non breakable (See bug 390920)

   326 */

   328 #define CLASS_NONE                             INT8_MAX

   330 #define CLASS_OPEN                             0x00

   331 #define CLASS_CLOSE                            0x01

   332 #define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02

   333 #define CLASS_PREFIX                           0x03

   334 #define CLASS_POSTFFIX                         0x04

   335 #define CLASS_BREAKABLE                        0x05

   336 #define CLASS_NUMERIC                          0x06

   337 #define CLASS_CHARACTER                        0x07

   338 #define CLASS_COMPLEX                          0x08

   339 #define CLASS_OPEN_LIKE_CHARACTER              0x09

   340 #define CLASS_CLOSE_LIKE_CHARACTER             0x0A

   341 #define CLASS_NON_BREAKABLE                    0x0B

   343 #define U_NULL      char16_t(0x0000)

   344 #define U_SLASH     char16_t('/')

   345 #define U_SPACE     char16_t(' ')

   346 #define U_HYPHEN    char16_t('-')

   347 #define U_EQUAL     char16_t('=')

   348 #define U_PERCENT   char16_t('%')

   349 #define U_AMPERSAND char16_t('&')

   350 #define U_SEMICOLON char16_t(';')

   351 #define U_BACKSLASH char16_t('\\')

   352 #define U_OPEN_SINGLE_QUOTE char16_t(0x2018)

   353 #define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)

   354 #define U_OPEN_GUILLEMET    char16_t(0x00AB)

   356 #define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \

   357                                      (c) == U_SLASH || \

   358                                      (c) == U_PERCENT || \

   359                                      (c) == U_AMPERSAND || \

   360                                      (c) == U_SEMICOLON || \

   361                                      (c) == U_BACKSLASH || \

   362                                      (c) == U_OPEN_SINGLE_QUOTE || \

   363                                      (c) == U_OPEN_DOUBLE_QUOTE || \

   364                                      (c) == U_OPEN_GUILLEMET)

   366 #define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)

   368 static inline int

   369 GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)

   370 {

   371   return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);

   372 }

   374 static inline int

   375 IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)

   376 {

   377   return ((0xff66 <= (u)) && ((u) <= 0xff70));

   378 }

   380 static inline int

   381 IS_CJK_CHAR(char16_t u)

   382 {

   383   return ((0x1100 <= (u) && (u) <= 0x11ff) ||

   384           (0x2e80 <= (u) && (u) <= 0xd7ff) ||

   385           (0xf900 <= (u) && (u) <= 0xfaff) ||

   386           (0xff00 <= (u) && (u) <= 0xffef) );

   387 }

   389 static inline bool

   390 IS_NONBREAKABLE_SPACE(char16_t u)

   391 {

   392   return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE

   393 }

   395 static inline bool

   396 IS_HYPHEN(char16_t u)

   397 {

   398   return (u == U_HYPHEN ||

   399           u == 0x058A || // ARMENIAN HYPHEN

   400           u == 0x2010 || // HYPHEN

   401           u == 0x2012 || // FIGURE DASH

   402           u == 0x2013);  // EN DASH

   403 }

   405 static int8_t

   406 GetClass(char16_t u)

   407 {

   408    uint16_t h = u & 0xFF00;

   409    uint16_t l = u & 0x00ff;

   410    int8_t c;

   412    // Handle 3 range table first

   413    if (0x0000 == h) {

   414      c = GETCLASSFROMTABLE(gLBClass00, l);

   415    } else if (0x1700 == h) {

   416      c = GETCLASSFROMTABLE(gLBClass17, l);

   417    } else if (NS_NeedsPlatformNativeHandling(u)) {

   418      c = CLASS_COMPLEX;

   419    } else if (0x0E00 == h) {

   420      c = GETCLASSFROMTABLE(gLBClass0E, l);

   421    } else if (0x2000 == h) {

   422      c = GETCLASSFROMTABLE(gLBClass20, l);

   423    } else if (0x2100 == h) {

   424      c = GETCLASSFROMTABLE(gLBClass21, l);

   425    } else if (0x3000 == h) {

   426      c = GETCLASSFROMTABLE(gLBClass30, l);

   427    } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi

   428               ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul

   429               ((0xf900 <= h) && (h <= 0xfaff))) {

   430      c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility

   431    } else if (0xff00 == h) {

   432      if (l < 0x0060) { // Fullwidth ASCII variant

   433        c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));

   434      } else if (l < 0x00a0) {

   435        switch (l) {

   436          case 0x61: c = GetClass(0x3002); break;

   437          case 0x62: c = GetClass(0x300c); break;

   438          case 0x63: c = GetClass(0x300d); break;

   439          case 0x64: c = GetClass(0x3001); break;

   440          case 0x65: c = GetClass(0x30fb); break;

   441          case 0x9e: c = GetClass(0x309b); break;

   442          case 0x9f: c = GetClass(0x309c); break;

   443          default:

   444            if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u))

   445               c = CLASS_CLOSE; // jis x4051 class 3

   446            else

   447               c = CLASS_BREAKABLE; // jis x4051 class 11

   448            break;

   449        }

   450      // Halfwidth Katakana variants

   451      } else if (l < 0x00e0) {

   452        c = CLASS_CHARACTER; // Halfwidth Hangul variants

   453      } else if (l < 0x00f0) {

   454        static char16_t NarrowFFEx[16] = {

   455          0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,

   456          0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000

   457        };

   458        c = GetClass(NarrowFFEx[l - 0x00e0]);

   459      } else {

   460        c = CLASS_CHARACTER;

   461      }

   462    } else if (0x3100 == h) {

   463      if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun

   464                       // XXX: This is per UAX #14, but UAX #14 may change

   465                       // the line breaking rules about Kanbun and Bopomofo.

   466        c = CLASS_BREAKABLE;

   467      } else if (l >= 0xf0) { // Katakana small letters for Ainu

   468        c = CLASS_CLOSE;

   469      } else { // unassigned

   470        c = CLASS_CHARACTER;

   471      }

   472    } else if (0x0300 == h) {

   473      if (0x4F == l || (0x5C <= l && l <= 0x62))

   474        c = CLASS_NON_BREAKABLE;

   475      else

   476        c = CLASS_CHARACTER;

   477    } else if (0x0500 == h) {

   478      // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)

   479      if (l == 0x8A)

   480        c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));

   481      else

   482        c = CLASS_CHARACTER;

   483    } else if (0x0F00 == h) {

   484      if (0x08 == l || 0x0C == l || 0x12 == l)

   485        c = CLASS_NON_BREAKABLE;

   486      else

   487        c = CLASS_CHARACTER;

   488    } else if (0x1800 == h) {

   489      if (0x0E == l)

   490        c = CLASS_NON_BREAKABLE;

   491      else

   492        c = CLASS_CHARACTER;

   493    } else if (0x1600 == h) {

   494      if (0x80 == l) { // U+1680 OGHAM SPACE MARK

   495        c = CLASS_BREAKABLE;

   496      } else {

   497        c = CLASS_CHARACTER;

   498      }

   499    } else if (u == 0xfeff) {

   500      c = CLASS_NON_BREAKABLE;

   501    } else {

   502      c = CLASS_CHARACTER; // others

   503    }

   504    return c;

   505 }

   507 static bool

   508 GetPair(int8_t c1, int8_t c2)

   509 {

   510   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");

   511   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");

   513   return (0 == ((gPair[c1] >> c2) & 0x0001));

   514 }

   516 static bool

   517 GetPairConservative(int8_t c1, int8_t c2)

   518 {

   519   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");

   520   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");

   522   return (0 == ((gPairConservative[c1] >> c2) & 0x0001));

   523 }

   525 nsJISx4051LineBreaker::nsJISx4051LineBreaker()

   526 {

   527 }

   529 nsJISx4051LineBreaker::~nsJISx4051LineBreaker()

   530 {

   531 }

   533 NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)

   535 class ContextState {

   536 public:

   537   ContextState(const char16_t* aText, uint32_t aLength) {

   538     mUniText = aText;

   539     mText = nullptr;

   540     mLength = aLength;

   541     Init();

   542   }

   544   ContextState(const uint8_t* aText, uint32_t aLength) {

   545     mUniText = nullptr;

   546     mText = aText;

   547     mLength = aLength;

   548     Init();

   549   }

   551   uint32_t Length() { return mLength; }

   552   uint32_t Index() { return mIndex; }

   554   char16_t GetCharAt(uint32_t aIndex) {

   555     NS_ASSERTION(aIndex < mLength, "Out of range!");

   556     return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);

   557   }

   559   void AdvanceIndex() {

   560     ++mIndex;

   561   }

   563   void NotifyBreakBefore() { mLastBreakIndex = mIndex; }

   565 // A word of western language should not be broken. But even if the word has

   566 // only ASCII characters, non-natural context words should be broken, e.g.,

   567 // URL and file path. For protecting the natural words, we should use

   568 // conservative breaking rules at following conditions:

   569 //   1. at near the start of word

   570 //   2. at near the end of word

   571 //   3. at near the latest broken point

   572 // CONSERVATIVE_BREAK_RANGE define the 'near' in characters.

   573 #define CONSERVATIVE_BREAK_RANGE 6

   575   bool UseConservativeBreaking(uint32_t aOffset = 0) {

   576     if (mHasCJKChar)

   577       return false;

   578     uint32_t index = mIndex + aOffset;

   579     bool result = (index < CONSERVATIVE_BREAK_RANGE ||

   580                      mLength - index < CONSERVATIVE_BREAK_RANGE ||

   581                      index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);

   582     if (result || !mHasNonbreakableSpace)

   583       return result;

   585     // This text has no-breakable space, we need to check whether the index

   586     // is near it.

   588     // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.

   589     for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {

   590       if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))

   591         return true;

   592     }

   593     // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.

   594     for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {

   595       if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))

   596         return true;

   597     }

   598     return false;

   599   }

   601   bool HasPreviousEqualsSign() const {

   602     return mHasPreviousEqualsSign;

   603   }

   604   void NotifySeenEqualsSign() {

   605     mHasPreviousEqualsSign = true;

   606   }

   608   bool HasPreviousSlash() const {

   609     return mHasPreviousSlash;

   610   }

   611   void NotifySeenSlash() {

   612     mHasPreviousSlash = true;

   613   }

   615   bool HasPreviousBackslash() const {

   616     return mHasPreviousBackslash;

   617   }

   618   void NotifySeenBackslash() {

   619     mHasPreviousBackslash = true;

   620   }

   622   char16_t GetPreviousNonHyphenCharacter() const {

   623     return mPreviousNonHyphenCharacter;

   624   }

   625   void NotifyNonHyphenCharacter(char16_t ch) {

   626     mPreviousNonHyphenCharacter = ch;

   627   }

   629 private:

   630   void Init() {

   631     mIndex = 0;

   632     mLastBreakIndex = 0;

   633     mPreviousNonHyphenCharacter = U_NULL;

   634     mHasCJKChar = 0;

   635     mHasNonbreakableSpace = 0;

   636     mHasPreviousEqualsSign = false;

   637     mHasPreviousSlash = false;

   638     mHasPreviousBackslash = false;

   640     for (uint32_t i = 0; i < mLength; ++i) {

   641       char16_t u = GetCharAt(i);

   642       if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))

   643         mHasNonbreakableSpace = 1;

   644       else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))

   645         mHasCJKChar = 1;

   646     }

   647   }

   649   const char16_t* mUniText;

   650   const uint8_t* mText;

   652   uint32_t mIndex;

   653   uint32_t mLength;         // length of text

   654   uint32_t mLastBreakIndex;

   655   char16_t mPreviousNonHyphenCharacter; // The last character we have seen

   656                                          // which is not U_HYPHEN

   657   bool mHasCJKChar; // if the text has CJK character, this is true.

   658   bool mHasNonbreakableSpace; // if the text has no-breakable space,

   659                                      // this is true.

   660   bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL

   661   bool mHasPreviousSlash;      // True if we have seen a U_SLASH

   662   bool mHasPreviousBackslash;  // True if we have seen a U_BACKSLASH

   663 };

   665 static int8_t

   666 ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,

   667                    ContextState &aState)

   668 {

   669   // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.

   671   if (IS_HYPHEN(cur)) {

   672     // If next character is hyphen, we don't need to break between them.

   673     if (IS_HYPHEN(next))

   674       return CLASS_CHARACTER;

   675     // If prev and next characters are numeric, it may be in Math context.

   676     // So, we should not break here.

   677     bool prevIsNum = IS_ASCII_DIGIT(prev);

   678     bool nextIsNum = IS_ASCII_DIGIT(next);

   679     if (prevIsNum && nextIsNum)

   680       return CLASS_NUMERIC;

   681     // If one side is numeric and the other is a character, or if both sides are

   682     // characters, the hyphen should be breakable.

   683     if (!aState.UseConservativeBreaking(1)) {

   684       char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();

   685       if (prevOfHyphen && next) {

   686         int8_t prevClass = GetClass(prevOfHyphen);

   687         int8_t nextClass = GetClass(next);

   688         bool prevIsNumOrCharOrClose =

   689           prevIsNum ||

   690           (prevClass == CLASS_CHARACTER &&

   691             !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||

   692           prevClass == CLASS_CLOSE ||

   693           prevClass == CLASS_CLOSE_LIKE_CHARACTER;

   694         bool nextIsNumOrCharOrOpen =

   695           nextIsNum ||

   696           (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||

   697           nextClass == CLASS_OPEN ||

   698           nextClass == CLASS_OPEN_LIKE_CHARACTER ||

   699           next == U_OPEN_SINGLE_QUOTE ||

   700           next == U_OPEN_DOUBLE_QUOTE ||

   701           next == U_OPEN_GUILLEMET;

   702         if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {

   703           return CLASS_CLOSE;

   704         }

   705       }

   706     }

   707   } else {

   708     aState.NotifyNonHyphenCharacter(cur);

   709     if (cur == U_SLASH || cur == U_BACKSLASH) {

   710       // If this is immediately after same char, we should not break here.

   711       if (prev == cur)

   712         return CLASS_CHARACTER;

   713       // If this text has two or more (BACK)SLASHs, this may be file path or URL.

   714       // Make sure to compute shouldReturn before we notify on this slash.

   715       bool shouldReturn = !aState.UseConservativeBreaking() &&

   716         (cur == U_SLASH ?

   717          aState.HasPreviousSlash() : aState.HasPreviousBackslash());

   719       if (cur == U_SLASH) {

   720         aState.NotifySeenSlash();

   721       } else {

   722         aState.NotifySeenBackslash();

   723       }

   725       if (shouldReturn)

   726         return CLASS_OPEN;

   727     } else if (cur == U_PERCENT) {

   728       // If this is a part of the param of URL, we should break before.

   729       if (!aState.UseConservativeBreaking()) {

   730         if (aState.Index() >= 3 &&

   731             aState.GetCharAt(aState.Index() - 3) == U_PERCENT)

   732           return CLASS_OPEN;

   733         if (aState.Index() + 3 < aState.Length() &&

   734             aState.GetCharAt(aState.Index() + 3) == U_PERCENT)

   735           return CLASS_OPEN;

   736       }

   737     } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {

   738       // If this may be a separator of params of URL, we should break after.

   739       if (!aState.UseConservativeBreaking(1) &&

   740           aState.HasPreviousEqualsSign())

   741         return CLASS_CLOSE;

   742     } else if (cur == U_OPEN_SINGLE_QUOTE ||

   743                cur == U_OPEN_DOUBLE_QUOTE ||

   744                cur == U_OPEN_GUILLEMET) {

   745       // for CJK usage, we treat these as openers to allow a break before them,

   746       // but otherwise treat them as normal characters because quote mark usage

   747       // in various Western languages varies too much; see bug #450088 discussion.

   748       if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))

   749         return CLASS_OPEN;

   750     } else {

   751       NS_ERROR("Forgot to handle the current character!");

   752     }

   753   }

   754   return GetClass(cur);

   755 }

   758 int32_t

   759 nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,

   760                                 uint32_t aPos, int8_t aDirection)

   761 {

   762   bool    textNeedsJISx4051 = false;

   763   int32_t begin, end;

   765   for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {

   766     if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {

   767       textNeedsJISx4051 = true;

   768     }

   769   }

   770   for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {

   771     if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {

   772       textNeedsJISx4051 = true;

   773     }

   774   }

   776   int32_t ret;

   777   nsAutoTArray<uint8_t, 2000> breakState;

   778   if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {

   779     // No complex text character, do not try to do complex line break.

   780     // (This is required for serializers. See Bug #344816.)

   781     // Also fall back to this when out of memory.

   782     if (aDirection < 0) {

   783       ret = (begin == int32_t(aPos)) ? begin - 1 : begin;

   784     } else {

   785       ret = end;

   786     }

   787   } else {

   788     GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,

   789                       breakState.Elements());

   791     ret = aPos;

   792     do {

   793       ret += aDirection;

   794     } while (begin < ret && ret < end && !breakState[ret - begin]);

   795   }

   797   return ret;

   798 }

   800 int32_t

   801 nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,

   802                             uint32_t aPos)

   803 {

   804   NS_ASSERTION(aText, "aText shouldn't be null");

   805   NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");

   807   int32_t nextPos = WordMove(aText, aLen, aPos, 1);

   808   return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;

   809 }

   811 int32_t

   812 nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,

   813                             uint32_t aPos)

   814 {

   815   NS_ASSERTION(aText, "aText shouldn't be null");

   816   NS_ASSERTION(aLen >= aPos && aPos > 0,

   817                "Bad position passed to nsJISx4051LineBreaker::Prev");

   819   int32_t prevPos = WordMove(aText, aLen, aPos, -1);

   820   return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;

   821 }

   823 void

   824 nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,

   825                                          uint8_t aWordBreak,

   826                                          uint8_t* aBreakBefore)

   827 {

   828   uint32_t cur;

   829   int8_t lastClass = CLASS_NONE;

   830   ContextState state(aChars, aLength);

   832   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {

   833     char16_t ch = aChars[cur];

   834     int8_t cl;

   836     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {

   837       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,

   838                               ch,

   839                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,

   840                               state);

   841     } else {

   842       if (ch == U_EQUAL)

   843         state.NotifySeenEqualsSign();

   844       state.NotifyNonHyphenCharacter(ch);

   845       cl = GetClass(ch);

   846     }

   848     bool allowBreak = false;

   849     if (cur > 0) {

   850       NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,

   851                    "Loop should have prevented adjacent complex chars here");

   852       if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {

   853         allowBreak = (state.UseConservativeBreaking()) ?

   854           GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);

   855       } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {

   856         allowBreak = true;

   857       }

   858     }

   859     aBreakBefore[cur] = allowBreak;

   860     if (allowBreak)

   861       state.NotifyBreakBefore();

   862     lastClass = cl;

   863     if (CLASS_COMPLEX == cl) {

   864       uint32_t end = cur + 1;

   866       while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {

   867         ++end;

   868       }

   870       NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);

   872       // We have to consider word-break value again for complex characters

   873       if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {

   874         // Respect word-break property

   875         for (uint32_t i = cur; i < end; i++)

   876           aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);

   877       }

   879       // restore breakability at chunk begin, which was always set to false

   880       // by the complex line breaker

   881       aBreakBefore[cur] = allowBreak;

   883       cur = end - 1;

   884     }

   885   }

   886 }

   888 void

   889 nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,

   890                                          uint8_t aWordBreak,

   891                                          uint8_t* aBreakBefore)

   892 {

   893   uint32_t cur;

   894   int8_t lastClass = CLASS_NONE;

   895   ContextState state(aChars, aLength);

   897   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {

   898     char16_t ch = aChars[cur];

   899     int8_t cl;

   901     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {

   902       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,

   903                               ch,

   904                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,

   905                               state);

   906     } else {

   907       if (ch == U_EQUAL)

   908         state.NotifySeenEqualsSign();

   909       state.NotifyNonHyphenCharacter(ch);

   910       cl = GetClass(ch);

   911     }

   913     bool allowBreak = false;

   914     if (cur > 0) {

   915       if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {

   916         allowBreak = (state.UseConservativeBreaking()) ?

   917           GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);

   918       } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {

   919         allowBreak = true;

   920       }

   921     }

   922     aBreakBefore[cur] = allowBreak;

   923     if (allowBreak)

   924       state.NotifyBreakBefore();

   925     lastClass = cl;

   926   }

   927 }

The Tor Browser / file revision

intl/lwbrk/src/nsJISx4051LineBreaker.cpp@b8a032363ba2

intl/lwbrk/src/nsJISx4051LineBreaker.cpp