intl/lwbrk/src/nsJISx4051LineBreaker.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     8 #include "nsJISx4051LineBreaker.h"
    10 #include "jisx4051class.h"
    11 #include "nsComplexBreaker.h"
    12 #include "nsTArray.h"
    14 /* 
    16    Simplification of Pair Table in JIS X 4051
    18    1. The Original Table - in 4.1.3
    20    In JIS x 4051. The pair table is defined as below
    22    Class of
    23    Leading    Class of Trailing Char Class
    24    Char        
    26               1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
    27                                                  *  #  *  #
    28         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
    29         2        X  X  X  X  X                                               X
    30         3        X  X  X  X  X                                               X
    31         4        X  X  X  X  X                                               X
    32         5        X  X  X  X  X                                               X
    33         6        X  X  X  X  X                                               X
    34         7        X  X  X  X  X  X                                            X
    35         8        X  X  X  X  X                                X              E
    36         9        X  X  X  X  X                                               X
    37        10        X  X  X  X  X                                               X
    38        11        X  X  X  X  X                                               X
    39        12        X  X  X  X  X                                               X
    40        13        X  X  X  X  X                    X                          X
    41        14        X  X  X  X  X                          X                    X
    42        15        X  X  X  X  X        X                       X        X     X
    43        16        X  X  X  X  X                                   X     X     X
    44        17        X  X  X  X  X                                               E
    45        18        X  X  X  X  X                                X  X     X     X
    46        19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
    47        20        X  X  X  X  X                                               E
    49    * Same Char
    50    # Other Char
    52    X Cannot Break
    54    The classes mean:
    55       1: Open parenthesis
    56       2: Close parenthesis
    57       3: Prohibit a line break before
    58       4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
    59       5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
    60       6: Full stop
    61       7: Non-breakable between same characters
    62       8: Prefix (e.g., "$", "NO.")
    63       9: Postfix (e.g., "%")
    64      10: Ideographic space
    65      11: Hiragana
    66      12: Japanese characters (except class 11)
    67      13: Subscript
    68      14: Ruby
    69      15: Numeric
    70      16: Alphabet
    71      17: Space for Western language
    72      18: Western characters (except class 17)
    73      19: Split line note (Warichu) begin quote
    74      20: Split line note (Warichu) end quote
    76    2. Simplified by removing the classes which we do not care about
    78    However, since we do not care about class 13 (Subscript), 14 (Ruby),
    79    16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
    80    quote), we can simplify this pair table into the following
    82    Class of
    83    Leading    Class of Trailing Char Class
    84    Char
    86               1  2  3  4  5  6  7  8  9 10 11 12 15 17 18
    88         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
    89         2        X  X  X  X  X                           
    90         3        X  X  X  X  X                           
    91         4        X  X  X  X  X                           
    92         5        X  X  X  X  X                           
    93         6        X  X  X  X  X                           
    94         7        X  X  X  X  X  X                        
    95         8        X  X  X  X  X                    X      
    96         9        X  X  X  X  X                           
    97        10        X  X  X  X  X                           
    98        11        X  X  X  X  X                           
    99        12        X  X  X  X  X                           
   100        15        X  X  X  X  X        X           X     X
   101        17        X  X  X  X  X                           
   102        18        X  X  X  X  X                    X     X
   104    3. Simplified by merging classes
   106    After the second simplification, the pair table has some duplication:
   107    a. classes 2, 3, 4, 5, 6 are the same - we can merge them
   108    b. classes 10, 11, 12, 17 are the same - we can merge them
   111    Class of
   112    Leading    Class of Trailing Char Class
   113    Char
   115               1 [a] 7  8  9 [b]15 18
   117         1     X  X  X  X  X  X  X  X
   118       [a]        X                  
   119         7        X  X               
   120         8        X              X   
   121         9        X                  
   122       [b]        X                  
   123        15        X        X     X  X
   124        18        X              X  X
   127    4. We add COMPLEX characters and make them breakable with all other classes
   128       except after class 1 and before class [a]
   130    Class of
   131    Leading    Class of Trailing Char Class
   132    Char
   134               1 [a] 7  8  9 [b]15 18 COMPLEX
   136         1     X  X  X  X  X  X  X  X  X
   137       [a]        X                     
   138         7        X  X                  
   139         8        X              X      
   140         9        X                     
   141       [b]        X                     
   142        15        X        X     X  X   
   143        18        X              X  X   
   144   COMPLEX        X                    T
   146      T : need special handling
   149    5. However, we need two special classes for some punctuation/parentheses
   150       whose breaking rules are like character class (18); see bug 389056.
   151       We also need punctuation-like characters that behave the same as 18
   152       but are not letters of any language (e.g., '_').
   153       [c]. Based on open parenthesis class (1), but it is not breakable after
   154            character class (18) or numeric class (15).
   155       [d]. Based on close parenthesis (or punctuation) class (2), but it is not
   156            breakable before character class (18) or numeric class (15).
   158    Class of
   159    Leading    Class of Trailing Char Class
   160    Char
   162               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d]
   164         1     X  X  X  X  X  X  X  X  X       X    X
   165       [a]        X                            X    X
   166         7        X  X                               
   167         8        X              X                   
   168         9        X                                  
   169       [b]        X                                 X
   170        15        X        X     X  X          X    X
   171        18        X              X  X          X    X
   172   COMPLEX        X                    T             
   173       [c]     X  X  X  X  X  X  X  X  X       X    X
   174       [d]        X              X  X               X
   177    6. And Unicode has "NON-BREAK" characters. Lines should not be broken around
   178       them. JIS X 4051 has no such class, therefore we create [e].
   180    Class of
   181    Leading    Class of Trailing Char Class
   182    Char
   184               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
   186         1     X  X  X  X  X  X  X  X  X       X    X   X
   187       [a]        X                                 X   X
   188         7        X  X                                  X
   189         8        X              X                      X
   190         9        X                                     X
   191       [b]        X                                 X   X
   192        15        X        X     X  X          X    X   X
   193        18        X              X  X          X    X   X
   194   COMPLEX        X                    T                X
   195       [c]     X  X  X  X  X  X  X  X  X       X    X   X
   196       [d]        X              X  X               X   X
   197       [e]     X  X  X  X  X  X  X  X  X       X    X   X
   200    7. Now we use one bit to encode whether it is breakable, and use 2 bytes
   201       for one row, then the bit table will look like:
   203                  18    <-   1
   205        1  0000 1111 1111 1111  = 0x0FFF
   206       [a] 0000 1100 0000 0010  = 0x0C02
   207        7  0000 1000 0000 0110  = 0x0806
   208        8  0000 1000 0100 0010  = 0x0842
   209        9  0000 1000 0000 0010  = 0x0802
   210       [b] 0000 1100 0000 0010  = 0x0C02
   211       15  0000 1110 1101 0010  = 0x0ED2
   212       18  0000 1110 1100 0010  = 0x0EC2
   213  COMPLEX  0000 1001 0000 0010  = 0x0902
   214       [c] 0000 1111 1111 1111  = 0x0FFF
   215       [d] 0000 1100 1100 0010  = 0x0CC2
   216       [e] 0000 1111 1111 1111  = 0x0FFF
   217 */
   219 #define MAX_CLASSES 12
   221 static const uint16_t gPair[MAX_CLASSES] = {
   222   0x0FFF,
   223   0x0C02,
   224   0x0806,
   225   0x0842,
   226   0x0802,
   227   0x0C02,
   228   0x0ED2,
   229   0x0EC2,
   230   0x0902,
   231   0x0FFF,
   232   0x0CC2,
   233   0x0FFF
   234 };
   237 /*
   239    8. And if the character is not far enough from the word start, the word end,
   240       or another break point, we should not break in non-CJK languages.
   241       I.e., don't break around 15, 18, [c] and [d], but don't change
   242       that if they are related to [b].
   244    Class of
   245    Leading    Class of Trailing Char Class
   246    Char
   248               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
   250         1     X  X  X  X  X  X  X  X  X       X    X   X
   251       [a]        X              X  X          X    X   X
   252         7        X  X           X  X          X    X   X
   253         8        X              X  X          X    X   X
   254         9        X              X  X          X    X   X
   255       [b]        X                                 X   X
   256        15     X  X  X  X  X     X  X  X       X    X   X
   257        18     X  X  X  X  X     X  X  X       X    X   X
   258   COMPLEX        X              X  X  T       X    X   X
   259       [c]     X  X  X  X  X  X  X  X  X       X    X   X
   260       [d]     X  X  X  X  X     X  X  X       X    X   X
   261       [e]     X  X  X  X  X  X  X  X  X       X    X   X
   263                  18    <-   1
   265        1  0000 1111 1111 1111  = 0x0FFF
   266       [a] 0000 1110 1100 0010  = 0x0EC2
   267        7  0000 1110 1100 0110  = 0x0EC6
   268        8  0000 1110 1100 0010  = 0x0EC2
   269        9  0000 1110 1100 0010  = 0x0EC2
   270       [b] 0000 1100 0000 0010  = 0x0C02
   271       15  0000 1111 1101 1111  = 0x0FDF
   272       18  0000 1111 1101 1111  = 0x0FDF
   273  COMPLEX  0000 1111 1100 0010  = 0x0FC2
   274       [c] 0000 1111 1111 1111  = 0x0FFF
   275       [d] 0000 1111 1101 1111  = 0x0FDF
   276       [e] 0000 1111 1111 1111  = 0x0FFF
   277 */
   279 static const uint16_t gPairConservative[MAX_CLASSES] = {
   280   0x0FFF,
   281   0x0EC2,
   282   0x0EC6,
   283   0x0EC2,
   284   0x0EC2,
   285   0x0C02,
   286   0x0FDF,
   287   0x0FDF,
   288   0x0FC2,
   289   0x0FFF,
   290   0x0FDF,
   291   0x0FFF
   292 };
   295 /*
   297    9. Now we map the class to number
   299       0: 1 
   300       1: [a]- 2, 3, 4, 5, 6
   301       2: 7
   302       3: 8
   303       4: 9
   304       5: [b]- 10, 11, 12, 17
   305       6: 15
   306       7: 18
   307       8: COMPLEX
   308       9: [c]
   309       A: [d]
   310       B: [e]
   312     and they mean:
   313       0: Open parenthesis
   314       1: Punctuation that prohibits break before
   315       2: Non-breakable between same classes
   316       3: Prefix
   317       4: Postfix
   318       5: Breakable character (Spaces and Most Japanese characters)
   319       6: Numeric
   320       7: Characters
   321       8: Need special handling characters (E.g., Thai)
   322       9: Open parentheses like Character (See bug 389056)
   323       A: Close parentheses (or punctuations) like Character (See bug 389056)
   324       B: Non breakable (See bug 390920)
   326 */
   328 #define CLASS_NONE                             INT8_MAX
   330 #define CLASS_OPEN                             0x00
   331 #define CLASS_CLOSE                            0x01
   332 #define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
   333 #define CLASS_PREFIX                           0x03
   334 #define CLASS_POSTFFIX                         0x04
   335 #define CLASS_BREAKABLE                        0x05
   336 #define CLASS_NUMERIC                          0x06
   337 #define CLASS_CHARACTER                        0x07
   338 #define CLASS_COMPLEX                          0x08
   339 #define CLASS_OPEN_LIKE_CHARACTER              0x09
   340 #define CLASS_CLOSE_LIKE_CHARACTER             0x0A
   341 #define CLASS_NON_BREAKABLE                    0x0B
   343 #define U_NULL      char16_t(0x0000)
   344 #define U_SLASH     char16_t('/')
   345 #define U_SPACE     char16_t(' ')
   346 #define U_HYPHEN    char16_t('-')
   347 #define U_EQUAL     char16_t('=')
   348 #define U_PERCENT   char16_t('%')
   349 #define U_AMPERSAND char16_t('&')
   350 #define U_SEMICOLON char16_t(';')
   351 #define U_BACKSLASH char16_t('\\')
   352 #define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
   353 #define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
   354 #define U_OPEN_GUILLEMET    char16_t(0x00AB)
   356 #define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \
   357                                      (c) == U_SLASH || \
   358                                      (c) == U_PERCENT || \
   359                                      (c) == U_AMPERSAND || \
   360                                      (c) == U_SEMICOLON || \
   361                                      (c) == U_BACKSLASH || \
   362                                      (c) == U_OPEN_SINGLE_QUOTE || \
   363                                      (c) == U_OPEN_DOUBLE_QUOTE || \
   364                                      (c) == U_OPEN_GUILLEMET)
   366 #define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
   368 static inline int
   369 GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
   370 {
   371   return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);
   372 }
   374 static inline int
   375 IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
   376 {
   377   return ((0xff66 <= (u)) && ((u) <= 0xff70));
   378 }
   380 static inline int
   381 IS_CJK_CHAR(char16_t u)
   382 {
   383   return ((0x1100 <= (u) && (u) <= 0x11ff) ||
   384           (0x2e80 <= (u) && (u) <= 0xd7ff) ||
   385           (0xf900 <= (u) && (u) <= 0xfaff) ||
   386           (0xff00 <= (u) && (u) <= 0xffef) );
   387 }
   389 static inline bool
   390 IS_NONBREAKABLE_SPACE(char16_t u)
   391 {
   392   return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
   393 }
   395 static inline bool
   396 IS_HYPHEN(char16_t u)
   397 {
   398   return (u == U_HYPHEN ||
   399           u == 0x058A || // ARMENIAN HYPHEN
   400           u == 0x2010 || // HYPHEN
   401           u == 0x2012 || // FIGURE DASH
   402           u == 0x2013);  // EN DASH
   403 }
   405 static int8_t
   406 GetClass(char16_t u)
   407 {
   408    uint16_t h = u & 0xFF00;
   409    uint16_t l = u & 0x00ff;
   410    int8_t c;
   412    // Handle 3 range table first
   413    if (0x0000 == h) {
   414      c = GETCLASSFROMTABLE(gLBClass00, l);
   415    } else if (0x1700 == h) {
   416      c = GETCLASSFROMTABLE(gLBClass17, l);
   417    } else if (NS_NeedsPlatformNativeHandling(u)) {
   418      c = CLASS_COMPLEX;
   419    } else if (0x0E00 == h) {
   420      c = GETCLASSFROMTABLE(gLBClass0E, l);
   421    } else if (0x2000 == h) {
   422      c = GETCLASSFROMTABLE(gLBClass20, l);
   423    } else if (0x2100 == h) {
   424      c = GETCLASSFROMTABLE(gLBClass21, l);
   425    } else if (0x3000 == h) {
   426      c = GETCLASSFROMTABLE(gLBClass30, l);
   427    } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi
   428               ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul
   429               ((0xf900 <= h) && (h <= 0xfaff))) {
   430      c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility
   431    } else if (0xff00 == h) {
   432      if (l < 0x0060) { // Fullwidth ASCII variant
   433        c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));
   434      } else if (l < 0x00a0) {
   435        switch (l) {
   436          case 0x61: c = GetClass(0x3002); break;
   437          case 0x62: c = GetClass(0x300c); break;
   438          case 0x63: c = GetClass(0x300d); break;
   439          case 0x64: c = GetClass(0x3001); break;
   440          case 0x65: c = GetClass(0x30fb); break;
   441          case 0x9e: c = GetClass(0x309b); break;
   442          case 0x9f: c = GetClass(0x309c); break;
   443          default:
   444            if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
   445               c = CLASS_CLOSE; // jis x4051 class 3
   446            else
   447               c = CLASS_BREAKABLE; // jis x4051 class 11
   448            break;
   449        }
   450      // Halfwidth Katakana variants
   451      } else if (l < 0x00e0) {
   452        c = CLASS_CHARACTER; // Halfwidth Hangul variants
   453      } else if (l < 0x00f0) {
   454        static char16_t NarrowFFEx[16] = {
   455          0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
   456          0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
   457        };
   458        c = GetClass(NarrowFFEx[l - 0x00e0]);
   459      } else {
   460        c = CLASS_CHARACTER;
   461      }
   462    } else if (0x3100 == h) { 
   463      if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
   464                       // XXX: This is per UAX #14, but UAX #14 may change
   465                       // the line breaking rules about Kanbun and Bopomofo.
   466        c = CLASS_BREAKABLE;
   467      } else if (l >= 0xf0) { // Katakana small letters for Ainu
   468        c = CLASS_CLOSE;
   469      } else { // unassigned
   470        c = CLASS_CHARACTER;
   471      }
   472    } else if (0x0300 == h) {
   473      if (0x4F == l || (0x5C <= l && l <= 0x62))
   474        c = CLASS_NON_BREAKABLE;
   475      else
   476        c = CLASS_CHARACTER;
   477    } else if (0x0500 == h) {
   478      // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
   479      if (l == 0x8A)
   480        c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
   481      else
   482        c = CLASS_CHARACTER;
   483    } else if (0x0F00 == h) {
   484      if (0x08 == l || 0x0C == l || 0x12 == l)
   485        c = CLASS_NON_BREAKABLE;
   486      else
   487        c = CLASS_CHARACTER;
   488    } else if (0x1800 == h) {
   489      if (0x0E == l)
   490        c = CLASS_NON_BREAKABLE;
   491      else
   492        c = CLASS_CHARACTER;
   493    } else if (0x1600 == h) {
   494      if (0x80 == l) { // U+1680 OGHAM SPACE MARK
   495        c = CLASS_BREAKABLE;
   496      } else {
   497        c = CLASS_CHARACTER;
   498      }
   499    } else if (u == 0xfeff) {
   500      c = CLASS_NON_BREAKABLE;
   501    } else {
   502      c = CLASS_CHARACTER; // others
   503    }
   504    return c;
   505 }
   507 static bool
   508 GetPair(int8_t c1, int8_t c2)
   509 {
   510   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
   511   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
   513   return (0 == ((gPair[c1] >> c2) & 0x0001));
   514 }
   516 static bool
   517 GetPairConservative(int8_t c1, int8_t c2)
   518 {
   519   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
   520   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
   522   return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
   523 }
   525 nsJISx4051LineBreaker::nsJISx4051LineBreaker()
   526 {
   527 }
   529 nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
   530 {
   531 }
   533 NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
   535 class ContextState {
   536 public:
   537   ContextState(const char16_t* aText, uint32_t aLength) {
   538     mUniText = aText;
   539     mText = nullptr;
   540     mLength = aLength;
   541     Init();
   542   }
   544   ContextState(const uint8_t* aText, uint32_t aLength) {
   545     mUniText = nullptr;
   546     mText = aText;
   547     mLength = aLength;
   548     Init();
   549   }
   551   uint32_t Length() { return mLength; }
   552   uint32_t Index() { return mIndex; }
   554   char16_t GetCharAt(uint32_t aIndex) {
   555     NS_ASSERTION(aIndex < mLength, "Out of range!");
   556     return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
   557   }
   559   void AdvanceIndex() {
   560     ++mIndex;
   561   }
   563   void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
   565 // A word of western language should not be broken. But even if the word has
   566 // only ASCII characters, non-natural context words should be broken, e.g.,
   567 // URL and file path. For protecting the natural words, we should use
   568 // conservative breaking rules at following conditions:
   569 //   1. at near the start of word
   570 //   2. at near the end of word
   571 //   3. at near the latest broken point
   572 // CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
   573 #define CONSERVATIVE_BREAK_RANGE 6
   575   bool UseConservativeBreaking(uint32_t aOffset = 0) {
   576     if (mHasCJKChar)
   577       return false;
   578     uint32_t index = mIndex + aOffset;
   579     bool result = (index < CONSERVATIVE_BREAK_RANGE ||
   580                      mLength - index < CONSERVATIVE_BREAK_RANGE ||
   581                      index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
   582     if (result || !mHasNonbreakableSpace)
   583       return result;
   585     // This text has no-breakable space, we need to check whether the index
   586     // is near it.
   588     // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.
   589     for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
   590       if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
   591         return true;
   592     }
   593     // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
   594     for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
   595       if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
   596         return true;
   597     }
   598     return false;
   599   }
   601   bool HasPreviousEqualsSign() const {
   602     return mHasPreviousEqualsSign;
   603   }
   604   void NotifySeenEqualsSign() {
   605     mHasPreviousEqualsSign = true;
   606   }
   608   bool HasPreviousSlash() const {
   609     return mHasPreviousSlash;
   610   }
   611   void NotifySeenSlash() {
   612     mHasPreviousSlash = true;
   613   }
   615   bool HasPreviousBackslash() const {
   616     return mHasPreviousBackslash;
   617   }
   618   void NotifySeenBackslash() {
   619     mHasPreviousBackslash = true;
   620   }
   622   char16_t GetPreviousNonHyphenCharacter() const {
   623     return mPreviousNonHyphenCharacter;
   624   }
   625   void NotifyNonHyphenCharacter(char16_t ch) {
   626     mPreviousNonHyphenCharacter = ch;
   627   }
   629 private:
   630   void Init() {
   631     mIndex = 0;
   632     mLastBreakIndex = 0;
   633     mPreviousNonHyphenCharacter = U_NULL;
   634     mHasCJKChar = 0;
   635     mHasNonbreakableSpace = 0;
   636     mHasPreviousEqualsSign = false;
   637     mHasPreviousSlash = false;
   638     mHasPreviousBackslash = false;
   640     for (uint32_t i = 0; i < mLength; ++i) {
   641       char16_t u = GetCharAt(i);
   642       if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
   643         mHasNonbreakableSpace = 1;
   644       else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
   645         mHasCJKChar = 1;
   646     }
   647   }
   649   const char16_t* mUniText;
   650   const uint8_t* mText;
   652   uint32_t mIndex;
   653   uint32_t mLength;         // length of text
   654   uint32_t mLastBreakIndex;
   655   char16_t mPreviousNonHyphenCharacter; // The last character we have seen
   656                                          // which is not U_HYPHEN
   657   bool mHasCJKChar; // if the text has CJK character, this is true.
   658   bool mHasNonbreakableSpace; // if the text has no-breakable space,
   659                                      // this is true.
   660   bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
   661   bool mHasPreviousSlash;      // True if we have seen a U_SLASH
   662   bool mHasPreviousBackslash;  // True if we have seen a U_BACKSLASH
   663 };
   665 static int8_t
   666 ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,
   667                    ContextState &aState)
   668 {
   669   // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
   671   if (IS_HYPHEN(cur)) {
   672     // If next character is hyphen, we don't need to break between them.
   673     if (IS_HYPHEN(next))
   674       return CLASS_CHARACTER;
   675     // If prev and next characters are numeric, it may be in Math context.
   676     // So, we should not break here.
   677     bool prevIsNum = IS_ASCII_DIGIT(prev);
   678     bool nextIsNum = IS_ASCII_DIGIT(next);
   679     if (prevIsNum && nextIsNum)
   680       return CLASS_NUMERIC;
   681     // If one side is numeric and the other is a character, or if both sides are
   682     // characters, the hyphen should be breakable.
   683     if (!aState.UseConservativeBreaking(1)) {
   684       char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
   685       if (prevOfHyphen && next) {
   686         int8_t prevClass = GetClass(prevOfHyphen);
   687         int8_t nextClass = GetClass(next);
   688         bool prevIsNumOrCharOrClose =
   689           prevIsNum ||
   690           (prevClass == CLASS_CHARACTER &&
   691             !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
   692           prevClass == CLASS_CLOSE ||
   693           prevClass == CLASS_CLOSE_LIKE_CHARACTER;
   694         bool nextIsNumOrCharOrOpen =
   695           nextIsNum ||
   696           (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
   697           nextClass == CLASS_OPEN ||
   698           nextClass == CLASS_OPEN_LIKE_CHARACTER ||
   699           next == U_OPEN_SINGLE_QUOTE ||
   700           next == U_OPEN_DOUBLE_QUOTE ||
   701           next == U_OPEN_GUILLEMET;
   702         if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
   703           return CLASS_CLOSE;
   704         }
   705       }
   706     }
   707   } else {
   708     aState.NotifyNonHyphenCharacter(cur);
   709     if (cur == U_SLASH || cur == U_BACKSLASH) {
   710       // If this is immediately after same char, we should not break here.
   711       if (prev == cur)
   712         return CLASS_CHARACTER;
   713       // If this text has two or more (BACK)SLASHs, this may be file path or URL.
   714       // Make sure to compute shouldReturn before we notify on this slash.
   715       bool shouldReturn = !aState.UseConservativeBreaking() &&
   716         (cur == U_SLASH ?
   717          aState.HasPreviousSlash() : aState.HasPreviousBackslash());
   719       if (cur == U_SLASH) {
   720         aState.NotifySeenSlash();
   721       } else {
   722         aState.NotifySeenBackslash();
   723       }
   725       if (shouldReturn)
   726         return CLASS_OPEN;
   727     } else if (cur == U_PERCENT) {
   728       // If this is a part of the param of URL, we should break before.
   729       if (!aState.UseConservativeBreaking()) {
   730         if (aState.Index() >= 3 &&
   731             aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
   732           return CLASS_OPEN;
   733         if (aState.Index() + 3 < aState.Length() &&
   734             aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
   735           return CLASS_OPEN;
   736       }
   737     } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
   738       // If this may be a separator of params of URL, we should break after.
   739       if (!aState.UseConservativeBreaking(1) &&
   740           aState.HasPreviousEqualsSign())
   741         return CLASS_CLOSE;
   742     } else if (cur == U_OPEN_SINGLE_QUOTE ||
   743                cur == U_OPEN_DOUBLE_QUOTE ||
   744                cur == U_OPEN_GUILLEMET) {
   745       // for CJK usage, we treat these as openers to allow a break before them,
   746       // but otherwise treat them as normal characters because quote mark usage
   747       // in various Western languages varies too much; see bug #450088 discussion.
   748       if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
   749         return CLASS_OPEN;
   750     } else {
   751       NS_ERROR("Forgot to handle the current character!");
   752     }
   753   }
   754   return GetClass(cur);
   755 }
   758 int32_t
   759 nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
   760                                 uint32_t aPos, int8_t aDirection)
   761 {
   762   bool    textNeedsJISx4051 = false;
   763   int32_t begin, end;
   765   for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
   766     if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
   767       textNeedsJISx4051 = true;
   768     }
   769   }
   770   for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
   771     if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
   772       textNeedsJISx4051 = true;
   773     }
   774   }
   776   int32_t ret;
   777   nsAutoTArray<uint8_t, 2000> breakState;
   778   if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
   779     // No complex text character, do not try to do complex line break.
   780     // (This is required for serializers. See Bug #344816.)
   781     // Also fall back to this when out of memory.
   782     if (aDirection < 0) {
   783       ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
   784     } else {
   785       ret = end;
   786     }
   787   } else {
   788     GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
   789                       breakState.Elements());
   791     ret = aPos;
   792     do {
   793       ret += aDirection;
   794     } while (begin < ret && ret < end && !breakState[ret - begin]);
   795   }
   797   return ret;
   798 }
   800 int32_t
   801 nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
   802                             uint32_t aPos) 
   803 {
   804   NS_ASSERTION(aText, "aText shouldn't be null");
   805   NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
   807   int32_t nextPos = WordMove(aText, aLen, aPos, 1);
   808   return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
   809 }
   811 int32_t
   812 nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
   813                             uint32_t aPos) 
   814 {
   815   NS_ASSERTION(aText, "aText shouldn't be null");
   816   NS_ASSERTION(aLen >= aPos && aPos > 0,
   817                "Bad position passed to nsJISx4051LineBreaker::Prev");
   819   int32_t prevPos = WordMove(aText, aLen, aPos, -1);
   820   return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
   821 }
   823 void
   824 nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
   825                                          uint8_t aWordBreak,
   826                                          uint8_t* aBreakBefore)
   827 {
   828   uint32_t cur;
   829   int8_t lastClass = CLASS_NONE;
   830   ContextState state(aChars, aLength);
   832   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
   833     char16_t ch = aChars[cur];
   834     int8_t cl;
   836     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
   837       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
   838                               ch,
   839                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
   840                               state);
   841     } else {
   842       if (ch == U_EQUAL)
   843         state.NotifySeenEqualsSign();
   844       state.NotifyNonHyphenCharacter(ch);
   845       cl = GetClass(ch);
   846     }
   848     bool allowBreak = false;
   849     if (cur > 0) {
   850       NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
   851                    "Loop should have prevented adjacent complex chars here");
   852       if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
   853         allowBreak = (state.UseConservativeBreaking()) ?
   854           GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
   855       } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
   856         allowBreak = true;
   857       }
   858     }
   859     aBreakBefore[cur] = allowBreak;
   860     if (allowBreak)
   861       state.NotifyBreakBefore();
   862     lastClass = cl;
   863     if (CLASS_COMPLEX == cl) {
   864       uint32_t end = cur + 1;
   866       while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
   867         ++end;
   868       }
   870       NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
   872       // We have to consider word-break value again for complex characters
   873       if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
   874         // Respect word-break property 
   875         for (uint32_t i = cur; i < end; i++)
   876           aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
   877       }
   879       // restore breakability at chunk begin, which was always set to false
   880       // by the complex line breaker
   881       aBreakBefore[cur] = allowBreak;
   883       cur = end - 1;
   884     }
   885   }
   886 }
   888 void
   889 nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
   890                                          uint8_t aWordBreak,
   891                                          uint8_t* aBreakBefore)
   892 {
   893   uint32_t cur;
   894   int8_t lastClass = CLASS_NONE;
   895   ContextState state(aChars, aLength);
   897   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
   898     char16_t ch = aChars[cur];
   899     int8_t cl;
   901     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
   902       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
   903                               ch,
   904                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
   905                               state);
   906     } else {
   907       if (ch == U_EQUAL)
   908         state.NotifySeenEqualsSign();
   909       state.NotifyNonHyphenCharacter(ch);
   910       cl = GetClass(ch);
   911     }
   913     bool allowBreak = false;
   914     if (cur > 0) {
   915       if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
   916         allowBreak = (state.UseConservativeBreaking()) ?
   917           GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
   918       } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
   919         allowBreak = true;
   920       }
   921     }
   922     aBreakBefore[cur] = allowBreak;
   923     if (allowBreak)
   924       state.NotifyBreakBefore();
   925     lastClass = cl;
   926   }
   927 }

mercurial