michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: 
michael@0: 
michael@0: #include "nsJISx4051LineBreaker.h"
michael@0: 
michael@0: #include "jisx4051class.h"
michael@0: #include "nsComplexBreaker.h"
michael@0: #include "nsTArray.h"
michael@0: 
michael@0: /* 
michael@0: 
michael@0:    Simplification of Pair Table in JIS X 4051
michael@0: 
   1. The Original Table - in 4.1.3

   In JIS X 4051, the pair table is defined as below
michael@0: 
michael@0:    Class of
michael@0:    Leading    Class of Trailing Char Class
michael@0:    Char        
michael@0: 
michael@0:               1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
michael@0:                                                  *  #  *  #
michael@0:         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
michael@0:         2        X  X  X  X  X                                               X
michael@0:         3        X  X  X  X  X                                               X
michael@0:         4        X  X  X  X  X                                               X
michael@0:         5        X  X  X  X  X                                               X
michael@0:         6        X  X  X  X  X                                               X
michael@0:         7        X  X  X  X  X  X                                            X
michael@0:         8        X  X  X  X  X                                X              E
michael@0:         9        X  X  X  X  X                                               X
michael@0:        10        X  X  X  X  X                                               X
michael@0:        11        X  X  X  X  X                                               X
michael@0:        12        X  X  X  X  X                                               X
michael@0:        13        X  X  X  X  X                    X                          X
michael@0:        14        X  X  X  X  X                          X                    X
michael@0:        15        X  X  X  X  X        X                       X        X     X
michael@0:        16        X  X  X  X  X                                   X     X     X
michael@0:        17        X  X  X  X  X                                               E
michael@0:        18        X  X  X  X  X                                X  X     X     X
michael@0:        19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
michael@0:        20        X  X  X  X  X                                               E
michael@0: 
michael@0:    * Same Char
michael@0:    # Other Char
michael@0: 
michael@0:    X Cannot Break
michael@0: 
michael@0:    The classes mean:
michael@0:       1: Open parenthesis
michael@0:       2: Close parenthesis
michael@0:       3: Prohibit a line break before
michael@0:       4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
michael@0:       5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
michael@0:       6: Full stop
michael@0:       7: Non-breakable between same characters
michael@0:       8: Prefix (e.g., "$", "NO.")
michael@0:       9: Postfix (e.g., "%")
michael@0:      10: Ideographic space
michael@0:      11: Hiragana
michael@0:      12: Japanese characters (except class 11)
michael@0:      13: Subscript
michael@0:      14: Ruby
michael@0:      15: Numeric
michael@0:      16: Alphabet
michael@0:      17: Space for Western language
michael@0:      18: Western characters (except class 17)
michael@0:      19: Split line note (Warichu) begin quote
michael@0:      20: Split line note (Warichu) end quote
michael@0: 
   2. Simplified by removing the classes which we do not care about

   However, since we do not care about class 13 (Subscript), 14 (Ruby),
   16 (Alphabet), 19 (split line note begin quote), and 20 (split line note
   end quote), we can simplify this pair table into the following
michael@0: 
michael@0:    Class of
michael@0:    Leading    Class of Trailing Char Class
michael@0:    Char
michael@0: 
michael@0:               1  2  3  4  5  6  7  8  9 10 11 12 15 17 18
michael@0: 
michael@0:         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
michael@0:         2        X  X  X  X  X                           
michael@0:         3        X  X  X  X  X                           
michael@0:         4        X  X  X  X  X                           
michael@0:         5        X  X  X  X  X                           
michael@0:         6        X  X  X  X  X                           
michael@0:         7        X  X  X  X  X  X                        
michael@0:         8        X  X  X  X  X                    X      
michael@0:         9        X  X  X  X  X                           
michael@0:        10        X  X  X  X  X                           
michael@0:        11        X  X  X  X  X                           
michael@0:        12        X  X  X  X  X                           
michael@0:        15        X  X  X  X  X        X           X     X
michael@0:        17        X  X  X  X  X                           
michael@0:        18        X  X  X  X  X                    X     X
michael@0: 
   3. Simplified by merging classes

   After the second simplification, the pair table has some duplication:
   a. classes 2, 3, 4, 5, 6 are the same - we can merge them
   b. classes 10, 11, 12, 17 are the same - we can merge them
michael@0: 
michael@0: 
michael@0:    Class of
michael@0:    Leading    Class of Trailing Char Class
michael@0:    Char
michael@0: 
michael@0:               1 [a] 7  8  9 [b]15 18
michael@0: 
michael@0:         1     X  X  X  X  X  X  X  X
michael@0:       [a]        X                  
michael@0:         7        X  X               
michael@0:         8        X              X   
michael@0:         9        X                  
michael@0:       [b]        X                  
michael@0:        15        X        X     X  X
michael@0:        18        X              X  X
michael@0: 
michael@0: 
   4. We add COMPLEX characters and make them breakable with all other classes
      except after class 1 and before class [a]
michael@0: 
michael@0:    Class of
michael@0:    Leading    Class of Trailing Char Class
michael@0:    Char
michael@0: 
michael@0:               1 [a] 7  8  9 [b]15 18 COMPLEX
michael@0: 
michael@0:         1     X  X  X  X  X  X  X  X  X
michael@0:       [a]        X                     
michael@0:         7        X  X                  
michael@0:         8        X              X      
michael@0:         9        X                     
michael@0:       [b]        X                     
michael@0:        15        X        X     X  X   
michael@0:        18        X              X  X   
michael@0:   COMPLEX        X                    T
michael@0: 
michael@0:      T : need special handling
michael@0: 
michael@0: 
   5. However, we need two special classes for some punctuation/parentheses
      whose breaking rules are like those of the character class (18); see
      bug 389056. We also need them for punctuation-like characters that behave
      the same as class 18 but are not letters of any language (e.g., '_').
michael@0:       [c]. Based on open parenthesis class (1), but it is not breakable after
michael@0:            character class (18) or numeric class (15).
michael@0:       [d]. Based on close parenthesis (or punctuation) class (2), but it is not
michael@0:            breakable before character class (18) or numeric class (15).
michael@0: 
michael@0:    Class of
michael@0:    Leading    Class of Trailing Char Class
michael@0:    Char
michael@0: 
michael@0:               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d]
michael@0: 
michael@0:         1     X  X  X  X  X  X  X  X  X       X    X
michael@0:       [a]        X                            X    X
michael@0:         7        X  X                               
michael@0:         8        X              X                   
michael@0:         9        X                                  
michael@0:       [b]        X                                 X
michael@0:        15        X        X     X  X          X    X
michael@0:        18        X              X  X          X    X
michael@0:   COMPLEX        X                    T             
michael@0:       [c]     X  X  X  X  X  X  X  X  X       X    X
michael@0:       [d]        X              X  X               X
michael@0: 
michael@0: 
   6. And Unicode has "NON-BREAK" characters. The lines should not be broken
      around them. But JIS X 4051 has no such class; therefore, we create [e].
michael@0: 
michael@0:    Class of
michael@0:    Leading    Class of Trailing Char Class
michael@0:    Char
michael@0: 
michael@0:               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
michael@0: 
michael@0:         1     X  X  X  X  X  X  X  X  X       X    X   X
michael@0:       [a]        X                                 X   X
michael@0:         7        X  X                                  X
michael@0:         8        X              X                      X
michael@0:         9        X                                     X
michael@0:       [b]        X                                 X   X
michael@0:        15        X        X     X  X          X    X   X
michael@0:        18        X              X  X          X    X   X
michael@0:   COMPLEX        X                    T                X
michael@0:       [c]     X  X  X  X  X  X  X  X  X       X    X   X
michael@0:       [d]        X              X  X               X   X
michael@0:       [e]     X  X  X  X  X  X  X  X  X       X    X   X
michael@0: 
michael@0: 
   7. Now we use one bit to encode whether a break is prohibited (1 means X),
      and use 2 bytes for one row; then the bit table will look like:
michael@0: 
michael@0:                  18    <-   1
michael@0: 
michael@0:        1  0000 1111 1111 1111  = 0x0FFF
michael@0:       [a] 0000 1100 0000 0010  = 0x0C02
michael@0:        7  0000 1000 0000 0110  = 0x0806
michael@0:        8  0000 1000 0100 0010  = 0x0842
michael@0:        9  0000 1000 0000 0010  = 0x0802
michael@0:       [b] 0000 1100 0000 0010  = 0x0C02
michael@0:       15  0000 1110 1101 0010  = 0x0ED2
michael@0:       18  0000 1110 1100 0010  = 0x0EC2
michael@0:  COMPLEX  0000 1001 0000 0010  = 0x0902
michael@0:       [c] 0000 1111 1111 1111  = 0x0FFF
michael@0:       [d] 0000 1100 1100 0010  = 0x0CC2
michael@0:       [e] 0000 1111 1111 1111  = 0x0FFF
michael@0: */
michael@0: 
// Number of line-break classes; each pair table has one row per class.
#define MAX_CLASSES 12

// Pair table for the normal breaking rules.  The row index is the class
// number of the leading character, and bit N of a row describes a trailing
// character of class N: a set bit is an 'X' (cannot break) in the bit table
// of step 7 above, a clear bit permits the break.
static const uint16_t gPair[MAX_CLASSES] = {
  0x0FFF,  // 0: 1
  0x0C02,  // 1: [a]
  0x0806,  // 2: 7
  0x0842,  // 3: 8
  0x0802,  // 4: 9
  0x0C02,  // 5: [b]
  0x0ED2,  // 6: 15
  0x0EC2,  // 7: 18
  0x0902,  // 8: COMPLEX
  0x0FFF,  // 9: [c]
  0x0CC2,  // A: [d]
  0x0FFF   // B: [e]
};
michael@0: 
michael@0: 
michael@0: /*
michael@0: 
   8. And if the character is not far enough from the word start, the word end
      and another break point, we should not break in non-CJK languages.
      I.e., don't break around 15, 18, [c] and [d], but don't change
      that if they are related to [b].
michael@0: 
michael@0:    Class of
michael@0:    Leading    Class of Trailing Char Class
michael@0:    Char
michael@0: 
michael@0:               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
michael@0: 
michael@0:         1     X  X  X  X  X  X  X  X  X       X    X   X
michael@0:       [a]        X              X  X          X    X   X
michael@0:         7        X  X           X  X          X    X   X
michael@0:         8        X              X  X          X    X   X
michael@0:         9        X              X  X          X    X   X
michael@0:       [b]        X                                 X   X
michael@0:        15     X  X  X  X  X     X  X  X       X    X   X
michael@0:        18     X  X  X  X  X     X  X  X       X    X   X
michael@0:   COMPLEX        X              X  X  T       X    X   X
michael@0:       [c]     X  X  X  X  X  X  X  X  X       X    X   X
michael@0:       [d]     X  X  X  X  X     X  X  X       X    X   X
michael@0:       [e]     X  X  X  X  X  X  X  X  X       X    X   X
michael@0: 
michael@0:                  18    <-   1
michael@0: 
michael@0:        1  0000 1111 1111 1111  = 0x0FFF
michael@0:       [a] 0000 1110 1100 0010  = 0x0EC2
michael@0:        7  0000 1110 1100 0110  = 0x0EC6
michael@0:        8  0000 1110 1100 0010  = 0x0EC2
michael@0:        9  0000 1110 1100 0010  = 0x0EC2
michael@0:       [b] 0000 1100 0000 0010  = 0x0C02
michael@0:       15  0000 1111 1101 1111  = 0x0FDF
michael@0:       18  0000 1111 1101 1111  = 0x0FDF
michael@0:  COMPLEX  0000 1111 1100 0010  = 0x0FC2
michael@0:       [c] 0000 1111 1111 1111  = 0x0FFF
michael@0:       [d] 0000 1111 1101 1111  = 0x0FDF
michael@0:       [e] 0000 1111 1111 1111  = 0x0FFF
michael@0: */
michael@0: 
michael@0: static const uint16_t gPairConservative[MAX_CLASSES] = {
michael@0:   0x0FFF,
michael@0:   0x0EC2,
michael@0:   0x0EC6,
michael@0:   0x0EC2,
michael@0:   0x0EC2,
michael@0:   0x0C02,
michael@0:   0x0FDF,
michael@0:   0x0FDF,
michael@0:   0x0FC2,
michael@0:   0x0FFF,
michael@0:   0x0FDF,
michael@0:   0x0FFF
michael@0: };
michael@0: 
michael@0: 
michael@0: /*
michael@0: 
michael@0:    9. Now we map the class to number
michael@0: 
michael@0:       0: 1 
michael@0:       1: [a]- 2, 3, 4, 5, 6
michael@0:       2: 7
michael@0:       3: 8
michael@0:       4: 9
michael@0:       5: [b]- 10, 11, 12, 17
michael@0:       6: 15
michael@0:       7: 18
michael@0:       8: COMPLEX
michael@0:       9: [c]
michael@0:       A: [d]
michael@0:       B: [e]
michael@0: 
michael@0:     and they mean:
michael@0:       0: Open parenthesis
michael@0:       1: Punctuation that prohibits break before
michael@0:       2: Non-breakable between same classes
michael@0:       3: Prefix
michael@0:       4: Postfix
michael@0:       5: Breakable character (Spaces and Most Japanese characters)
michael@0:       6: Numeric
michael@0:       7: Characters
michael@0:       8: Need special handling characters (E.g., Thai)
michael@0:       9: Open parentheses like Character (See bug 389056)
      A: Close parentheses (or punctuation) like Character (See bug 389056)
michael@0:       B: Non breakable (See bug 390920)
michael@0: 
michael@0: */
michael@0: 
// Not a pair-table row: INT8_MAX is far above MAX_CLASSES.
// NOTE(review): presumably the "no class" marker; its users are outside this
// part of the file.
#define CLASS_NONE                             INT8_MAX

// Class numbers from step 9 above.  Each value is both the row index and the
// bit position used with gPair / gPairConservative.
#define CLASS_OPEN                             0x00  // 1: open parenthesis
#define CLASS_CLOSE                            0x01  // [a]: prohibits break before
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02  // 7
#define CLASS_PREFIX                           0x03  // 8: prefix
// The spelling "POSTFFIX" is kept as is; the name may be referenced elsewhere.
#define CLASS_POSTFFIX                         0x04  // 9: postfix
#define CLASS_BREAKABLE                        0x05  // [b]: breakable character
#define CLASS_NUMERIC                          0x06  // 15: numeric
#define CLASS_CHARACTER                        0x07  // 18: characters
#define CLASS_COMPLEX                          0x08  // needs special handling
#define CLASS_OPEN_LIKE_CHARACTER              0x09  // [c] (see bug 389056)
#define CLASS_CLOSE_LIKE_CHARACTER             0x0A  // [d] (see bug 389056)
#define CLASS_NON_BREAKABLE                    0x0B  // [e] (see bug 390920)
michael@0: 
// UTF-16 code units that the breaker tests for by name.
#define U_NULL      char16_t(0x0000)  // U+0000
#define U_SLASH     char16_t('/')     // U+002F
#define U_SPACE     char16_t(' ')     // U+0020
#define U_HYPHEN    char16_t('-')     // U+002D
#define U_EQUAL     char16_t('=')     // U+003D
#define U_PERCENT   char16_t('%')     // U+0025
#define U_AMPERSAND char16_t('&')     // U+0026
#define U_SEMICOLON char16_t(';')     // U+003B
#define U_BACKSLASH char16_t('\\')    // U+005C

// Opening quotation marks.
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)  // LEFT SINGLE QUOTATION MARK
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)  // LEFT DOUBLE QUOTATION MARK
#define U_OPEN_GUILLEMET    char16_t(0x00AB)  // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
michael@0: 
// True for the characters that ContextualAnalysis() has dedicated rules for:
// anything IS_HYPHEN() accepts, slash, percent, ampersand, semicolon,
// backslash and the three opening quotation marks.
// |c| is expanded several times, so pass an expression without side effects.
#define NEED_CONTEXTUAL_ANALYSIS(c) \
  (IS_HYPHEN(c)                  || \
   (c) == U_SLASH                || \
   (c) == U_PERCENT              || \
   (c) == U_AMPERSAND            || \
   (c) == U_SEMICOLON            || \
   (c) == U_BACKSLASH            || \
   (c) == U_OPEN_SINGLE_QUOTE    || \
   (c) == U_OPEN_DOUBLE_QUOTE    || \
   (c) == U_OPEN_GUILLEMET)
michael@0: 
// True when |u| is one of the ASCII digits U+0030 ('0') .. U+0039 ('9').
// |u| may be expanded twice, so pass an expression without side effects.
#define IS_ASCII_DIGIT(u) ((u) >= 0x0030 && (u) <= 0x0039)
michael@0: 
// Reads the 4-bit entry number |l| from the packed table |t|.  Every 32-bit
// word of the table holds eight entries, entry 0 in the lowest nibble.
static inline int
GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
{
  const uint32_t word = t[l >> 3];             // eight entries per word
  const unsigned shift = (l & 0x0007) << 2;    // four bits per entry
  return static_cast<int>((word >> shift) & 0x000f);
}
michael@0: 
// Returns 1 when |u| lies in U+FF66..U+FF70 (the halfwidth code units that
// GetClass() treats as JIS X 4051 class 3), otherwise 0.
static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
{
  if (u < 0xff66) {
    return 0;
  }
  return u <= 0xff70;
}
michael@0: 
// Returns non-zero when |u| falls in one of the ranges this file counts as
// CJK: U+1100..U+11FF, U+2E80..U+D7FF, U+F900..U+FAFF or U+FF00..U+FFEF.
static inline int
IS_CJK_CHAR(char16_t u)
{
  // The ranges are tested in ascending order; each "below the next range"
  // check rejects the gap in front of that range.
  if (u < 0x1100) {
    return 0;
  }
  if (u <= 0x11ff) {
    return 1;
  }
  if (u < 0x2e80) {
    return 0;
  }
  if (u <= 0xd7ff) {
    return 1;
  }
  if (u < 0xf900) {
    return 0;
  }
  if (u <= 0xfaff) {
    return 1;
  }
  return 0xff00 <= u && u <= 0xffef;
}
michael@0: 
// True for the two space characters that must not be used as break points.
static inline bool
IS_NONBREAKABLE_SPACE(char16_t u)
{
  switch (u) {
    case 0x00A0: // NO-BREAK SPACE
    case 0x2007: // FIGURE SPACE
      return true;
    default:
      return false;
  }
}
michael@0: 
michael@0: static inline bool
michael@0: IS_HYPHEN(char16_t u)
michael@0: {
michael@0:   return (u == U_HYPHEN ||
michael@0:           u == 0x058A || // ARMENIAN HYPHEN
michael@0:           u == 0x2010 || // HYPHEN
michael@0:           u == 0x2012 || // FIGURE DASH
michael@0:           u == 0x2013);  // EN DASH
michael@0: }
michael@0: 
michael@0: static int8_t
michael@0: GetClass(char16_t u)
michael@0: {
michael@0:    uint16_t h = u & 0xFF00;
michael@0:    uint16_t l = u & 0x00ff;
michael@0:    int8_t c;
michael@0: 
michael@0:    // Handle 3 range table first
michael@0:    if (0x0000 == h) {
michael@0:      c = GETCLASSFROMTABLE(gLBClass00, l);
michael@0:    } else if (0x1700 == h) {
michael@0:      c = GETCLASSFROMTABLE(gLBClass17, l);
michael@0:    } else if (NS_NeedsPlatformNativeHandling(u)) {
michael@0:      c = CLASS_COMPLEX;
michael@0:    } else if (0x0E00 == h) {
michael@0:      c = GETCLASSFROMTABLE(gLBClass0E, l);
michael@0:    } else if (0x2000 == h) {
michael@0:      c = GETCLASSFROMTABLE(gLBClass20, l);
michael@0:    } else if (0x2100 == h) {
michael@0:      c = GETCLASSFROMTABLE(gLBClass21, l);
michael@0:    } else if (0x3000 == h) {
michael@0:      c = GETCLASSFROMTABLE(gLBClass30, l);
michael@0:    } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi
michael@0:               ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul
michael@0:               ((0xf900 <= h) && (h <= 0xfaff))) {
michael@0:      c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility
michael@0:    } else if (0xff00 == h) {
michael@0:      if (l < 0x0060) { // Fullwidth ASCII variant
michael@0:        c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));
michael@0:      } else if (l < 0x00a0) {
michael@0:        switch (l) {
michael@0:          case 0x61: c = GetClass(0x3002); break;
michael@0:          case 0x62: c = GetClass(0x300c); break;
michael@0:          case 0x63: c = GetClass(0x300d); break;
michael@0:          case 0x64: c = GetClass(0x3001); break;
michael@0:          case 0x65: c = GetClass(0x30fb); break;
michael@0:          case 0x9e: c = GetClass(0x309b); break;
michael@0:          case 0x9f: c = GetClass(0x309c); break;
michael@0:          default:
michael@0:            if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
michael@0:               c = CLASS_CLOSE; // jis x4051 class 3
michael@0:            else
michael@0:               c = CLASS_BREAKABLE; // jis x4051 class 11
michael@0:            break;
michael@0:        }
michael@0:      // Halfwidth Katakana variants
michael@0:      } else if (l < 0x00e0) {
michael@0:        c = CLASS_CHARACTER; // Halfwidth Hangul variants
michael@0:      } else if (l < 0x00f0) {
michael@0:        static char16_t NarrowFFEx[16] = {
michael@0:          0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
michael@0:          0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
michael@0:        };
michael@0:        c = GetClass(NarrowFFEx[l - 0x00e0]);
michael@0:      } else {
michael@0:        c = CLASS_CHARACTER;
michael@0:      }
michael@0:    } else if (0x3100 == h) { 
michael@0:      if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
michael@0:                       // XXX: This is per UAX #14, but UAX #14 may change
michael@0:                       // the line breaking rules about Kanbun and Bopomofo.
michael@0:        c = CLASS_BREAKABLE;
michael@0:      } else if (l >= 0xf0) { // Katakana small letters for Ainu
michael@0:        c = CLASS_CLOSE;
michael@0:      } else { // unassigned
michael@0:        c = CLASS_CHARACTER;
michael@0:      }
michael@0:    } else if (0x0300 == h) {
michael@0:      if (0x4F == l || (0x5C <= l && l <= 0x62))
michael@0:        c = CLASS_NON_BREAKABLE;
michael@0:      else
michael@0:        c = CLASS_CHARACTER;
michael@0:    } else if (0x0500 == h) {
michael@0:      // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
michael@0:      if (l == 0x8A)
michael@0:        c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
michael@0:      else
michael@0:        c = CLASS_CHARACTER;
michael@0:    } else if (0x0F00 == h) {
michael@0:      if (0x08 == l || 0x0C == l || 0x12 == l)
michael@0:        c = CLASS_NON_BREAKABLE;
michael@0:      else
michael@0:        c = CLASS_CHARACTER;
michael@0:    } else if (0x1800 == h) {
michael@0:      if (0x0E == l)
michael@0:        c = CLASS_NON_BREAKABLE;
michael@0:      else
michael@0:        c = CLASS_CHARACTER;
michael@0:    } else if (0x1600 == h) {
michael@0:      if (0x80 == l) { // U+1680 OGHAM SPACE MARK
michael@0:        c = CLASS_BREAKABLE;
michael@0:      } else {
michael@0:        c = CLASS_CHARACTER;
michael@0:      }
michael@0:    } else if (u == 0xfeff) {
michael@0:      c = CLASS_NON_BREAKABLE;
michael@0:    } else {
michael@0:      c = CLASS_CHARACTER; // others
michael@0:    }
michael@0:    return c;
michael@0: }
michael@0: 
michael@0: static bool
michael@0: GetPair(int8_t c1, int8_t c2)
michael@0: {
michael@0:   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
michael@0:   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
michael@0: 
michael@0:   return (0 == ((gPair[c1] >> c2) & 0x0001));
michael@0: }
michael@0: 
michael@0: static bool
michael@0: GetPairConservative(int8_t c1, int8_t c2)
michael@0: {
michael@0:   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
michael@0:   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
michael@0: 
michael@0:   return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
michael@0: }
michael@0: 
michael@0: nsJISx4051LineBreaker::nsJISx4051LineBreaker()
michael@0: {
michael@0: }
michael@0: 
michael@0: nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
michael@0: {
michael@0: }
michael@0: 
michael@0: NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
michael@0: 
michael@0: class ContextState {
michael@0: public:
michael@0:   ContextState(const char16_t* aText, uint32_t aLength) {
michael@0:     mUniText = aText;
michael@0:     mText = nullptr;
michael@0:     mLength = aLength;
michael@0:     Init();
michael@0:   }
michael@0: 
michael@0:   ContextState(const uint8_t* aText, uint32_t aLength) {
michael@0:     mUniText = nullptr;
michael@0:     mText = aText;
michael@0:     mLength = aLength;
michael@0:     Init();
michael@0:   }
michael@0: 
michael@0:   uint32_t Length() { return mLength; }
michael@0:   uint32_t Index() { return mIndex; }
michael@0: 
michael@0:   char16_t GetCharAt(uint32_t aIndex) {
michael@0:     NS_ASSERTION(aIndex < mLength, "Out of range!");
michael@0:     return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
michael@0:   }
michael@0: 
michael@0:   void AdvanceIndex() {
michael@0:     ++mIndex;
michael@0:   }
michael@0: 
michael@0:   void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
michael@0: 
michael@0: // A word of western language should not be broken. But even if the word has
michael@0: // only ASCII characters, non-natural context words should be broken, e.g.,
michael@0: // URL and file path. For protecting the natural words, we should use
michael@0: // conservative breaking rules at following conditions:
michael@0: //   1. at near the start of word
michael@0: //   2. at near the end of word
michael@0: //   3. at near the latest broken point
michael@0: // CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
michael@0: #define CONSERVATIVE_BREAK_RANGE 6
michael@0: 
michael@0:   bool UseConservativeBreaking(uint32_t aOffset = 0) {
michael@0:     if (mHasCJKChar)
michael@0:       return false;
michael@0:     uint32_t index = mIndex + aOffset;
michael@0:     bool result = (index < CONSERVATIVE_BREAK_RANGE ||
michael@0:                      mLength - index < CONSERVATIVE_BREAK_RANGE ||
michael@0:                      index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
michael@0:     if (result || !mHasNonbreakableSpace)
michael@0:       return result;
michael@0: 
michael@0:     // This text has no-breakable space, we need to check whether the index
michael@0:     // is near it.
michael@0: 
michael@0:     // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.
michael@0:     for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
michael@0:       if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
michael@0:         return true;
michael@0:     }
michael@0:     // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
michael@0:     for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
michael@0:       if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
michael@0:         return true;
michael@0:     }
michael@0:     return false;
michael@0:   }
michael@0: 
michael@0:   bool HasPreviousEqualsSign() const {
michael@0:     return mHasPreviousEqualsSign;
michael@0:   }
michael@0:   void NotifySeenEqualsSign() {
michael@0:     mHasPreviousEqualsSign = true;
michael@0:   }
michael@0: 
michael@0:   bool HasPreviousSlash() const {
michael@0:     return mHasPreviousSlash;
michael@0:   }
michael@0:   void NotifySeenSlash() {
michael@0:     mHasPreviousSlash = true;
michael@0:   }
michael@0: 
michael@0:   bool HasPreviousBackslash() const {
michael@0:     return mHasPreviousBackslash;
michael@0:   }
michael@0:   void NotifySeenBackslash() {
michael@0:     mHasPreviousBackslash = true;
michael@0:   }
michael@0: 
michael@0:   char16_t GetPreviousNonHyphenCharacter() const {
michael@0:     return mPreviousNonHyphenCharacter;
michael@0:   }
michael@0:   void NotifyNonHyphenCharacter(char16_t ch) {
michael@0:     mPreviousNonHyphenCharacter = ch;
michael@0:   }
michael@0: 
michael@0: private:
michael@0:   void Init() {
michael@0:     mIndex = 0;
michael@0:     mLastBreakIndex = 0;
michael@0:     mPreviousNonHyphenCharacter = U_NULL;
michael@0:     mHasCJKChar = 0;
michael@0:     mHasNonbreakableSpace = 0;
michael@0:     mHasPreviousEqualsSign = false;
michael@0:     mHasPreviousSlash = false;
michael@0:     mHasPreviousBackslash = false;
michael@0: 
michael@0:     for (uint32_t i = 0; i < mLength; ++i) {
michael@0:       char16_t u = GetCharAt(i);
michael@0:       if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
michael@0:         mHasNonbreakableSpace = 1;
michael@0:       else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
michael@0:         mHasCJKChar = 1;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   const char16_t* mUniText;
michael@0:   const uint8_t* mText;
michael@0: 
michael@0:   uint32_t mIndex;
michael@0:   uint32_t mLength;         // length of text
michael@0:   uint32_t mLastBreakIndex;
michael@0:   char16_t mPreviousNonHyphenCharacter; // The last character we have seen
michael@0:                                          // which is not U_HYPHEN
michael@0:   bool mHasCJKChar; // if the text has CJK character, this is true.
michael@0:   bool mHasNonbreakableSpace; // if the text has no-breakable space,
michael@0:                                      // this is true.
michael@0:   bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
michael@0:   bool mHasPreviousSlash;      // True if we have seen a U_SLASH
michael@0:   bool mHasPreviousBackslash;  // True if we have seen a U_BACKSLASH
michael@0: };
michael@0: 
michael@0: static int8_t
michael@0: ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,
michael@0:                    ContextState &aState)
michael@0: {
michael@0:   // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
michael@0: 
michael@0:   if (IS_HYPHEN(cur)) {
michael@0:     // If next character is hyphen, we don't need to break between them.
michael@0:     if (IS_HYPHEN(next))
michael@0:       return CLASS_CHARACTER;
michael@0:     // If prev and next characters are numeric, it may be in Math context.
michael@0:     // So, we should not break here.
michael@0:     bool prevIsNum = IS_ASCII_DIGIT(prev);
michael@0:     bool nextIsNum = IS_ASCII_DIGIT(next);
michael@0:     if (prevIsNum && nextIsNum)
michael@0:       return CLASS_NUMERIC;
michael@0:     // If one side is numeric and the other is a character, or if both sides are
michael@0:     // characters, the hyphen should be breakable.
michael@0:     if (!aState.UseConservativeBreaking(1)) {
michael@0:       char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
michael@0:       if (prevOfHyphen && next) {
michael@0:         int8_t prevClass = GetClass(prevOfHyphen);
michael@0:         int8_t nextClass = GetClass(next);
michael@0:         bool prevIsNumOrCharOrClose =
michael@0:           prevIsNum ||
michael@0:           (prevClass == CLASS_CHARACTER &&
michael@0:             !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
michael@0:           prevClass == CLASS_CLOSE ||
michael@0:           prevClass == CLASS_CLOSE_LIKE_CHARACTER;
michael@0:         bool nextIsNumOrCharOrOpen =
michael@0:           nextIsNum ||
michael@0:           (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
michael@0:           nextClass == CLASS_OPEN ||
michael@0:           nextClass == CLASS_OPEN_LIKE_CHARACTER ||
michael@0:           next == U_OPEN_SINGLE_QUOTE ||
michael@0:           next == U_OPEN_DOUBLE_QUOTE ||
michael@0:           next == U_OPEN_GUILLEMET;
michael@0:         if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
michael@0:           return CLASS_CLOSE;
michael@0:         }
michael@0:       }
michael@0:     }
michael@0:   } else {
michael@0:     aState.NotifyNonHyphenCharacter(cur);
michael@0:     if (cur == U_SLASH || cur == U_BACKSLASH) {
michael@0:       // If this is immediately after same char, we should not break here.
michael@0:       if (prev == cur)
michael@0:         return CLASS_CHARACTER;
michael@0:       // If this text has two or more (BACK)SLASHs, this may be file path or URL.
michael@0:       // Make sure to compute shouldReturn before we notify on this slash.
michael@0:       bool shouldReturn = !aState.UseConservativeBreaking() &&
michael@0:         (cur == U_SLASH ?
michael@0:          aState.HasPreviousSlash() : aState.HasPreviousBackslash());
michael@0: 
michael@0:       if (cur == U_SLASH) {
michael@0:         aState.NotifySeenSlash();
michael@0:       } else {
michael@0:         aState.NotifySeenBackslash();
michael@0:       }
michael@0: 
michael@0:       if (shouldReturn)
michael@0:         return CLASS_OPEN;
michael@0:     } else if (cur == U_PERCENT) {
michael@0:       // If this is a part of the param of URL, we should break before.
michael@0:       if (!aState.UseConservativeBreaking()) {
michael@0:         if (aState.Index() >= 3 &&
michael@0:             aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
michael@0:           return CLASS_OPEN;
michael@0:         if (aState.Index() + 3 < aState.Length() &&
michael@0:             aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
michael@0:           return CLASS_OPEN;
michael@0:       }
michael@0:     } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
michael@0:       // If this may be a separator of params of URL, we should break after.
michael@0:       if (!aState.UseConservativeBreaking(1) &&
michael@0:           aState.HasPreviousEqualsSign())
michael@0:         return CLASS_CLOSE;
michael@0:     } else if (cur == U_OPEN_SINGLE_QUOTE ||
michael@0:                cur == U_OPEN_DOUBLE_QUOTE ||
michael@0:                cur == U_OPEN_GUILLEMET) {
michael@0:       // for CJK usage, we treat these as openers to allow a break before them,
michael@0:       // but otherwise treat them as normal characters because quote mark usage
michael@0:       // in various Western languages varies too much; see bug #450088 discussion.
michael@0:       if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
michael@0:         return CLASS_OPEN;
michael@0:     } else {
michael@0:       NS_ERROR("Forgot to handle the current character!");
michael@0:     }
michael@0:   }
michael@0:   return GetClass(cur);
michael@0: }
michael@0: 
michael@0: 
michael@0: int32_t
michael@0: nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
michael@0:                                 uint32_t aPos, int8_t aDirection)
michael@0: {
michael@0:   bool    textNeedsJISx4051 = false;
michael@0:   int32_t begin, end;
michael@0: 
michael@0:   for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
michael@0:     if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
michael@0:       textNeedsJISx4051 = true;
michael@0:     }
michael@0:   }
michael@0:   for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
michael@0:     if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
michael@0:       textNeedsJISx4051 = true;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   int32_t ret;
michael@0:   nsAutoTArray<uint8_t, 2000> breakState;
michael@0:   if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
michael@0:     // No complex text character, do not try to do complex line break.
michael@0:     // (This is required for serializers. See Bug #344816.)
michael@0:     // Also fall back to this when out of memory.
michael@0:     if (aDirection < 0) {
michael@0:       ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
michael@0:     } else {
michael@0:       ret = end;
michael@0:     }
michael@0:   } else {
michael@0:     GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
michael@0:                       breakState.Elements());
michael@0: 
michael@0:     ret = aPos;
michael@0:     do {
michael@0:       ret += aDirection;
michael@0:     } while (begin < ret && ret < end && !breakState[ret - begin]);
michael@0:   }
michael@0: 
michael@0:   return ret;
michael@0: }
michael@0: 
michael@0: // Returns the offset of the next break position after aPos as computed by
michael@0: // WordMove() with direction 1.  If that offset is not strictly inside the
michael@0: // text (i.e. it is >= aLen), NS_LINEBREAKER_NEED_MORE_TEXT is returned.
michael@0: int32_t
michael@0: nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
michael@0:                             uint32_t aPos) 
michael@0: {
michael@0:   NS_ASSERTION(aText, "aText shouldn't be null");
michael@0:   NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
michael@0: 
michael@0:   const int32_t candidate = WordMove(aText, aLen, aPos, 1);
michael@0:   if (candidate >= int32_t(aLen)) {
michael@0:     // The walk ran off the end of the available text.
michael@0:     return NS_LINEBREAKER_NEED_MORE_TEXT;
michael@0:   }
michael@0:   return candidate;
michael@0: }
michael@0: 
michael@0: // Returns the offset of the previous break position before aPos as computed
michael@0: // by WordMove() with direction -1.  If that offset is not greater than zero,
michael@0: // NS_LINEBREAKER_NEED_MORE_TEXT is returned.  aPos may equal aLen.
michael@0: int32_t
michael@0: nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
michael@0:                             uint32_t aPos) 
michael@0: {
michael@0:   NS_ASSERTION(aText, "aText shouldn't be null");
michael@0:   NS_ASSERTION(aLen >= aPos && aPos > 0,
michael@0:                "Bad position passed to nsJISx4051LineBreaker::Prev");
michael@0: 
michael@0:   const int32_t candidate = WordMove(aText, aLen, aPos, -1);
michael@0:   if (candidate <= 0) {
michael@0:     // The walk reached (or passed) the beginning of the available text.
michael@0:     return NS_LINEBREAKER_NEED_MORE_TEXT;
michael@0:   }
michael@0:   return candidate;
michael@0: }
michael@0: 
michael@0: // Computes break opportunities for 16-bit text.
michael@0: //
michael@0: // aChars/aLength  the text to analyse
michael@0: // aWordBreak      one of the nsILineBreaker::kWordBreak_* values:
michael@0: //                 kWordBreak_Normal consults the pair tables,
michael@0: //                 kWordBreak_BreakAll allows a break before every character
michael@0: //                 but the first, any other value allows no break at all
michael@0: // aBreakBefore    output array of at least aLength entries; entry i is set
michael@0: //                 to nonzero when a break is allowed before aChars[i]
michael@0: //
michael@0: // Runs of CLASS_COMPLEX characters are delegated to NS_GetComplexLineBreaks().
michael@0: void
michael@0: nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
michael@0:                                          uint8_t aWordBreak,
michael@0:                                          uint8_t* aBreakBefore)
michael@0: {
michael@0:   const bool normalBreaking =
michael@0:     (aWordBreak == nsILineBreaker::kWordBreak_Normal);
michael@0:   const bool breakEverywhere =
michael@0:     (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
michael@0: 
michael@0:   ContextState context(aChars, aLength);
michael@0:   int8_t lastClass = CLASS_NONE;
michael@0:   uint32_t pos = 0;
michael@0: 
michael@0:   while (pos < aLength) {
michael@0:     const char16_t current = aChars[pos];
michael@0:     int8_t cl;
michael@0: 
michael@0:     // Step 1: classify the current character, keeping the context up to date.
michael@0:     if (!NEED_CONTEXTUAL_ANALYSIS(current)) {
michael@0:       if (current == U_EQUAL) {
michael@0:         context.NotifySeenEqualsSign();
michael@0:       }
michael@0:       context.NotifyNonHyphenCharacter(current);
michael@0:       cl = GetClass(current);
michael@0:     } else {
michael@0:       // Neighbours outside the buffer are reported as U_NULL.
michael@0:       const char16_t before = pos > 0 ? aChars[pos - 1] : U_NULL;
michael@0:       const char16_t after = pos + 1 < aLength ? aChars[pos + 1] : U_NULL;
michael@0:       cl = ContextualAnalysis(before, current, after, context);
michael@0:     }
michael@0: 
michael@0:     // Step 2: decide whether a break may precede this character.  There is
michael@0:     // never a break before the first character.
michael@0:     bool canBreak = false;
michael@0:     if (pos > 0) {
michael@0:       NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
michael@0:                    "Loop should have prevented adjacent complex chars here");
michael@0:       if (normalBreaking) {
michael@0:         if (context.UseConservativeBreaking()) {
michael@0:           canBreak = GetPairConservative(lastClass, cl);
michael@0:         } else {
michael@0:           canBreak = GetPair(lastClass, cl);
michael@0:         }
michael@0:       } else if (breakEverywhere) {
michael@0:         canBreak = true;
michael@0:       }
michael@0:     }
michael@0: 
michael@0:     aBreakBefore[pos] = canBreak;
michael@0:     if (canBreak) {
michael@0:       context.NotifyBreakBefore();
michael@0:     }
michael@0:     lastClass = cl;
michael@0: 
michael@0:     // Step 3: a complex character starts a run that is handled as one unit.
michael@0:     if (CLASS_COMPLEX == cl) {
michael@0:       uint32_t runEnd = pos + 1;
michael@0:       for (; runEnd < aLength && CLASS_COMPLEX == GetClass(aChars[runEnd]);
michael@0:            ++runEnd) {
michael@0:         // Just locating the end of the complex run.
michael@0:       }
michael@0: 
michael@0:       NS_GetComplexLineBreaks(aChars + pos, runEnd - pos, aBreakBefore + pos);
michael@0: 
michael@0:       // We have to consider word-break value again for complex characters
michael@0:       if (!normalBreaking) {
michael@0:         // Respect word-break property
michael@0:         for (uint32_t i = pos; i < runEnd; ++i) {
michael@0:           aBreakBefore[i] = breakEverywhere;
michael@0:         }
michael@0:       }
michael@0: 
michael@0:       // restore breakability at chunk begin, which was always set to false
michael@0:       // by the complex line breaker
michael@0:       aBreakBefore[pos] = canBreak;
michael@0: 
michael@0:       // Resume right after the run (the increment below lands on runEnd).
michael@0:       // NOTE(review): context.AdvanceIndex() still runs only once for the
michael@0:       // whole run, so the context's index no longer tracks pos afterwards --
michael@0:       // confirm whether that is intended.
michael@0:       pos = runEnd - 1;
michael@0:     }
michael@0: 
michael@0:     ++pos;
michael@0:     context.AdvanceIndex();
michael@0:   }
michael@0: }
michael@0: 
michael@0: void
michael@0: nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
michael@0:                                          uint8_t aWordBreak,
michael@0:                                          uint8_t* aBreakBefore)
michael@0: {
michael@0:   uint32_t cur;
michael@0:   int8_t lastClass = CLASS_NONE;
michael@0:   ContextState state(aChars, aLength);
michael@0: 
michael@0:   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
michael@0:     char16_t ch = aChars[cur];
michael@0:     int8_t cl;
michael@0: 
michael@0:     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
michael@0:       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
michael@0:                               ch,
michael@0:                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
michael@0:                               state);
michael@0:     } else {
michael@0:       if (ch == U_EQUAL)
michael@0:         state.NotifySeenEqualsSign();
michael@0:       state.NotifyNonHyphenCharacter(ch);
michael@0:       cl = GetClass(ch);
michael@0:     }
michael@0: 
michael@0:     bool allowBreak = false;
michael@0:     if (cur > 0) {
michael@0:       if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
michael@0:         allowBreak = (state.UseConservativeBreaking()) ?
michael@0:           GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
michael@0:       } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
michael@0:         allowBreak = true;
michael@0:       }
michael@0:     }
michael@0:     aBreakBefore[cur] = allowBreak;
michael@0:     if (allowBreak)
michael@0:       state.NotifyBreakBefore();
michael@0:     lastClass = cl;
michael@0:   }
michael@0: }