The Tor Browser: diff intl/lwbrk/src/nsJISx4051LineBreaker.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/lwbrk/src/nsJISx4051LineBreaker.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,927 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +
    1.10 +
    1.11 +#include "nsJISx4051LineBreaker.h"
    1.12 +
    1.13 +#include "jisx4051class.h"
    1.14 +#include "nsComplexBreaker.h"
    1.15 +#include "nsTArray.h"
    1.16 +
    1.17 +/* 
    1.18 +
    1.19 +   Simplification of Pair Table in JIS X 4051
    1.20 +
    1.21 +   1. The Original Table - in 4.1.3
    1.22 +
    1.23 +   In JIS X 4051, the pair table is defined as below
    1.24 +
    1.25 +   Class of
    1.26 +   Leading    Class of Trailing Char Class
    1.27 +   Char        
    1.28 +
    1.29 +              1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
    1.30 +                                                 *  #  *  #
    1.31 +        1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
    1.32 +        2        X  X  X  X  X                                               X
    1.33 +        3        X  X  X  X  X                                               X
    1.34 +        4        X  X  X  X  X                                               X
    1.35 +        5        X  X  X  X  X                                               X
    1.36 +        6        X  X  X  X  X                                               X
    1.37 +        7        X  X  X  X  X  X                                            X
    1.38 +        8        X  X  X  X  X                                X              E
    1.39 +        9        X  X  X  X  X                                               X
    1.40 +       10        X  X  X  X  X                                               X
    1.41 +       11        X  X  X  X  X                                               X
    1.42 +       12        X  X  X  X  X                                               X
    1.43 +       13        X  X  X  X  X                    X                          X
    1.44 +       14        X  X  X  X  X                          X                    X
    1.45 +       15        X  X  X  X  X        X                       X        X     X
    1.46 +       16        X  X  X  X  X                                   X     X     X
    1.47 +       17        X  X  X  X  X                                               E
    1.48 +       18        X  X  X  X  X                                X  X     X     X
    1.49 +       19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
    1.50 +       20        X  X  X  X  X                                               E
    1.51 +
    1.52 +   * Same Char
    1.53 +   # Other Char
    1.54 +
    1.55 +   X Cannot Break
    1.56 +
    1.57 +   The classes mean:
    1.58 +      1: Open parenthesis
    1.59 +      2: Close parenthesis
    1.60 +      3: Prohibit a line break before
    1.61 +      4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
    1.62 +      5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
    1.63 +      6: Full stop
    1.64 +      7: Non-breakable between same characters
    1.65 +      8: Prefix (e.g., "$", "NO.")
    1.66 +      9: Postfix (e.g., "%")
    1.67 +     10: Ideographic space
    1.68 +     11: Hiragana
    1.69 +     12: Japanese characters (except class 11)
    1.70 +     13: Subscript
    1.71 +     14: Ruby
    1.72 +     15: Numeric
    1.73 +     16: Alphabet
    1.74 +     17: Space for Western language
    1.75 +     18: Western characters (except class 17)
    1.76 +     19: Split line note (Warichu) begin quote
    1.77 +     20: Split line note (Warichu) end quote
    1.78 +
    1.79 +   2. Simplified by removing the classes which we do not care about
    1.80 +
    1.81 +   However, since we do not care about class 13 (Subscript), 14 (Ruby),
    1.82 +   16 (Alphabet), 19 (split line note begin quote), and 20 (split line note
    1.83 +   end quote), we can simplify this pair table into the following
    1.84 +
    1.85 +   Class of
    1.86 +   Leading    Class of Trailing Char Class
    1.87 +   Char
    1.88 +
    1.89 +              1  2  3  4  5  6  7  8  9 10 11 12 15 17 18
    1.90 +
    1.91 +        1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
    1.92 +        2        X  X  X  X  X                           
    1.93 +        3        X  X  X  X  X                           
    1.94 +        4        X  X  X  X  X                           
    1.95 +        5        X  X  X  X  X                           
    1.96 +        6        X  X  X  X  X                           
    1.97 +        7        X  X  X  X  X  X                        
    1.98 +        8        X  X  X  X  X                    X      
    1.99 +        9        X  X  X  X  X                           
   1.100 +       10        X  X  X  X  X                           
   1.101 +       11        X  X  X  X  X                           
   1.102 +       12        X  X  X  X  X                           
   1.103 +       15        X  X  X  X  X        X           X     X
   1.104 +       17        X  X  X  X  X                           
   1.105 +       18        X  X  X  X  X                    X     X
   1.106 +
   1.107 +   3. Simplified by merging classes
   1.108 +
   1.109 +   After simplification 2, the pair table has some duplication:
   1.110 +   a. classes 2, 3, 4, 5, 6 are the same - we can merge them
   1.111 +   b. classes 10, 11, 12, 17 are the same - we can merge them
   1.112 +
   1.113 +
   1.114 +   Class of
   1.115 +   Leading    Class of Trailing Char Class
   1.116 +   Char
   1.117 +
   1.118 +              1 [a] 7  8  9 [b]15 18
   1.119 +
   1.120 +        1     X  X  X  X  X  X  X  X
   1.121 +      [a]        X                  
   1.122 +        7        X  X               
   1.123 +        8        X              X   
   1.124 +        9        X                  
   1.125 +      [b]        X                  
   1.126 +       15        X        X     X  X
   1.127 +       18        X              X  X
   1.128 +
   1.129 +
   1.130 +   4. We add COMPLEX characters and make them breakable with all other
   1.131 +      classes except after class 1 and before class [a]
   1.132 +
   1.133 +   Class of
   1.134 +   Leading    Class of Trailing Char Class
   1.135 +   Char
   1.136 +
   1.137 +              1 [a] 7  8  9 [b]15 18 COMPLEX
   1.138 +
   1.139 +        1     X  X  X  X  X  X  X  X  X
   1.140 +      [a]        X                     
   1.141 +        7        X  X                  
   1.142 +        8        X              X      
   1.143 +        9        X                     
   1.144 +      [b]        X                     
   1.145 +       15        X        X     X  X   
   1.146 +       18        X              X  X   
   1.147 +  COMPLEX        X                    T
   1.148 +
   1.149 +     T : need special handling
   1.150 +
   1.151 +
   1.152 +   5. However, we need two special classes for some punctuation/parentheses
   1.153 +      whose breaking rules are like those of character class (18); see bug 389056.
   1.154 +      We also need punctuation-like characters that behave the same as class 18
   1.155 +      but are not letters in any language (e.g., '_').
   1.156 +      [c]. Based on open parenthesis class (1), but it is not breakable after
   1.157 +           character class (18) or numeric class (15).
   1.158 +      [d]. Based on close parenthesis (or punctuation) class (2), but it is not
   1.159 +           breakable before character class (18) or numeric class (15).
   1.160 +
   1.161 +   Class of
   1.162 +   Leading    Class of Trailing Char Class
   1.163 +   Char
   1.164 +
   1.165 +              1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d]
   1.166 +
   1.167 +        1     X  X  X  X  X  X  X  X  X       X    X
   1.168 +      [a]        X                            X    X
   1.169 +        7        X  X                               
   1.170 +        8        X              X                   
   1.171 +        9        X                                  
   1.172 +      [b]        X                                 X
   1.173 +       15        X        X     X  X          X    X
   1.174 +       18        X              X  X          X    X
   1.175 +  COMPLEX        X                    T             
   1.176 +      [c]     X  X  X  X  X  X  X  X  X       X    X
   1.177 +      [d]        X              X  X               X
   1.178 +
   1.179 +
   1.180 +   6. Unicode also has "NON-BREAK" characters. Lines should not be broken
   1.181 +      around them. JIS X 4051 has no such class; therefore, we create [e].
   1.182 +
   1.183 +   Class of
   1.184 +   Leading    Class of Trailing Char Class
   1.185 +   Char
   1.186 +
   1.187 +              1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
   1.188 +
   1.189 +        1     X  X  X  X  X  X  X  X  X       X    X   X
   1.190 +      [a]        X                                 X   X
   1.191 +        7        X  X                                  X
   1.192 +        8        X              X                      X
   1.193 +        9        X                                     X
   1.194 +      [b]        X                                 X   X
   1.195 +       15        X        X     X  X          X    X   X
   1.196 +       18        X              X  X          X    X   X
   1.197 +  COMPLEX        X                    T                X
   1.198 +      [c]     X  X  X  X  X  X  X  X  X       X    X   X
   1.199 +      [d]        X              X  X               X   X
   1.200 +      [e]     X  X  X  X  X  X  X  X  X       X    X   X
   1.201 +
   1.202 +
   1.203 +   7. Now we use one bit to encode whether it is breakable, and use 2 bytes
   1.204 +      for one row; the bit table then looks like:
   1.205 +
   1.206 +                 18    <-   1
   1.207 +
   1.208 +       1  0000 1111 1111 1111  = 0x0FFF
   1.209 +      [a] 0000 1100 0000 0010  = 0x0C02
   1.210 +       7  0000 1000 0000 0110  = 0x0806
   1.211 +       8  0000 1000 0100 0010  = 0x0842
   1.212 +       9  0000 1000 0000 0010  = 0x0802
   1.213 +      [b] 0000 1100 0000 0010  = 0x0C02
   1.214 +      15  0000 1110 1101 0010  = 0x0ED2
   1.215 +      18  0000 1110 1100 0010  = 0x0EC2
   1.216 + COMPLEX  0000 1001 0000 0010  = 0x0902
   1.217 +      [c] 0000 1111 1111 1111  = 0x0FFF
   1.218 +      [d] 0000 1100 1100 0010  = 0x0CC2
   1.219 +      [e] 0000 1111 1111 1111  = 0x0FFF
   1.220 +*/
   1.221 +
   1.222 +#define MAX_CLASSES 12  // number of merged line-break classes; also the row count of the pair tables
   1.223 +// gPair: row index = class of the leading char, bit n = class n of the trailing char; a set bit means "cannot break" (see GetPair).
   1.224 +static const uint16_t gPair[MAX_CLASSES] = {
   1.225 +  0x0FFF,  // 0: CLASS_OPEN
   1.226 +  0x0C02,  // 1: CLASS_CLOSE ([a])
   1.227 +  0x0806,  // 2: CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS
   1.228 +  0x0842,  // 3: CLASS_PREFIX
   1.229 +  0x0802,  // 4: CLASS_POSTFFIX
   1.230 +  0x0C02,  // 5: CLASS_BREAKABLE ([b])
   1.231 +  0x0ED2,  // 6: CLASS_NUMERIC
   1.232 +  0x0EC2,  // 7: CLASS_CHARACTER
   1.233 +  0x0902,  // 8: CLASS_COMPLEX
   1.234 +  0x0FFF,  // 9: CLASS_OPEN_LIKE_CHARACTER ([c])
   1.235 +  0x0CC2,  // A: CLASS_CLOSE_LIKE_CHARACTER ([d])
   1.236 +  0x0FFF   // B: CLASS_NON_BREAKABLE ([e])
   1.237 +};
   1.238 +
   1.239 +
   1.240 +/*
   1.241 +
   1.242 +   8. If the character is not far enough from the word start, the word end,
   1.243 +      or another break point, we should not break in non-CJK languages.
   1.244 +      I.e., don't break around 15, 18, [c] and [d], but don't change
   1.245 +      that if they are related to [b].
   1.246 +
   1.247 +   Class of
   1.248 +   Leading    Class of Trailing Char Class
   1.249 +   Char
   1.250 +
   1.251 +              1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
   1.252 +
   1.253 +        1     X  X  X  X  X  X  X  X  X       X    X   X
   1.254 +      [a]        X              X  X          X    X   X
   1.255 +        7        X  X           X  X          X    X   X
   1.256 +        8        X              X  X          X    X   X
   1.257 +        9        X              X  X          X    X   X
   1.258 +      [b]        X                                 X   X
   1.259 +       15     X  X  X  X  X     X  X  X       X    X   X
   1.260 +       18     X  X  X  X  X     X  X  X       X    X   X
   1.261 +  COMPLEX        X              X  X  T       X    X   X
   1.262 +      [c]     X  X  X  X  X  X  X  X  X       X    X   X
   1.263 +      [d]     X  X  X  X  X     X  X  X       X    X   X
   1.264 +      [e]     X  X  X  X  X  X  X  X  X       X    X   X
   1.265 +
   1.266 +                 18    <-   1
   1.267 +
   1.268 +       1  0000 1111 1111 1111  = 0x0FFF
   1.269 +      [a] 0000 1110 1100 0010  = 0x0EC2
   1.270 +       7  0000 1110 1100 0110  = 0x0EC6
   1.271 +       8  0000 1110 1100 0010  = 0x0EC2
   1.272 +       9  0000 1110 1100 0010  = 0x0EC2
   1.273 +      [b] 0000 1100 0000 0010  = 0x0C02
   1.274 +      15  0000 1111 1101 1111  = 0x0FDF
   1.275 +      18  0000 1111 1101 1111  = 0x0FDF
   1.276 + COMPLEX  0000 1111 1100 0010  = 0x0FC2
   1.277 +      [c] 0000 1111 1111 1111  = 0x0FFF
   1.278 +      [d] 0000 1111 1101 1111  = 0x0FDF
   1.279 +      [e] 0000 1111 1111 1111  = 0x0FFF
   1.280 +*/
   1.281 +
   1.282 +static const uint16_t gPairConservative[MAX_CLASSES] = {  // same layout as gPair (row = leading class, bit n = trailing class n, set bit = cannot break); read by GetPairConservative
   1.283 +  0x0FFF,  // 0: CLASS_OPEN
   1.284 +  0x0EC2,  // 1: CLASS_CLOSE ([a])
   1.285 +  0x0EC6,  // 2: CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS
   1.286 +  0x0EC2,  // 3: CLASS_PREFIX
   1.287 +  0x0EC2,  // 4: CLASS_POSTFFIX
   1.288 +  0x0C02,  // 5: CLASS_BREAKABLE ([b]) - identical to the gPair row
   1.289 +  0x0FDF,  // 6: CLASS_NUMERIC
   1.290 +  0x0FDF,  // 7: CLASS_CHARACTER
   1.291 +  0x0FC2,  // 8: CLASS_COMPLEX
   1.292 +  0x0FFF,  // 9: CLASS_OPEN_LIKE_CHARACTER ([c])
   1.293 +  0x0FDF,  // A: CLASS_CLOSE_LIKE_CHARACTER ([d])
   1.294 +  0x0FFF   // B: CLASS_NON_BREAKABLE ([e])
   1.295 +};
   1.296 +
   1.297 +
   1.298 +/*
   1.299 +
   1.300 +   9. Now we map the class to number
   1.301 +
   1.302 +      0: 1 
   1.303 +      1: [a]- 2, 3, 4, 5, 6
   1.304 +      2: 7
   1.305 +      3: 8
   1.306 +      4: 9
   1.307 +      5: [b]- 10, 11, 12, 17
   1.308 +      6: 15
   1.309 +      7: 18
   1.310 +      8: COMPLEX
   1.311 +      9: [c]
   1.312 +      A: [d]
   1.313 +      B: [e]
   1.314 +
   1.315 +    and they mean:
   1.316 +      0: Open parenthesis
   1.317 +      1: Punctuation that prohibits break before
   1.318 +      2: Non-breakable between same classes
   1.319 +      3: Prefix
   1.320 +      4: Postfix
   1.321 +      5: Breakable character (Spaces and Most Japanese characters)
   1.322 +      6: Numeric
   1.323 +      7: Characters
   1.324 +      8: Need special handling characters (E.g., Thai)
   1.325 +      9: Open parentheses like Character (See bug 389056)
   1.326 +      A: Close parenthesis (or punctuation) like Character (See bug 389056)
   1.327 +      B: Non breakable (See bug 390920)
   1.328 +
   1.329 +*/
   1.330 +
   1.331 +#define CLASS_NONE                             INT8_MAX  // not a valid table index: INT8_MAX >= MAX_CLASSES
   1.332 +// The values below are used as the row index (leading char) and bit position (trailing char) in gPair / gPairConservative.
   1.333 +#define CLASS_OPEN                             0x00  // open parenthesis
   1.334 +#define CLASS_CLOSE                            0x01  // punctuation that prohibits a break before it ([a])
   1.335 +#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02  // non-breakable between same classes
   1.336 +#define CLASS_PREFIX                           0x03  // prefix
   1.337 +#define CLASS_POSTFFIX                         0x04  // postfix (identifier spelling is as in the original)
   1.338 +#define CLASS_BREAKABLE                        0x05  // breakable character: spaces and most Japanese characters ([b])
   1.339 +#define CLASS_NUMERIC                          0x06  // numeric
   1.340 +#define CLASS_CHARACTER                        0x07  // ordinary characters
   1.341 +#define CLASS_COMPLEX                          0x08  // characters needing special handling (e.g., Thai)
   1.342 +#define CLASS_OPEN_LIKE_CHARACTER              0x09  // open-parenthesis-like character ([c], bug 389056)
   1.343 +#define CLASS_CLOSE_LIKE_CHARACTER             0x0A  // close-parenthesis/punctuation-like character ([d], bug 389056)
   1.344 +#define CLASS_NON_BREAKABLE                    0x0B  // non-breakable ([e], bug 390920)
   1.345 +
   1.346 +#define U_NULL      char16_t(0x0000)  // used as the "no character" value (e.g. ContextState's initial previous-non-hyphen char)
   1.347 +#define U_SLASH     char16_t('/')
   1.348 +#define U_SPACE     char16_t(' ')
   1.349 +#define U_HYPHEN    char16_t('-')
   1.350 +#define U_EQUAL     char16_t('=')
   1.351 +#define U_PERCENT   char16_t('%')
   1.352 +#define U_AMPERSAND char16_t('&')
   1.353 +#define U_SEMICOLON char16_t(';')
   1.354 +#define U_BACKSLASH char16_t('\\')
   1.355 +#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)  // LEFT SINGLE QUOTATION MARK
   1.356 +#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)  // LEFT DOUBLE QUOTATION MARK
   1.357 +#define U_OPEN_GUILLEMET    char16_t(0x00AB)  // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
   1.358 +// NEED_CONTEXTUAL_ANALYSIS(c): true for exactly the characters that ContextualAnalysis() has a branch for. Evaluates c more than once.
   1.359 +#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \
   1.360 +                                     (c) == U_SLASH || \
   1.361 +                                     (c) == U_PERCENT || \
   1.362 +                                     (c) == U_AMPERSAND || \
   1.363 +                                     (c) == U_SEMICOLON || \
   1.364 +                                     (c) == U_BACKSLASH || \
   1.365 +                                     (c) == U_OPEN_SINGLE_QUOTE || \
   1.366 +                                     (c) == U_OPEN_DOUBLE_QUOTE || \
   1.367 +                                     (c) == U_OPEN_GUILLEMET)
   1.368 +
   1.369 +#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)  // '0'..'9'; evaluates u twice
   1.370 +
   1.371 +static inline int  // Returns the 4-bit class code stored for entry l of a packed class table.
   1.372 +GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)  // t: table packing eight 4-bit entries per uint32_t; l: entry index
   1.373 +{
   1.374 +  return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);  // word l/8, then nibble l%8 counted from the least significant end
   1.375 +}
   1.376 +
   1.377 +static inline int  // Nonzero when u lies in U+FF66..U+FF70 (inclusive).
   1.378 +IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)  // GetClass maps these halfwidth characters to CLASS_CLOSE ("jis x4051 class 3")
   1.379 +{
   1.380 +  return ((0xff66 <= (u)) && ((u) <= 0xff70));
   1.381 +}
   1.382 +
   1.383 +static inline int  // Nonzero when u falls in one of four code-unit ranges treated as CJK by this file.
   1.384 +IS_CJK_CHAR(char16_t u)  // coarse range test only; it does not consult the per-character class tables
   1.385 +{
   1.386 +  return ((0x1100 <= (u) && (u) <= 0x11ff) ||  // Hangul Jamo block
   1.387 +          (0x2e80 <= (u) && (u) <= 0xd7ff) ||  // CJK Radicals Supplement through Hangul Syllables
   1.388 +          (0xf900 <= (u) && (u) <= 0xfaff) ||  // CJK Compatibility Ideographs block
   1.389 +          (0xff00 <= (u) && (u) <= 0xffef) );  // Halfwidth and Fullwidth Forms block
   1.390 +}
   1.391 +
   1.392 +static inline bool  // True only for the two no-break space characters listed below.
   1.393 +IS_NONBREAKABLE_SPACE(char16_t u)  // used by ContextState to detect and locate no-break spaces in the text
   1.394 +{
   1.395 +  return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
   1.396 +}
   1.397 +
   1.398 +static inline bool  // True for the five hyphen/dash characters that get hyphen treatment in ContextualAnalysis().
   1.399 +IS_HYPHEN(char16_t u)  // also the first test in NEED_CONTEXTUAL_ANALYSIS
   1.400 +{
   1.401 +  return (u == U_HYPHEN ||  // ASCII '-'
   1.402 +          u == 0x058A || // ARMENIAN HYPHEN
   1.403 +          u == 0x2010 || // HYPHEN
   1.404 +          u == 0x2012 || // FIGURE DASH
   1.405 +          u == 0x2013);  // EN DASH
   1.406 +}
   1.407 +
   1.408 +static int8_t  // Maps one UTF-16 code unit to a CLASS_* line-break class, without looking at neighbouring characters.
   1.409 +GetClass(char16_t u)  // u: code unit to classify; every branch below assigns c, so the result is always set
   1.410 +{
   1.411 +   uint16_t h = u & 0xFF00;  // high byte, left in place: identifies the 256-character block
   1.412 +   uint16_t l = u & 0x00ff;  // low byte: offset within that block
   1.413 +   int8_t c;
   1.414 +
   1.415 +   // Handle 3 range table first
   1.416 +   if (0x0000 == h) {
   1.417 +     c = GETCLASSFROMTABLE(gLBClass00, l);
   1.418 +   } else if (0x1700 == h) {
   1.419 +     c = GETCLASSFROMTABLE(gLBClass17, l);
   1.420 +   } else if (NS_NeedsPlatformNativeHandling(u)) {
   1.421 +     c = CLASS_COMPLEX;
   1.422 +   } else if (0x0E00 == h) {  // reached only when the platform-native test above was false
   1.423 +     c = GETCLASSFROMTABLE(gLBClass0E, l);
   1.424 +   } else if (0x2000 == h) {
   1.425 +     c = GETCLASSFROMTABLE(gLBClass20, l);
   1.426 +   } else if (0x2100 == h) {
   1.427 +     c = GETCLASSFROMTABLE(gLBClass21, l);
   1.428 +   } else if (0x3000 == h) {
   1.429 +     c = GETCLASSFROMTABLE(gLBClass30, l);
   1.430 +   } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi
   1.431 +              ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul
   1.432 +              ((0xf900 <= h) && (h <= 0xfaff))) {
   1.433 +     c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility
   1.434 +   } else if (0xff00 == h) {  // U+FF00..U+FFFF, split by low byte below
   1.435 +     if (l < 0x0060) { // Fullwidth ASCII variant
   1.436 +       c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));  // U+FF00+l takes the class of U+0020+l
   1.437 +     } else if (l < 0x00a0) {
   1.438 +       switch (l) {  // each listed halfwidth mark takes the class of the U+30xx character passed to GetClass
   1.439 +         case 0x61: c = GetClass(0x3002); break;
   1.440 +         case 0x62: c = GetClass(0x300c); break;
   1.441 +         case 0x63: c = GetClass(0x300d); break;
   1.442 +         case 0x64: c = GetClass(0x3001); break;
   1.443 +         case 0x65: c = GetClass(0x30fb); break;
   1.444 +         case 0x9e: c = GetClass(0x309b); break;
   1.445 +         case 0x9f: c = GetClass(0x309c); break;
   1.446 +         default:
   1.447 +           if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
   1.448 +              c = CLASS_CLOSE; // jis x4051 class 3
   1.449 +           else
   1.450 +              c = CLASS_BREAKABLE; // jis x4051 class 11
   1.451 +           break;
   1.452 +       }
   1.453 +     // Halfwidth Katakana variants (describes the branch that ends on the next line, l in 0x60..0x9f)
   1.454 +     } else if (l < 0x00e0) {
   1.455 +       c = CLASS_CHARACTER; // Halfwidth Hangul variants
   1.456 +     } else if (l < 0x00f0) {
   1.457 +       static char16_t NarrowFFEx[16] = {  // substitute code point for each of U+FFE0..U+FFEF, indexed by l - 0xe0; two entries are 0x0000
   1.458 +         0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
   1.459 +         0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
   1.460 +       };
   1.461 +       c = GetClass(NarrowFFEx[l - 0x00e0]);  // classify the substitute; a 0x0000 entry is looked up in gLBClass00
   1.462 +     } else {
   1.463 +       c = CLASS_CHARACTER;
   1.464 +     }
   1.465 +   } else if (0x3100 == h) { 
   1.466 +     if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
   1.467 +                      // XXX: This is per UAX #14, but UAX #14 may change
   1.468 +                      // the line breaking rules about Kanbun and Bopomofo.
   1.469 +       c = CLASS_BREAKABLE;
   1.470 +     } else if (l >= 0xf0) { // Katakana small letters for Ainu
   1.471 +       c = CLASS_CLOSE;
   1.472 +     } else { // unassigned
   1.473 +       c = CLASS_CHARACTER;
   1.474 +     }
   1.475 +   } else if (0x0300 == h) {
   1.476 +     if (0x4F == l || (0x5C <= l && l <= 0x62))  // U+034F and U+035C..U+0362
   1.477 +       c = CLASS_NON_BREAKABLE;
   1.478 +     else
   1.479 +       c = CLASS_CHARACTER;
   1.480 +   } else if (0x0500 == h) {
   1.481 +     // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
   1.482 +     if (l == 0x8A)
   1.483 +       c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));  // same class as ASCII '-'
   1.484 +     else
   1.485 +       c = CLASS_CHARACTER;
   1.486 +   } else if (0x0F00 == h) {
   1.487 +     if (0x08 == l || 0x0C == l || 0x12 == l)  // U+0F08, U+0F0C, U+0F12
   1.488 +       c = CLASS_NON_BREAKABLE;
   1.489 +     else
   1.490 +       c = CLASS_CHARACTER;
   1.491 +   } else if (0x1800 == h) {
   1.492 +     if (0x0E == l)  // U+180E
   1.493 +       c = CLASS_NON_BREAKABLE;
   1.494 +     else
   1.495 +       c = CLASS_CHARACTER;
   1.496 +   } else if (0x1600 == h) {
   1.497 +     if (0x80 == l) { // U+1680 OGHAM SPACE MARK
   1.498 +       c = CLASS_BREAKABLE;
   1.499 +     } else {
   1.500 +       c = CLASS_CHARACTER;
   1.501 +     }
   1.502 +   } else if (u == 0xfeff) {  // U+FEFF ZERO WIDTH NO-BREAK SPACE
   1.503 +     c = CLASS_NON_BREAKABLE;
   1.504 +   } else {
   1.505 +     c = CLASS_CHARACTER; // others
   1.506 +   }
   1.507 +   return c;
   1.508 +}
   1.509 +
   1.510 +static bool  // Returns true when bit c2 of gPair[c1] is clear, i.e. the pair table does not mark (c1, c2) as "cannot break".
   1.511 +GetPair(int8_t c1, int8_t c2)  // c1: class of the leading char (row); c2: class of the trailing char (bit position)
   1.512 +{
   1.513 +  NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");  // NOTE(review): only the upper bound is asserted; a negative int8_t would index out of range - confirm callers never pass one
   1.514 +  NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
   1.515 +
   1.516 +  return (0 == ((gPair[c1] >> c2) & 0x0001));
   1.517 +}
   1.518 +
   1.519 +static bool  // Same lookup as GetPair but against gPairConservative: true when bit c2 of gPairConservative[c1] is clear.
   1.520 +GetPairConservative(int8_t c1, int8_t c2)  // c1: class of the leading char (row); c2: class of the trailing char (bit position)
   1.521 +{
   1.522 +  NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");  // NOTE(review): only the upper bound is asserted; a negative int8_t would index out of range - confirm callers never pass one
   1.523 +  NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
   1.524 +
   1.525 +  return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
   1.526 +}
   1.527 +
   1.528 +nsJISx4051LineBreaker::nsJISx4051LineBreaker()  // constructor: empty body, performs no initialization of its own
   1.529 +{
   1.530 +}
   1.531 +
   1.532 +nsJISx4051LineBreaker::~nsJISx4051LineBreaker()  // destructor: empty body
   1.533 +{
   1.534 +}
   1.535 +
   1.536 +NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)  // XPCOM nsISupports boilerplate, naming nsILineBreaker as the implemented interface
   1.537 +
   1.538 +class ContextState {  // Scanning state over one text buffer: the text view, a cursor, the last break index and several "seen" flags.
   1.539 +public:
   1.540 +  ContextState(const char16_t* aText, uint32_t aLength) {  // UTF-16 text; only the pointer is stored, the text is not copied
   1.541 +    mUniText = aText;
   1.542 +    mText = nullptr;
   1.543 +    mLength = aLength;
   1.544 +    Init();
   1.545 +  }
   1.546 +
   1.547 +  ContextState(const uint8_t* aText, uint32_t aLength) {  // 8-bit text; GetCharAt() widens each byte to char16_t
   1.548 +    mUniText = nullptr;
   1.549 +    mText = aText;
   1.550 +    mLength = aLength;
   1.551 +    Init();
   1.552 +  }
   1.553 +
   1.554 +  uint32_t Length() { return mLength; }  // number of characters in the text
   1.555 +  uint32_t Index() { return mIndex; }    // current cursor position
   1.556 +
   1.557 +  char16_t GetCharAt(uint32_t aIndex) {  // character at aIndex from whichever buffer is set; range is checked by assertion only
   1.558 +    NS_ASSERTION(aIndex < mLength, "Out of range!");
   1.559 +    return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
   1.560 +  }
   1.561 +
   1.562 +  void AdvanceIndex() {  // moves the cursor forward by one; no bounds check here
   1.563 +    ++mIndex;
   1.564 +  }
   1.565 +
   1.566 +  void NotifyBreakBefore() { mLastBreakIndex = mIndex; }  // records the cursor position as the most recent break
   1.567 +
   1.568 +// A word of western language should not be broken. But even if the word has
   1.569 +// only ASCII characters, non-natural context words should be broken, e.g.,
   1.570 +// URL and file path. For protecting the natural words, we should use
   1.571 +// conservative breaking rules at following conditions:
   1.572 +//   1. at near the start of word
   1.573 +//   2. at near the end of word
   1.574 +//   3. at near the latest broken point
   1.575 +// CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
   1.576 +#define CONSERVATIVE_BREAK_RANGE 6
   1.577 +
   1.578 +  bool UseConservativeBreaking(uint32_t aOffset = 0) {  // true when mIndex + aOffset is "near" the text start, text end, last break or a no-break space
   1.579 +    if (mHasCJKChar)  // never conservative once the text contains a CJK character
   1.580 +      return false;
   1.581 +    uint32_t index = mIndex + aOffset;
   1.582 +    bool result = (index < CONSERVATIVE_BREAK_RANGE ||
   1.583 +                     mLength - index < CONSERVATIVE_BREAK_RANGE ||
   1.584 +                     index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
   1.585 +    if (result || !mHasNonbreakableSpace)
   1.586 +      return result;
   1.587 +
   1.588 +    // This text has no-breakable space, we need to check whether the index
   1.589 +    // is near it.
   1.590 +
   1.591 +    // index >= CONSERVATIVE_BREAK_RANGE here (smaller values returned above), so i - 1 cannot underflow.
   1.592 +    for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {  // looks at the CONSERVATIVE_BREAK_RANGE characters before index
   1.593 +      if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
   1.594 +        return true;
   1.595 +    }
   1.596 +    // mLength - index >= CONSERVATIVE_BREAK_RANGE here, so index + CONSERVATIVE_BREAK_RANGE - 1 is a valid index.
   1.597 +    for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {  // looks at index + 1 .. index + CONSERVATIVE_BREAK_RANGE - 1
   1.598 +      if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
   1.599 +        return true;
   1.600 +    }
   1.601 +    return false;
   1.602 +  }
   1.603 +
   1.604 +  bool HasPreviousEqualsSign() const {  // flag is set only through NotifySeenEqualsSign() and never cleared after Init()
   1.605 +    return mHasPreviousEqualsSign;
   1.606 +  }
   1.607 +  void NotifySeenEqualsSign() {
   1.608 +    mHasPreviousEqualsSign = true;
   1.609 +  }
   1.610 +
   1.611 +  bool HasPreviousSlash() const {  // flag is set only through NotifySeenSlash() and never cleared after Init()
   1.612 +    return mHasPreviousSlash;
   1.613 +  }
   1.614 +  void NotifySeenSlash() {
   1.615 +    mHasPreviousSlash = true;
   1.616 +  }
   1.617 +
   1.618 +  bool HasPreviousBackslash() const {  // flag is set only through NotifySeenBackslash() and never cleared after Init()
   1.619 +    return mHasPreviousBackslash;
   1.620 +  }
   1.621 +  void NotifySeenBackslash() {
   1.622 +    mHasPreviousBackslash = true;
   1.623 +  }
   1.624 +
   1.625 +  char16_t GetPreviousNonHyphenCharacter() const {  // U_NULL until NotifyNonHyphenCharacter() has been called
   1.626 +    return mPreviousNonHyphenCharacter;
   1.627 +  }
   1.628 +  void NotifyNonHyphenCharacter(char16_t ch) {  // stores ch as given; the caller decides what counts as non-hyphen
   1.629 +    mPreviousNonHyphenCharacter = ch;
   1.630 +  }
   1.631 +
   1.632 +private:
   1.633 +  void Init() {  // resets cursor and flags, then scans the whole text once to set mHasNonbreakableSpace / mHasCJKChar
   1.634 +    mIndex = 0;
   1.635 +    mLastBreakIndex = 0;
   1.636 +    mPreviousNonHyphenCharacter = U_NULL;
   1.637 +    mHasCJKChar = 0;
   1.638 +    mHasNonbreakableSpace = 0;
   1.639 +    mHasPreviousEqualsSign = false;
   1.640 +    mHasPreviousSlash = false;
   1.641 +    mHasPreviousBackslash = false;
   1.642 +
   1.643 +    for (uint32_t i = 0; i < mLength; ++i) {  // the loop always runs to the end, even after both flags are set
   1.644 +      char16_t u = GetCharAt(i);
   1.645 +      if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
   1.646 +        mHasNonbreakableSpace = 1;
   1.647 +      else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))  // CJK test is skipped for 8-bit text (mUniText is null)
   1.648 +        mHasCJKChar = 1;
   1.649 +    }
   1.650 +  }
   1.651 +
   1.652 +  const char16_t* mUniText;  // UTF-16 buffer, or nullptr when constructed from 8-bit text
   1.653 +  const uint8_t* mText;      // 8-bit buffer, or nullptr when constructed from UTF-16 text
   1.654 +
   1.655 +  uint32_t mIndex;          // cursor; advanced by AdvanceIndex()
   1.656 +  uint32_t mLength;         // length of text
   1.657 +  uint32_t mLastBreakIndex; // value of mIndex at the last NotifyBreakBefore(); 0 initially
   1.658 +  char16_t mPreviousNonHyphenCharacter; // The last character we have seen
   1.659 +                                         // which is not U_HYPHEN
   1.660 +  bool mHasCJKChar; // if the text has CJK character, this is true.
   1.661 +  bool mHasNonbreakableSpace; // if the text has no-breakable space,
   1.662 +                                     // this is true.
   1.663 +  bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
   1.664 +  bool mHasPreviousSlash;      // True if we have seen a U_SLASH
   1.665 +  bool mHasPreviousBackslash;  // True if we have seen a U_BACKSLASH
   1.666 +};
   1.667 +
   1.668 +static int8_t  // Picks the CLASS_* value for cur using its neighbours and aState; falls back to GetClass(cur) when no special rule applies.
   1.669 +ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,  // prev/next: neighbouring characters (a zero next disables the hyphen rule below)
   1.670 +                   ContextState &aState)  // aState: read for position/flags; its previous-non-hyphen char and slash flags may be updated
   1.671 +{
   1.672 +  // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
   1.673 +  // NOTE(review): ContextState in this file has no UseJISX4051; the code below gates on UseConservativeBreaking() instead.
   1.674 +  if (IS_HYPHEN(cur)) {
   1.675 +    // If next character is hyphen, we don't need to break between them.
   1.676 +    if (IS_HYPHEN(next))
   1.677 +      return CLASS_CHARACTER;
   1.678 +    // If prev and next characters are numeric, it may be in Math context.
   1.679 +    // So, we should not break here.
   1.680 +    bool prevIsNum = IS_ASCII_DIGIT(prev);
   1.681 +    bool nextIsNum = IS_ASCII_DIGIT(next);
   1.682 +    if (prevIsNum && nextIsNum)
   1.683 +      return CLASS_NUMERIC;
   1.684 +    // If one side is numeric and the other is a character, or if both sides are
   1.685 +    // characters, the hyphen should be breakable.
   1.686 +    if (!aState.UseConservativeBreaking(1)) {  // offset 1: tests the position just after the hyphen
   1.687 +      char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();  // may differ from prev when cur follows a run of hyphens
   1.688 +      if (prevOfHyphen && next) {
   1.689 +        int8_t prevClass = GetClass(prevOfHyphen);
   1.690 +        int8_t nextClass = GetClass(next);
   1.691 +        bool prevIsNumOrCharOrClose =
   1.692 +          prevIsNum ||
   1.693 +          (prevClass == CLASS_CHARACTER &&
   1.694 +            !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
   1.695 +          prevClass == CLASS_CLOSE ||
   1.696 +          prevClass == CLASS_CLOSE_LIKE_CHARACTER;
   1.697 +        bool nextIsNumOrCharOrOpen =
   1.698 +          nextIsNum ||
   1.699 +          (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
   1.700 +          nextClass == CLASS_OPEN ||
   1.701 +          nextClass == CLASS_OPEN_LIKE_CHARACTER ||
   1.702 +          next == U_OPEN_SINGLE_QUOTE ||
   1.703 +          next == U_OPEN_DOUBLE_QUOTE ||
   1.704 +          next == U_OPEN_GUILLEMET;
   1.705 +        if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
   1.706 +          return CLASS_CLOSE;  // hyphen classed as close punctuation
   1.707 +        }
   1.708 +      }
   1.709 +    }
   1.710 +  } else {
   1.711 +    aState.NotifyNonHyphenCharacter(cur);  // recorded for every non-hyphen cur, before the per-character rules
   1.712 +    if (cur == U_SLASH || cur == U_BACKSLASH) {
   1.713 +      // If this is immediately after same char, we should not break here.
   1.714 +      if (prev == cur)
   1.715 +        return CLASS_CHARACTER;  // returns before the "seen" flag for this character is set
   1.716 +      // If this text has two or more (BACK)SLASHs, this may be file path or URL.
   1.717 +      // Make sure to compute shouldReturn before we notify on this slash.
   1.718 +      bool shouldReturn = !aState.UseConservativeBreaking() &&
   1.719 +        (cur == U_SLASH ?
   1.720 +         aState.HasPreviousSlash() : aState.HasPreviousBackslash());
   1.721 +
   1.722 +      if (cur == U_SLASH) {
   1.723 +        aState.NotifySeenSlash();
   1.724 +      } else {
   1.725 +        aState.NotifySeenBackslash();
   1.726 +      }
   1.727 +
   1.728 +      if (shouldReturn)
   1.729 +        return CLASS_OPEN;
   1.730 +    } else if (cur == U_PERCENT) {
   1.731 +      // If this is a part of the param of URL, we should break before.
   1.732 +      if (!aState.UseConservativeBreaking()) {
   1.733 +        if (aState.Index() >= 3 &&  // another '%' exactly three characters earlier...
   1.734 +            aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
   1.735 +          return CLASS_OPEN;
   1.736 +        if (aState.Index() + 3 < aState.Length() &&  // ...or exactly three characters later (bounds-checked)
   1.737 +            aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
   1.738 +          return CLASS_OPEN;
   1.739 +      }
   1.740 +    } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
   1.741 +      // If this may be a separator of params of URL, we should break after.
   1.742 +      if (!aState.UseConservativeBreaking(1) &&
   1.743 +          aState.HasPreviousEqualsSign())
   1.744 +        return CLASS_CLOSE;
   1.745 +    } else if (cur == U_OPEN_SINGLE_QUOTE ||
   1.746 +               cur == U_OPEN_DOUBLE_QUOTE ||
   1.747 +               cur == U_OPEN_GUILLEMET) {
   1.748 +      // for CJK usage, we treat these as openers to allow a break before them,
   1.749 +      // but otherwise treat them as normal characters because quote mark usage
   1.750 +      // in various Western languages varies too much; see bug #450088 discussion.
   1.751 +      if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
   1.752 +        return CLASS_OPEN;
   1.753 +    } else {
   1.754 +      NS_ERROR("Forgot to handle the current character!");  // reached when cur does not satisfy NEED_CONTEXTUAL_ANALYSIS
   1.755 +    }
   1.756 +  }
   1.757 +  return GetClass(cur);  // default: context-free class
   1.758 +}
   1.759 +
   1.760 +
   1.761 +int32_t
   1.762 +nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
   1.763 +                                uint32_t aPos, int8_t aDirection)
   1.764 +{
   1.765 +  bool    textNeedsJISx4051 = false;
   1.766 +  int32_t begin, end;
   1.767 +
   1.768 +  for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
   1.769 +    if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
   1.770 +      textNeedsJISx4051 = true;
   1.771 +    }
   1.772 +  }
   1.773 +  for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
   1.774 +    if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
   1.775 +      textNeedsJISx4051 = true;
   1.776 +    }
   1.777 +  }
   1.778 +
   1.779 +  int32_t ret;
   1.780 +  nsAutoTArray<uint8_t, 2000> breakState;
   1.781 +  if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
   1.782 +    // No complex text character, do not try to do complex line break.
   1.783 +    // (This is required for serializers. See Bug #344816.)
   1.784 +    // Also fall back to this when out of memory.
   1.785 +    if (aDirection < 0) {
   1.786 +      ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
   1.787 +    } else {
   1.788 +      ret = end;
   1.789 +    }
   1.790 +  } else {
   1.791 +    GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
   1.792 +                      breakState.Elements());
   1.793 +
   1.794 +    ret = aPos;
   1.795 +    do {
   1.796 +      ret += aDirection;
   1.797 +    } while (begin < ret && ret < end && !breakState[ret - begin]);
   1.798 +  }
   1.799 +
   1.800 +  return ret;
   1.801 +}
   1.802 +
   1.803 +int32_t
   1.804 +nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
   1.805 +                            uint32_t aPos) 
   1.806 +{
   1.807 +  NS_ASSERTION(aText, "aText shouldn't be null");
   1.808 +  NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
   1.809 +
   1.810 +  int32_t nextPos = WordMove(aText, aLen, aPos, 1);
   1.811 +  return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
   1.812 +}
   1.813 +
   1.814 +int32_t
   1.815 +nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
   1.816 +                            uint32_t aPos) 
   1.817 +{
   1.818 +  NS_ASSERTION(aText, "aText shouldn't be null");
   1.819 +  NS_ASSERTION(aLen >= aPos && aPos > 0,
   1.820 +               "Bad position passed to nsJISx4051LineBreaker::Prev");
   1.821 +
   1.822 +  int32_t prevPos = WordMove(aText, aLen, aPos, -1);
   1.823 +  return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
   1.824 +}
   1.825 +
   1.826 +void
   1.827 +nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
   1.828 +                                         uint8_t aWordBreak,
   1.829 +                                         uint8_t* aBreakBefore)
   1.830 +{
   1.831 +  uint32_t cur;
   1.832 +  int8_t lastClass = CLASS_NONE;
   1.833 +  ContextState state(aChars, aLength);
   1.834 +
   1.835 +  for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
   1.836 +    char16_t ch = aChars[cur];
   1.837 +    int8_t cl;
   1.838 +
   1.839 +    if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
   1.840 +      cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
   1.841 +                              ch,
   1.842 +                              cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
   1.843 +                              state);
   1.844 +    } else {
   1.845 +      if (ch == U_EQUAL)
   1.846 +        state.NotifySeenEqualsSign();
   1.847 +      state.NotifyNonHyphenCharacter(ch);
   1.848 +      cl = GetClass(ch);
   1.849 +    }
   1.850 +
   1.851 +    bool allowBreak = false;
   1.852 +    if (cur > 0) {
   1.853 +      NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
   1.854 +                   "Loop should have prevented adjacent complex chars here");
   1.855 +      if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
   1.856 +        allowBreak = (state.UseConservativeBreaking()) ?
   1.857 +          GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
   1.858 +      } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
   1.859 +        allowBreak = true;
   1.860 +      }
   1.861 +    }
   1.862 +    aBreakBefore[cur] = allowBreak;
   1.863 +    if (allowBreak)
   1.864 +      state.NotifyBreakBefore();
   1.865 +    lastClass = cl;
   1.866 +    if (CLASS_COMPLEX == cl) {
   1.867 +      uint32_t end = cur + 1;
   1.868 +
   1.869 +      while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
   1.870 +        ++end;
   1.871 +      }
   1.872 +
   1.873 +      NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
   1.874 +
   1.875 +      // We have to consider word-break value again for complex characters
   1.876 +      if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
   1.877 +        // Respect word-break property 
   1.878 +        for (uint32_t i = cur; i < end; i++)
   1.879 +          aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
   1.880 +      }
   1.881 +
   1.882 +      // restore breakability at chunk begin, which was always set to false
   1.883 +      // by the complex line breaker
   1.884 +      aBreakBefore[cur] = allowBreak;
   1.885 +
   1.886 +      cur = end - 1;
   1.887 +    }
   1.888 +  }
   1.889 +}
   1.890 +
   1.891 +void
   1.892 +nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
   1.893 +                                         uint8_t aWordBreak,
   1.894 +                                         uint8_t* aBreakBefore)
   1.895 +{
   1.896 +  uint32_t cur;
   1.897 +  int8_t lastClass = CLASS_NONE;
   1.898 +  ContextState state(aChars, aLength);
   1.899 +
   1.900 +  for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
   1.901 +    char16_t ch = aChars[cur];
   1.902 +    int8_t cl;
   1.903 +
   1.904 +    if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
   1.905 +      cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
   1.906 +                              ch,
   1.907 +                              cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
   1.908 +                              state);
   1.909 +    } else {
   1.910 +      if (ch == U_EQUAL)
   1.911 +        state.NotifySeenEqualsSign();
   1.912 +      state.NotifyNonHyphenCharacter(ch);
   1.913 +      cl = GetClass(ch);
   1.914 +    }
   1.915 +
   1.916 +    bool allowBreak = false;
   1.917 +    if (cur > 0) {
   1.918 +      if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
   1.919 +        allowBreak = (state.UseConservativeBreaking()) ?
   1.920 +          GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
   1.921 +      } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
   1.922 +        allowBreak = true;
   1.923 +      }
   1.924 +    }
   1.925 +    aBreakBefore[cur] = allowBreak;
   1.926 +    if (allowBreak)
   1.927 +      state.NotifyBreakBefore();
   1.928 +    lastClass = cl;
   1.929 +  }
   1.930 +}
The Tor Browser / file diff

diff: intl/lwbrk/src/nsJISx4051LineBreaker.cpp

intl/lwbrk/src/nsJISx4051LineBreaker.cpp