Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | |
michael@0 | 7 | |
michael@0 | 8 | #include "nsJISx4051LineBreaker.h" |
michael@0 | 9 | |
michael@0 | 10 | #include "jisx4051class.h" |
michael@0 | 11 | #include "nsComplexBreaker.h" |
michael@0 | 12 | #include "nsTArray.h" |
michael@0 | 13 | |
michael@0 | 14 | /* |
michael@0 | 15 | |
michael@0 | 16 | Simplification of Pair Table in JIS X 4051 |
michael@0 | 17 | |
michael@0 | 18 | 1. The Original Table - in 4.1.3 |
michael@0 | 19 | |
michael@0 | 20 | In JIS X 4051, the pair table is defined as below |
michael@0 | 21 | |
michael@0 | 22 | Class of |
michael@0 | 23 | Leading Class of Trailing Char Class |
michael@0 | 24 | Char |
michael@0 | 25 | |
michael@0 | 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 |
michael@0 | 27 | * # * # |
michael@0 | 28 | 1 X X X X X X X X X X X X X X X X X X X X X E |
michael@0 | 29 | 2 X X X X X X |
michael@0 | 30 | 3 X X X X X X |
michael@0 | 31 | 4 X X X X X X |
michael@0 | 32 | 5 X X X X X X |
michael@0 | 33 | 6 X X X X X X |
michael@0 | 34 | 7 X X X X X X X |
michael@0 | 35 | 8 X X X X X X E |
michael@0 | 36 | 9 X X X X X X |
michael@0 | 37 | 10 X X X X X X |
michael@0 | 38 | 11 X X X X X X |
michael@0 | 39 | 12 X X X X X X |
michael@0 | 40 | 13 X X X X X X X |
michael@0 | 41 | 14 X X X X X X X |
michael@0 | 42 | 15 X X X X X X X X X |
michael@0 | 43 | 16 X X X X X X X X |
michael@0 | 44 | 17 X X X X X E |
michael@0 | 45 | 18 X X X X X X X X X |
michael@0 | 46 | 19 X E E E E E X X X X X X X X X X X X E X E E |
michael@0 | 47 | 20 X X X X X E |
michael@0 | 48 | |
michael@0 | 49 | * Same Char |
michael@0 | 50 | # Other Char |
michael@0 | 51 | |
michael@0 | 52 | X Cannot Break |
michael@0 | 53 | |
michael@0 | 54 | The classes mean: |
michael@0 | 55 | 1: Open parenthesis |
michael@0 | 56 | 2: Close parenthesis |
michael@0 | 57 | 3: Prohibit a line break before |
michael@0 | 58 | 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?") |
michael@0 | 59 | 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) |
michael@0 | 60 | 6: Full stop |
michael@0 | 61 | 7: Non-breakable between same characters |
michael@0 | 62 | 8: Prefix (e.g., "$", "NO.") |
michael@0 | 63 | 9: Postfix (e.g., "%") |
michael@0 | 64 | 10: Ideographic space |
michael@0 | 65 | 11: Hiragana |
michael@0 | 66 | 12: Japanese characters (except class 11) |
michael@0 | 67 | 13: Subscript |
michael@0 | 68 | 14: Ruby |
michael@0 | 69 | 15: Numeric |
michael@0 | 70 | 16: Alphabet |
michael@0 | 71 | 17: Space for Western language |
michael@0 | 72 | 18: Western characters (except class 17) |
michael@0 | 73 | 19: Split line note (Warichu) begin quote |
michael@0 | 74 | 20: Split line note (Warichu) end quote |
michael@0 | 75 | |
michael@0 | 76 | 2. Simplified by removing the classes which we do not care about |
michael@0 | 77 | |
michael@0 | 78 | However, since we do not care about class 13(Subscript), 14(Ruby), |
michael@0 | 79 | 16 (Alphabet), 19(split line note begin quote), and 20(split line note end |
michael@0 | 80 | quote) we can simplify this pair table into the following |
michael@0 | 81 | |
michael@0 | 82 | Class of |
michael@0 | 83 | Leading Class of Trailing Char Class |
michael@0 | 84 | Char |
michael@0 | 85 | |
michael@0 | 86 | 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 |
michael@0 | 87 | |
michael@0 | 88 | 1 X X X X X X X X X X X X X X X |
michael@0 | 89 | 2 X X X X X |
michael@0 | 90 | 3 X X X X X |
michael@0 | 91 | 4 X X X X X |
michael@0 | 92 | 5 X X X X X |
michael@0 | 93 | 6 X X X X X |
michael@0 | 94 | 7 X X X X X X |
michael@0 | 95 | 8 X X X X X X |
michael@0 | 96 | 9 X X X X X |
michael@0 | 97 | 10 X X X X X |
michael@0 | 98 | 11 X X X X X |
michael@0 | 99 | 12 X X X X X |
michael@0 | 100 | 15 X X X X X X X X |
michael@0 | 101 | 17 X X X X X |
michael@0 | 102 | 18 X X X X X X X |
michael@0 | 103 | |
michael@0 | 104 | 3. Simplified by merging classes |
michael@0 | 105 | |
michael@0 | 106 | After the 2nd simplification, the pair table has some duplication |
michael@0 | 107 | a. classes 2, 3, 4, 5, 6 are the same - we can merge them |
michael@0 | 108 | b. classes 10, 11, 12, 17 are the same - we can merge them |
michael@0 | 109 | |
michael@0 | 110 | |
michael@0 | 111 | Class of |
michael@0 | 112 | Leading Class of Trailing Char Class |
michael@0 | 113 | Char |
michael@0 | 114 | |
michael@0 | 115 | 1 [a] 7 8 9 [b]15 18 |
michael@0 | 116 | |
michael@0 | 117 | 1 X X X X X X X X |
michael@0 | 118 | [a] X |
michael@0 | 119 | 7 X X |
michael@0 | 120 | 8 X X |
michael@0 | 121 | 9 X |
michael@0 | 122 | [b] X |
michael@0 | 123 | 15 X X X X |
michael@0 | 124 | 18 X X X |
michael@0 | 125 | |
michael@0 | 126 | |
michael@0 | 127 | 4. We add COMPLEX characters and make them breakable w/ all other classes |
michael@0 | 128 | except after class 1 and before class [a] |
michael@0 | 129 | |
michael@0 | 130 | Class of |
michael@0 | 131 | Leading Class of Trailing Char Class |
michael@0 | 132 | Char |
michael@0 | 133 | |
michael@0 | 134 | 1 [a] 7 8 9 [b]15 18 COMPLEX |
michael@0 | 135 | |
michael@0 | 136 | 1 X X X X X X X X X |
michael@0 | 137 | [a] X |
michael@0 | 138 | 7 X X |
michael@0 | 139 | 8 X X |
michael@0 | 140 | 9 X |
michael@0 | 141 | [b] X |
michael@0 | 142 | 15 X X X X |
michael@0 | 143 | 18 X X X |
michael@0 | 144 | COMPLEX X T |
michael@0 | 145 | |
michael@0 | 146 | T : need special handling |
michael@0 | 147 | |
michael@0 | 148 | |
michael@0 | 149 | 5. However, we need two special classes for some punctuation/parentheses |
michael@0 | 150 | whose breaking rules are like the character class (18); see bug 389056. |
michael@0 | 151 | We also need character-like punctuation that behaves the same as 18 even |
michael@0 | 152 | though the characters are not letters in any language (e.g., '_'). |
michael@0 | 153 | [c]. Based on open parenthesis class (1), but it is not breakable after |
michael@0 | 154 | character class (18) or numeric class (15). |
michael@0 | 155 | [d]. Based on close parenthesis (or punctuation) class (2), but it is not |
michael@0 | 156 | breakable before character class (18) or numeric class (15). |
michael@0 | 157 | |
michael@0 | 158 | Class of |
michael@0 | 159 | Leading Class of Trailing Char Class |
michael@0 | 160 | Char |
michael@0 | 161 | |
michael@0 | 162 | 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] |
michael@0 | 163 | |
michael@0 | 164 | 1 X X X X X X X X X X X |
michael@0 | 165 | [a] X X X |
michael@0 | 166 | 7 X X |
michael@0 | 167 | 8 X X |
michael@0 | 168 | 9 X |
michael@0 | 169 | [b] X X |
michael@0 | 170 | 15 X X X X X X |
michael@0 | 171 | 18 X X X X X |
michael@0 | 172 | COMPLEX X T |
michael@0 | 173 | [c] X X X X X X X X X X X |
michael@0 | 174 | [d] X X X X |
michael@0 | 175 | |
michael@0 | 176 | |
michael@0 | 177 | 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around |
michael@0 | 178 | them. But JIS X 4051 has no such class; therefore, we create [e]. |
michael@0 | 179 | |
michael@0 | 180 | Class of |
michael@0 | 181 | Leading Class of Trailing Char Class |
michael@0 | 182 | Char |
michael@0 | 183 | |
michael@0 | 184 | 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] |
michael@0 | 185 | |
michael@0 | 186 | 1 X X X X X X X X X X X X |
michael@0 | 187 | [a] X X X |
michael@0 | 188 | 7 X X X |
michael@0 | 189 | 8 X X X |
michael@0 | 190 | 9 X X |
michael@0 | 191 | [b] X X X |
michael@0 | 192 | 15 X X X X X X X |
michael@0 | 193 | 18 X X X X X X |
michael@0 | 194 | COMPLEX X T X |
michael@0 | 195 | [c] X X X X X X X X X X X X |
michael@0 | 196 | [d] X X X X X |
michael@0 | 197 | [e] X X X X X X X X X X X X |
michael@0 | 198 | |
michael@0 | 199 | |
michael@0 | 200 | 7. Now we use one bit to encode whether it is breakable, and use 2 bytes |
michael@0 | 201 | for one row, then the bit table will look like: |
michael@0 | 202 | |
michael@0 | 203 | 18 <- 1 |
michael@0 | 204 | |
michael@0 | 205 | 1 0000 1111 1111 1111 = 0x0FFF |
michael@0 | 206 | [a] 0000 1100 0000 0010 = 0x0C02 |
michael@0 | 207 | 7 0000 1000 0000 0110 = 0x0806 |
michael@0 | 208 | 8 0000 1000 0100 0010 = 0x0842 |
michael@0 | 209 | 9 0000 1000 0000 0010 = 0x0802 |
michael@0 | 210 | [b] 0000 1100 0000 0010 = 0x0C02 |
michael@0 | 211 | 15 0000 1110 1101 0010 = 0x0ED2 |
michael@0 | 212 | 18 0000 1110 1100 0010 = 0x0EC2 |
michael@0 | 213 | COMPLEX 0000 1001 0000 0010 = 0x0902 |
michael@0 | 214 | [c] 0000 1111 1111 1111 = 0x0FFF |
michael@0 | 215 | [d] 0000 1100 1100 0010 = 0x0CC2 |
michael@0 | 216 | [e] 0000 1111 1111 1111 = 0x0FFF |
michael@0 | 217 | */ |
michael@0 | 218 | |
// Number of merged line-break classes, i.e. the number of rows (and of
// meaningful bits per row) in the pair tables.
#define MAX_CLASSES 12

// Pair table for the normal rules; see step 7 of the derivation above.
// The row index is the class number of the leading character. Bit N (counted
// from the least significant bit) is set where the table has an X ("cannot
// break") or a T ("need special handling") for a trailing character of
// class N.
static const uint16_t gPair[MAX_CLASSES] = {
  0x0FFF,  // 0: 1
  0x0C02,  // 1: [a]
  0x0806,  // 2: 7
  0x0842,  // 3: 8
  0x0802,  // 4: 9
  0x0C02,  // 5: [b]
  0x0ED2,  // 6: 15
  0x0EC2,  // 7: 18
  0x0902,  // 8: COMPLEX
  0x0FFF,  // 9: [c]
  0x0CC2,  // A: [d]
  0x0FFF   // B: [e]
};
michael@0 | 235 | |
michael@0 | 236 | |
michael@0 | 237 | /* |
michael@0 | 238 | |
michael@0 | 239 | 8. If the character is not far enough from the word start, the word end, or |
michael@0 | 240 | another break point, we should not break in non-CJK languages. |
michael@0 | 241 | I.e., don't break around 15, 18, [c] and [d], but leave the rules |
michael@0 | 242 | unchanged where they are related to [b]. |
michael@0 | 243 | |
michael@0 | 244 | Class of |
michael@0 | 245 | Leading Class of Trailing Char Class |
michael@0 | 246 | Char |
michael@0 | 247 | |
michael@0 | 248 | 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] |
michael@0 | 249 | |
michael@0 | 250 | 1 X X X X X X X X X X X X |
michael@0 | 251 | [a] X X X X X X |
michael@0 | 252 | 7 X X X X X X X |
michael@0 | 253 | 8 X X X X X X |
michael@0 | 254 | 9 X X X X X X |
michael@0 | 255 | [b] X X X |
michael@0 | 256 | 15 X X X X X X X X X X X |
michael@0 | 257 | 18 X X X X X X X X X X X |
michael@0 | 258 | COMPLEX X X X T X X X |
michael@0 | 259 | [c] X X X X X X X X X X X X |
michael@0 | 260 | [d] X X X X X X X X X X X |
michael@0 | 261 | [e] X X X X X X X X X X X X |
michael@0 | 262 | |
michael@0 | 263 | 18 <- 1 |
michael@0 | 264 | |
michael@0 | 265 | 1 0000 1111 1111 1111 = 0x0FFF |
michael@0 | 266 | [a] 0000 1110 1100 0010 = 0x0EC2 |
michael@0 | 267 | 7 0000 1110 1100 0110 = 0x0EC6 |
michael@0 | 268 | 8 0000 1110 1100 0010 = 0x0EC2 |
michael@0 | 269 | 9 0000 1110 1100 0010 = 0x0EC2 |
michael@0 | 270 | [b] 0000 1100 0000 0010 = 0x0C02 |
michael@0 | 271 | 15 0000 1111 1101 1111 = 0x0FDF |
michael@0 | 272 | 18 0000 1111 1101 1111 = 0x0FDF |
michael@0 | 273 | COMPLEX 0000 1111 1100 0010 = 0x0FC2 |
michael@0 | 274 | [c] 0000 1111 1111 1111 = 0x0FFF |
michael@0 | 275 | [d] 0000 1111 1101 1111 = 0x0FDF |
michael@0 | 276 | [e] 0000 1111 1111 1111 = 0x0FFF |
michael@0 | 277 | */ |
michael@0 | 278 | |
michael@0 | 279 | static const uint16_t gPairConservative[MAX_CLASSES] = { |
michael@0 | 280 | 0x0FFF, |
michael@0 | 281 | 0x0EC2, |
michael@0 | 282 | 0x0EC6, |
michael@0 | 283 | 0x0EC2, |
michael@0 | 284 | 0x0EC2, |
michael@0 | 285 | 0x0C02, |
michael@0 | 286 | 0x0FDF, |
michael@0 | 287 | 0x0FDF, |
michael@0 | 288 | 0x0FC2, |
michael@0 | 289 | 0x0FFF, |
michael@0 | 290 | 0x0FDF, |
michael@0 | 291 | 0x0FFF |
michael@0 | 292 | }; |
michael@0 | 293 | |
michael@0 | 294 | |
michael@0 | 295 | /* |
michael@0 | 296 | |
michael@0 | 297 | 9. Now we map the class to number |
michael@0 | 298 | |
michael@0 | 299 | 0: 1 |
michael@0 | 300 | 1: [a]- 2, 3, 4, 5, 6 |
michael@0 | 301 | 2: 7 |
michael@0 | 302 | 3: 8 |
michael@0 | 303 | 4: 9 |
michael@0 | 304 | 5: [b]- 10, 11, 12, 17 |
michael@0 | 305 | 6: 15 |
michael@0 | 306 | 7: 18 |
michael@0 | 307 | 8: COMPLEX |
michael@0 | 308 | 9: [c] |
michael@0 | 309 | A: [d] |
michael@0 | 310 | B: [e] |
michael@0 | 311 | |
michael@0 | 312 | and they mean: |
michael@0 | 313 | 0: Open parenthesis |
michael@0 | 314 | 1: Punctuation that prohibits break before |
michael@0 | 315 | 2: Non-breakable between same classes |
michael@0 | 316 | 3: Prefix |
michael@0 | 317 | 4: Postfix |
michael@0 | 318 | 5: Breakable character (Spaces and Most Japanese characters) |
michael@0 | 319 | 6: Numeric |
michael@0 | 320 | 7: Characters |
michael@0 | 321 | 8: Need special handling characters (E.g., Thai) |
michael@0 | 322 | 9: Open parentheses like Character (See bug 389056) |
michael@0 | 323 | A: Close parentheses (or punctuation) like Character (See bug 389056) |
michael@0 | 324 | B: Non breakable (See bug 390920) |
michael@0 | 325 | |
michael@0 | 326 | */ |
michael@0 | 327 | |
// Sentinel that is not a real class; it is larger than every class number
// below and therefore not a valid row/bit index for the pair tables.
#define CLASS_NONE                             INT8_MAX

// Class numbers. Each value is both a row index and a bit position in the
// pair tables, so the values must stay in step with those tables.
#define CLASS_OPEN                             0x00  // open parenthesis
#define CLASS_CLOSE                            0x01  // punctuation that prohibits a break before
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02  // non-breakable between same classes
#define CLASS_PREFIX                           0x03  // prefix
#define CLASS_POSTFFIX                         0x04  // postfix (the name's spelling is historical)
#define CLASS_BREAKABLE                        0x05  // spaces and most Japanese characters
#define CLASS_NUMERIC                          0x06  // numeric
#define CLASS_CHARACTER                        0x07  // characters
#define CLASS_COMPLEX                          0x08  // needs special handling (e.g., Thai)
#define CLASS_OPEN_LIKE_CHARACTER              0x09  // open parentheses like character (bug 389056)
#define CLASS_CLOSE_LIKE_CHARACTER             0x0A  // close parentheses/punctuation like character (bug 389056)
#define CLASS_NON_BREAKABLE                    0x0B  // non breakable (bug 390920)
michael@0 | 342 | |
// Named UTF-16 code units used by the contextual rules.
#define U_NULL              char16_t(0x0000)
#define U_SLASH             char16_t('/')
#define U_SPACE             char16_t(' ')
#define U_HYPHEN            char16_t('-')
#define U_EQUAL             char16_t('=')
#define U_PERCENT           char16_t('%')
#define U_AMPERSAND         char16_t('&')
#define U_SEMICOLON         char16_t(';')
#define U_BACKSLASH         char16_t('\\')
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
#define U_OPEN_GUILLEMET    char16_t(0x00AB)

// True for the characters that ContextualAnalysis() has a branch for:
// hyphens, '/', '%', '&', ';', '\\' and the three opening quote marks.
// The argument is evaluated more than once, so pass a side-effect-free
// expression.
#define NEED_CONTEXTUAL_ANALYSIS(c) \
  (IS_HYPHEN(c) ||                  \
   (c) == U_SLASH ||                \
   (c) == U_PERCENT ||              \
   (c) == U_AMPERSAND ||            \
   (c) == U_SEMICOLON ||            \
   (c) == U_BACKSLASH ||            \
   (c) == U_OPEN_SINGLE_QUOTE ||    \
   (c) == U_OPEN_DOUBLE_QUOTE ||    \
   (c) == U_OPEN_GUILLEMET)
michael@0 | 365 | |
// True when u is in U+0030..U+0039 ('0'..'9'). The argument is evaluated
// more than once.
#define IS_ASCII_DIGIT(u) ((u) >= 0x0030 && (u) <= 0x0039)
michael@0 | 367 | |
// Reads the 4-bit entry number l from a packed table: each 32-bit word of t
// holds eight entries, the lowest-numbered entry in the least significant
// nibble. The caller must make sure t has at least (l / 8) + 1 words.
static inline int
GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
{
  const uint32_t word = t[l >> 3];
  const int shift = (l & 0x0007) << 2;
  return (word >> shift) & 0x000f;
}
michael@0 | 373 | |
// Non-zero when u lies in U+FF66..U+FF70 (inclusive), the halfwidth range
// that GetClass() treats as JIS X 4051 class 3.
static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
{
  const bool atOrAboveFirst = (u >= 0xff66);
  const bool atOrBelowLast = (u <= 0xff70);
  return atOrAboveFirst && atOrBelowLast;
}
michael@0 | 379 | |
// Returns 1 when u falls in one of the code unit ranges this file treats as
// CJK, and 0 otherwise. All range bounds are inclusive.
static inline int
IS_CJK_CHAR(char16_t u)
{
  if (u >= 0x1100 && u <= 0x11ff)
    return 1;
  if (u >= 0x2e80 && u <= 0xd7ff)
    return 1;
  if (u >= 0xf900 && u <= 0xfaff)
    return 1;
  if (u >= 0xff00 && u <= 0xffef)
    return 1;
  return 0;
}
michael@0 | 388 | |
// True for the two space characters handled as non-breakable here:
// U+00A0 NO-BREAK SPACE and U+2007 FIGURE SPACE.
static inline bool
IS_NONBREAKABLE_SPACE(char16_t u)
{
  switch (u) {
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2007:  // FIGURE SPACE
      return true;
    default:
      return false;
  }
}
michael@0 | 394 | |
michael@0 | 395 | static inline bool |
michael@0 | 396 | IS_HYPHEN(char16_t u) |
michael@0 | 397 | { |
michael@0 | 398 | return (u == U_HYPHEN || |
michael@0 | 399 | u == 0x058A || // ARMENIAN HYPHEN |
michael@0 | 400 | u == 0x2010 || // HYPHEN |
michael@0 | 401 | u == 0x2012 || // FIGURE DASH |
michael@0 | 402 | u == 0x2013); // EN DASH |
michael@0 | 403 | } |
michael@0 | 404 | |
michael@0 | 405 | static int8_t |
michael@0 | 406 | GetClass(char16_t u) |
michael@0 | 407 | { |
michael@0 | 408 | uint16_t h = u & 0xFF00; |
michael@0 | 409 | uint16_t l = u & 0x00ff; |
michael@0 | 410 | int8_t c; |
michael@0 | 411 | |
michael@0 | 412 | // Handle 3 range table first |
michael@0 | 413 | if (0x0000 == h) { |
michael@0 | 414 | c = GETCLASSFROMTABLE(gLBClass00, l); |
michael@0 | 415 | } else if (0x1700 == h) { |
michael@0 | 416 | c = GETCLASSFROMTABLE(gLBClass17, l); |
michael@0 | 417 | } else if (NS_NeedsPlatformNativeHandling(u)) { |
michael@0 | 418 | c = CLASS_COMPLEX; |
michael@0 | 419 | } else if (0x0E00 == h) { |
michael@0 | 420 | c = GETCLASSFROMTABLE(gLBClass0E, l); |
michael@0 | 421 | } else if (0x2000 == h) { |
michael@0 | 422 | c = GETCLASSFROMTABLE(gLBClass20, l); |
michael@0 | 423 | } else if (0x2100 == h) { |
michael@0 | 424 | c = GETCLASSFROMTABLE(gLBClass21, l); |
michael@0 | 425 | } else if (0x3000 == h) { |
michael@0 | 426 | c = GETCLASSFROMTABLE(gLBClass30, l); |
michael@0 | 427 | } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi |
michael@0 | 428 | ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul |
michael@0 | 429 | ((0xf900 <= h) && (h <= 0xfaff))) { |
michael@0 | 430 | c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility |
michael@0 | 431 | } else if (0xff00 == h) { |
michael@0 | 432 | if (l < 0x0060) { // Fullwidth ASCII variant |
michael@0 | 433 | c = GETCLASSFROMTABLE(gLBClass00, (l+0x20)); |
michael@0 | 434 | } else if (l < 0x00a0) { |
michael@0 | 435 | switch (l) { |
michael@0 | 436 | case 0x61: c = GetClass(0x3002); break; |
michael@0 | 437 | case 0x62: c = GetClass(0x300c); break; |
michael@0 | 438 | case 0x63: c = GetClass(0x300d); break; |
michael@0 | 439 | case 0x64: c = GetClass(0x3001); break; |
michael@0 | 440 | case 0x65: c = GetClass(0x30fb); break; |
michael@0 | 441 | case 0x9e: c = GetClass(0x309b); break; |
michael@0 | 442 | case 0x9f: c = GetClass(0x309c); break; |
michael@0 | 443 | default: |
michael@0 | 444 | if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) |
michael@0 | 445 | c = CLASS_CLOSE; // jis x4051 class 3 |
michael@0 | 446 | else |
michael@0 | 447 | c = CLASS_BREAKABLE; // jis x4051 class 11 |
michael@0 | 448 | break; |
michael@0 | 449 | } |
michael@0 | 450 | // Halfwidth Katakana variants |
michael@0 | 451 | } else if (l < 0x00e0) { |
michael@0 | 452 | c = CLASS_CHARACTER; // Halfwidth Hangul variants |
michael@0 | 453 | } else if (l < 0x00f0) { |
michael@0 | 454 | static char16_t NarrowFFEx[16] = { |
michael@0 | 455 | 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, |
michael@0 | 456 | 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000 |
michael@0 | 457 | }; |
michael@0 | 458 | c = GetClass(NarrowFFEx[l - 0x00e0]); |
michael@0 | 459 | } else { |
michael@0 | 460 | c = CLASS_CHARACTER; |
michael@0 | 461 | } |
michael@0 | 462 | } else if (0x3100 == h) { |
michael@0 | 463 | if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun |
michael@0 | 464 | // XXX: This is per UAX #14, but UAX #14 may change |
michael@0 | 465 | // the line breaking rules about Kanbun and Bopomofo. |
michael@0 | 466 | c = CLASS_BREAKABLE; |
michael@0 | 467 | } else if (l >= 0xf0) { // Katakana small letters for Ainu |
michael@0 | 468 | c = CLASS_CLOSE; |
michael@0 | 469 | } else { // unassigned |
michael@0 | 470 | c = CLASS_CHARACTER; |
michael@0 | 471 | } |
michael@0 | 472 | } else if (0x0300 == h) { |
michael@0 | 473 | if (0x4F == l || (0x5C <= l && l <= 0x62)) |
michael@0 | 474 | c = CLASS_NON_BREAKABLE; |
michael@0 | 475 | else |
michael@0 | 476 | c = CLASS_CHARACTER; |
michael@0 | 477 | } else if (0x0500 == h) { |
michael@0 | 478 | // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) |
michael@0 | 479 | if (l == 0x8A) |
michael@0 | 480 | c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); |
michael@0 | 481 | else |
michael@0 | 482 | c = CLASS_CHARACTER; |
michael@0 | 483 | } else if (0x0F00 == h) { |
michael@0 | 484 | if (0x08 == l || 0x0C == l || 0x12 == l) |
michael@0 | 485 | c = CLASS_NON_BREAKABLE; |
michael@0 | 486 | else |
michael@0 | 487 | c = CLASS_CHARACTER; |
michael@0 | 488 | } else if (0x1800 == h) { |
michael@0 | 489 | if (0x0E == l) |
michael@0 | 490 | c = CLASS_NON_BREAKABLE; |
michael@0 | 491 | else |
michael@0 | 492 | c = CLASS_CHARACTER; |
michael@0 | 493 | } else if (0x1600 == h) { |
michael@0 | 494 | if (0x80 == l) { // U+1680 OGHAM SPACE MARK |
michael@0 | 495 | c = CLASS_BREAKABLE; |
michael@0 | 496 | } else { |
michael@0 | 497 | c = CLASS_CHARACTER; |
michael@0 | 498 | } |
michael@0 | 499 | } else if (u == 0xfeff) { |
michael@0 | 500 | c = CLASS_NON_BREAKABLE; |
michael@0 | 501 | } else { |
michael@0 | 502 | c = CLASS_CHARACTER; // others |
michael@0 | 503 | } |
michael@0 | 504 | return c; |
michael@0 | 505 | } |
michael@0 | 506 | |
michael@0 | 507 | static bool |
michael@0 | 508 | GetPair(int8_t c1, int8_t c2) |
michael@0 | 509 | { |
michael@0 | 510 | NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); |
michael@0 | 511 | NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); |
michael@0 | 512 | |
michael@0 | 513 | return (0 == ((gPair[c1] >> c2) & 0x0001)); |
michael@0 | 514 | } |
michael@0 | 515 | |
michael@0 | 516 | static bool |
michael@0 | 517 | GetPairConservative(int8_t c1, int8_t c2) |
michael@0 | 518 | { |
michael@0 | 519 | NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); |
michael@0 | 520 | NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); |
michael@0 | 521 | |
michael@0 | 522 | return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); |
michael@0 | 523 | } |
michael@0 | 524 | |
michael@0 | 525 | nsJISx4051LineBreaker::nsJISx4051LineBreaker() |
michael@0 | 526 | { |
michael@0 | 527 | } |
michael@0 | 528 | |
michael@0 | 529 | nsJISx4051LineBreaker::~nsJISx4051LineBreaker() |
michael@0 | 530 | { |
michael@0 | 531 | } |
michael@0 | 532 | |
michael@0 | 533 | NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker) |
michael@0 | 534 | |
michael@0 | 535 | class ContextState { |
michael@0 | 536 | public: |
michael@0 | 537 | ContextState(const char16_t* aText, uint32_t aLength) { |
michael@0 | 538 | mUniText = aText; |
michael@0 | 539 | mText = nullptr; |
michael@0 | 540 | mLength = aLength; |
michael@0 | 541 | Init(); |
michael@0 | 542 | } |
michael@0 | 543 | |
michael@0 | 544 | ContextState(const uint8_t* aText, uint32_t aLength) { |
michael@0 | 545 | mUniText = nullptr; |
michael@0 | 546 | mText = aText; |
michael@0 | 547 | mLength = aLength; |
michael@0 | 548 | Init(); |
michael@0 | 549 | } |
michael@0 | 550 | |
michael@0 | 551 | uint32_t Length() { return mLength; } |
michael@0 | 552 | uint32_t Index() { return mIndex; } |
michael@0 | 553 | |
michael@0 | 554 | char16_t GetCharAt(uint32_t aIndex) { |
michael@0 | 555 | NS_ASSERTION(aIndex < mLength, "Out of range!"); |
michael@0 | 556 | return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); |
michael@0 | 557 | } |
michael@0 | 558 | |
michael@0 | 559 | void AdvanceIndex() { |
michael@0 | 560 | ++mIndex; |
michael@0 | 561 | } |
michael@0 | 562 | |
michael@0 | 563 | void NotifyBreakBefore() { mLastBreakIndex = mIndex; } |
michael@0 | 564 | |
michael@0 | 565 | // A word of western language should not be broken. But even if the word has |
michael@0 | 566 | // only ASCII characters, non-natural context words should be broken, e.g., |
michael@0 | 567 | // URL and file path. For protecting the natural words, we should use |
michael@0 | 568 | // conservative breaking rules at following conditions: |
michael@0 | 569 | // 1. at near the start of word |
michael@0 | 570 | // 2. at near the end of word |
michael@0 | 571 | // 3. at near the latest broken point |
michael@0 | 572 | // CONSERVATIVE_BREAK_RANGE define the 'near' in characters. |
michael@0 | 573 | #define CONSERVATIVE_BREAK_RANGE 6 |
michael@0 | 574 | |
michael@0 | 575 | bool UseConservativeBreaking(uint32_t aOffset = 0) { |
michael@0 | 576 | if (mHasCJKChar) |
michael@0 | 577 | return false; |
michael@0 | 578 | uint32_t index = mIndex + aOffset; |
michael@0 | 579 | bool result = (index < CONSERVATIVE_BREAK_RANGE || |
michael@0 | 580 | mLength - index < CONSERVATIVE_BREAK_RANGE || |
michael@0 | 581 | index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE); |
michael@0 | 582 | if (result || !mHasNonbreakableSpace) |
michael@0 | 583 | return result; |
michael@0 | 584 | |
michael@0 | 585 | // This text has no-breakable space, we need to check whether the index |
michael@0 | 586 | // is near it. |
michael@0 | 587 | |
michael@0 | 588 | // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here. |
michael@0 | 589 | for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) { |
michael@0 | 590 | if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1))) |
michael@0 | 591 | return true; |
michael@0 | 592 | } |
michael@0 | 593 | // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE. |
michael@0 | 594 | for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) { |
michael@0 | 595 | if (IS_NONBREAKABLE_SPACE(GetCharAt(i))) |
michael@0 | 596 | return true; |
michael@0 | 597 | } |
michael@0 | 598 | return false; |
michael@0 | 599 | } |
michael@0 | 600 | |
michael@0 | 601 | bool HasPreviousEqualsSign() const { |
michael@0 | 602 | return mHasPreviousEqualsSign; |
michael@0 | 603 | } |
michael@0 | 604 | void NotifySeenEqualsSign() { |
michael@0 | 605 | mHasPreviousEqualsSign = true; |
michael@0 | 606 | } |
michael@0 | 607 | |
michael@0 | 608 | bool HasPreviousSlash() const { |
michael@0 | 609 | return mHasPreviousSlash; |
michael@0 | 610 | } |
michael@0 | 611 | void NotifySeenSlash() { |
michael@0 | 612 | mHasPreviousSlash = true; |
michael@0 | 613 | } |
michael@0 | 614 | |
michael@0 | 615 | bool HasPreviousBackslash() const { |
michael@0 | 616 | return mHasPreviousBackslash; |
michael@0 | 617 | } |
michael@0 | 618 | void NotifySeenBackslash() { |
michael@0 | 619 | mHasPreviousBackslash = true; |
michael@0 | 620 | } |
michael@0 | 621 | |
michael@0 | 622 | char16_t GetPreviousNonHyphenCharacter() const { |
michael@0 | 623 | return mPreviousNonHyphenCharacter; |
michael@0 | 624 | } |
michael@0 | 625 | void NotifyNonHyphenCharacter(char16_t ch) { |
michael@0 | 626 | mPreviousNonHyphenCharacter = ch; |
michael@0 | 627 | } |
michael@0 | 628 | |
michael@0 | 629 | private: |
michael@0 | 630 | void Init() { |
michael@0 | 631 | mIndex = 0; |
michael@0 | 632 | mLastBreakIndex = 0; |
michael@0 | 633 | mPreviousNonHyphenCharacter = U_NULL; |
michael@0 | 634 | mHasCJKChar = 0; |
michael@0 | 635 | mHasNonbreakableSpace = 0; |
michael@0 | 636 | mHasPreviousEqualsSign = false; |
michael@0 | 637 | mHasPreviousSlash = false; |
michael@0 | 638 | mHasPreviousBackslash = false; |
michael@0 | 639 | |
michael@0 | 640 | for (uint32_t i = 0; i < mLength; ++i) { |
michael@0 | 641 | char16_t u = GetCharAt(i); |
michael@0 | 642 | if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) |
michael@0 | 643 | mHasNonbreakableSpace = 1; |
michael@0 | 644 | else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u)) |
michael@0 | 645 | mHasCJKChar = 1; |
michael@0 | 646 | } |
michael@0 | 647 | } |
michael@0 | 648 | |
michael@0 | 649 | const char16_t* mUniText; |
michael@0 | 650 | const uint8_t* mText; |
michael@0 | 651 | |
michael@0 | 652 | uint32_t mIndex; |
michael@0 | 653 | uint32_t mLength; // length of text |
michael@0 | 654 | uint32_t mLastBreakIndex; |
michael@0 | 655 | char16_t mPreviousNonHyphenCharacter; // The last character we have seen |
michael@0 | 656 | // which is not U_HYPHEN |
michael@0 | 657 | bool mHasCJKChar; // if the text has CJK character, this is true. |
michael@0 | 658 | bool mHasNonbreakableSpace; // if the text has no-breakable space, |
michael@0 | 659 | // this is true. |
michael@0 | 660 | bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL |
michael@0 | 661 | bool mHasPreviousSlash; // True if we have seen a U_SLASH |
michael@0 | 662 | bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH |
michael@0 | 663 | }; |
michael@0 | 664 | |
michael@0 | 665 | static int8_t |
michael@0 | 666 | ContextualAnalysis(char16_t prev, char16_t cur, char16_t next, |
michael@0 | 667 | ContextState &aState) |
michael@0 | 668 | { |
michael@0 | 669 | // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. |
michael@0 | 670 | |
michael@0 | 671 | if (IS_HYPHEN(cur)) { |
michael@0 | 672 | // If next character is hyphen, we don't need to break between them. |
michael@0 | 673 | if (IS_HYPHEN(next)) |
michael@0 | 674 | return CLASS_CHARACTER; |
michael@0 | 675 | // If prev and next characters are numeric, it may be in Math context. |
michael@0 | 676 | // So, we should not break here. |
michael@0 | 677 | bool prevIsNum = IS_ASCII_DIGIT(prev); |
michael@0 | 678 | bool nextIsNum = IS_ASCII_DIGIT(next); |
michael@0 | 679 | if (prevIsNum && nextIsNum) |
michael@0 | 680 | return CLASS_NUMERIC; |
michael@0 | 681 | // If one side is numeric and the other is a character, or if both sides are |
michael@0 | 682 | // characters, the hyphen should be breakable. |
michael@0 | 683 | if (!aState.UseConservativeBreaking(1)) { |
michael@0 | 684 | char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); |
michael@0 | 685 | if (prevOfHyphen && next) { |
michael@0 | 686 | int8_t prevClass = GetClass(prevOfHyphen); |
michael@0 | 687 | int8_t nextClass = GetClass(next); |
michael@0 | 688 | bool prevIsNumOrCharOrClose = |
michael@0 | 689 | prevIsNum || |
michael@0 | 690 | (prevClass == CLASS_CHARACTER && |
michael@0 | 691 | !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) || |
michael@0 | 692 | prevClass == CLASS_CLOSE || |
michael@0 | 693 | prevClass == CLASS_CLOSE_LIKE_CHARACTER; |
michael@0 | 694 | bool nextIsNumOrCharOrOpen = |
michael@0 | 695 | nextIsNum || |
michael@0 | 696 | (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) || |
michael@0 | 697 | nextClass == CLASS_OPEN || |
michael@0 | 698 | nextClass == CLASS_OPEN_LIKE_CHARACTER || |
michael@0 | 699 | next == U_OPEN_SINGLE_QUOTE || |
michael@0 | 700 | next == U_OPEN_DOUBLE_QUOTE || |
michael@0 | 701 | next == U_OPEN_GUILLEMET; |
michael@0 | 702 | if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) { |
michael@0 | 703 | return CLASS_CLOSE; |
michael@0 | 704 | } |
michael@0 | 705 | } |
michael@0 | 706 | } |
michael@0 | 707 | } else { |
michael@0 | 708 | aState.NotifyNonHyphenCharacter(cur); |
michael@0 | 709 | if (cur == U_SLASH || cur == U_BACKSLASH) { |
michael@0 | 710 | // If this is immediately after same char, we should not break here. |
michael@0 | 711 | if (prev == cur) |
michael@0 | 712 | return CLASS_CHARACTER; |
michael@0 | 713 | // If this text has two or more (BACK)SLASHs, this may be file path or URL. |
michael@0 | 714 | // Make sure to compute shouldReturn before we notify on this slash. |
michael@0 | 715 | bool shouldReturn = !aState.UseConservativeBreaking() && |
michael@0 | 716 | (cur == U_SLASH ? |
michael@0 | 717 | aState.HasPreviousSlash() : aState.HasPreviousBackslash()); |
michael@0 | 718 | |
michael@0 | 719 | if (cur == U_SLASH) { |
michael@0 | 720 | aState.NotifySeenSlash(); |
michael@0 | 721 | } else { |
michael@0 | 722 | aState.NotifySeenBackslash(); |
michael@0 | 723 | } |
michael@0 | 724 | |
michael@0 | 725 | if (shouldReturn) |
michael@0 | 726 | return CLASS_OPEN; |
michael@0 | 727 | } else if (cur == U_PERCENT) { |
michael@0 | 728 | // If this is a part of the param of URL, we should break before. |
michael@0 | 729 | if (!aState.UseConservativeBreaking()) { |
michael@0 | 730 | if (aState.Index() >= 3 && |
michael@0 | 731 | aState.GetCharAt(aState.Index() - 3) == U_PERCENT) |
michael@0 | 732 | return CLASS_OPEN; |
michael@0 | 733 | if (aState.Index() + 3 < aState.Length() && |
michael@0 | 734 | aState.GetCharAt(aState.Index() + 3) == U_PERCENT) |
michael@0 | 735 | return CLASS_OPEN; |
michael@0 | 736 | } |
michael@0 | 737 | } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) { |
michael@0 | 738 | // If this may be a separator of params of URL, we should break after. |
michael@0 | 739 | if (!aState.UseConservativeBreaking(1) && |
michael@0 | 740 | aState.HasPreviousEqualsSign()) |
michael@0 | 741 | return CLASS_CLOSE; |
michael@0 | 742 | } else if (cur == U_OPEN_SINGLE_QUOTE || |
michael@0 | 743 | cur == U_OPEN_DOUBLE_QUOTE || |
michael@0 | 744 | cur == U_OPEN_GUILLEMET) { |
michael@0 | 745 | // for CJK usage, we treat these as openers to allow a break before them, |
michael@0 | 746 | // but otherwise treat them as normal characters because quote mark usage |
michael@0 | 747 | // in various Western languages varies too much; see bug #450088 discussion. |
michael@0 | 748 | if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next)) |
michael@0 | 749 | return CLASS_OPEN; |
michael@0 | 750 | } else { |
michael@0 | 751 | NS_ERROR("Forgot to handle the current character!"); |
michael@0 | 752 | } |
michael@0 | 753 | } |
michael@0 | 754 | return GetClass(cur); |
michael@0 | 755 | } |
michael@0 | 756 | |
michael@0 | 757 | |
michael@0 | 758 | int32_t |
michael@0 | 759 | nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen, |
michael@0 | 760 | uint32_t aPos, int8_t aDirection) |
michael@0 | 761 | { |
michael@0 | 762 | bool textNeedsJISx4051 = false; |
michael@0 | 763 | int32_t begin, end; |
michael@0 | 764 | |
michael@0 | 765 | for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { |
michael@0 | 766 | if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) { |
michael@0 | 767 | textNeedsJISx4051 = true; |
michael@0 | 768 | } |
michael@0 | 769 | } |
michael@0 | 770 | for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) { |
michael@0 | 771 | if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) { |
michael@0 | 772 | textNeedsJISx4051 = true; |
michael@0 | 773 | } |
michael@0 | 774 | } |
michael@0 | 775 | |
michael@0 | 776 | int32_t ret; |
michael@0 | 777 | nsAutoTArray<uint8_t, 2000> breakState; |
michael@0 | 778 | if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) { |
michael@0 | 779 | // No complex text character, do not try to do complex line break. |
michael@0 | 780 | // (This is required for serializers. See Bug #344816.) |
michael@0 | 781 | // Also fall back to this when out of memory. |
michael@0 | 782 | if (aDirection < 0) { |
michael@0 | 783 | ret = (begin == int32_t(aPos)) ? begin - 1 : begin; |
michael@0 | 784 | } else { |
michael@0 | 785 | ret = end; |
michael@0 | 786 | } |
michael@0 | 787 | } else { |
michael@0 | 788 | GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal, |
michael@0 | 789 | breakState.Elements()); |
michael@0 | 790 | |
michael@0 | 791 | ret = aPos; |
michael@0 | 792 | do { |
michael@0 | 793 | ret += aDirection; |
michael@0 | 794 | } while (begin < ret && ret < end && !breakState[ret - begin]); |
michael@0 | 795 | } |
michael@0 | 796 | |
michael@0 | 797 | return ret; |
michael@0 | 798 | } |
michael@0 | 799 | |
michael@0 | 800 | int32_t |
michael@0 | 801 | nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen, |
michael@0 | 802 | uint32_t aPos) |
michael@0 | 803 | { |
michael@0 | 804 | NS_ASSERTION(aText, "aText shouldn't be null"); |
michael@0 | 805 | NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next"); |
michael@0 | 806 | |
michael@0 | 807 | int32_t nextPos = WordMove(aText, aLen, aPos, 1); |
michael@0 | 808 | return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT; |
michael@0 | 809 | } |
michael@0 | 810 | |
michael@0 | 811 | int32_t |
michael@0 | 812 | nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen, |
michael@0 | 813 | uint32_t aPos) |
michael@0 | 814 | { |
michael@0 | 815 | NS_ASSERTION(aText, "aText shouldn't be null"); |
michael@0 | 816 | NS_ASSERTION(aLen >= aPos && aPos > 0, |
michael@0 | 817 | "Bad position passed to nsJISx4051LineBreaker::Prev"); |
michael@0 | 818 | |
michael@0 | 819 | int32_t prevPos = WordMove(aText, aLen, aPos, -1); |
michael@0 | 820 | return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT; |
michael@0 | 821 | } |
michael@0 | 822 | |
michael@0 | 823 | void |
michael@0 | 824 | nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength, |
michael@0 | 825 | uint8_t aWordBreak, |
michael@0 | 826 | uint8_t* aBreakBefore) |
michael@0 | 827 | { |
michael@0 | 828 | uint32_t cur; |
michael@0 | 829 | int8_t lastClass = CLASS_NONE; |
michael@0 | 830 | ContextState state(aChars, aLength); |
michael@0 | 831 | |
michael@0 | 832 | for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { |
michael@0 | 833 | char16_t ch = aChars[cur]; |
michael@0 | 834 | int8_t cl; |
michael@0 | 835 | |
michael@0 | 836 | if (NEED_CONTEXTUAL_ANALYSIS(ch)) { |
michael@0 | 837 | cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, |
michael@0 | 838 | ch, |
michael@0 | 839 | cur + 1 < aLength ? aChars[cur + 1] : U_NULL, |
michael@0 | 840 | state); |
michael@0 | 841 | } else { |
michael@0 | 842 | if (ch == U_EQUAL) |
michael@0 | 843 | state.NotifySeenEqualsSign(); |
michael@0 | 844 | state.NotifyNonHyphenCharacter(ch); |
michael@0 | 845 | cl = GetClass(ch); |
michael@0 | 846 | } |
michael@0 | 847 | |
michael@0 | 848 | bool allowBreak = false; |
michael@0 | 849 | if (cur > 0) { |
michael@0 | 850 | NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, |
michael@0 | 851 | "Loop should have prevented adjacent complex chars here"); |
michael@0 | 852 | if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { |
michael@0 | 853 | allowBreak = (state.UseConservativeBreaking()) ? |
michael@0 | 854 | GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); |
michael@0 | 855 | } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { |
michael@0 | 856 | allowBreak = true; |
michael@0 | 857 | } |
michael@0 | 858 | } |
michael@0 | 859 | aBreakBefore[cur] = allowBreak; |
michael@0 | 860 | if (allowBreak) |
michael@0 | 861 | state.NotifyBreakBefore(); |
michael@0 | 862 | lastClass = cl; |
michael@0 | 863 | if (CLASS_COMPLEX == cl) { |
michael@0 | 864 | uint32_t end = cur + 1; |
michael@0 | 865 | |
michael@0 | 866 | while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) { |
michael@0 | 867 | ++end; |
michael@0 | 868 | } |
michael@0 | 869 | |
michael@0 | 870 | NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur); |
michael@0 | 871 | |
michael@0 | 872 | // We have to consider word-break value again for complex characters |
michael@0 | 873 | if (aWordBreak != nsILineBreaker::kWordBreak_Normal) { |
michael@0 | 874 | // Respect word-break property |
michael@0 | 875 | for (uint32_t i = cur; i < end; i++) |
michael@0 | 876 | aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll); |
michael@0 | 877 | } |
michael@0 | 878 | |
michael@0 | 879 | // restore breakability at chunk begin, which was always set to false |
michael@0 | 880 | // by the complex line breaker |
michael@0 | 881 | aBreakBefore[cur] = allowBreak; |
michael@0 | 882 | |
michael@0 | 883 | cur = end - 1; |
michael@0 | 884 | } |
michael@0 | 885 | } |
michael@0 | 886 | } |
michael@0 | 887 | |
michael@0 | 888 | void |
michael@0 | 889 | nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength, |
michael@0 | 890 | uint8_t aWordBreak, |
michael@0 | 891 | uint8_t* aBreakBefore) |
michael@0 | 892 | { |
michael@0 | 893 | uint32_t cur; |
michael@0 | 894 | int8_t lastClass = CLASS_NONE; |
michael@0 | 895 | ContextState state(aChars, aLength); |
michael@0 | 896 | |
michael@0 | 897 | for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { |
michael@0 | 898 | char16_t ch = aChars[cur]; |
michael@0 | 899 | int8_t cl; |
michael@0 | 900 | |
michael@0 | 901 | if (NEED_CONTEXTUAL_ANALYSIS(ch)) { |
michael@0 | 902 | cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, |
michael@0 | 903 | ch, |
michael@0 | 904 | cur + 1 < aLength ? aChars[cur + 1] : U_NULL, |
michael@0 | 905 | state); |
michael@0 | 906 | } else { |
michael@0 | 907 | if (ch == U_EQUAL) |
michael@0 | 908 | state.NotifySeenEqualsSign(); |
michael@0 | 909 | state.NotifyNonHyphenCharacter(ch); |
michael@0 | 910 | cl = GetClass(ch); |
michael@0 | 911 | } |
michael@0 | 912 | |
michael@0 | 913 | bool allowBreak = false; |
michael@0 | 914 | if (cur > 0) { |
michael@0 | 915 | if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { |
michael@0 | 916 | allowBreak = (state.UseConservativeBreaking()) ? |
michael@0 | 917 | GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); |
michael@0 | 918 | } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { |
michael@0 | 919 | allowBreak = true; |
michael@0 | 920 | } |
michael@0 | 921 | } |
michael@0 | 922 | aBreakBefore[cur] = allowBreak; |
michael@0 | 923 | if (allowBreak) |
michael@0 | 924 | state.NotifyBreakBefore(); |
michael@0 | 925 | lastClass = cl; |
michael@0 | 926 | } |
michael@0 | 927 | } |