intl/lwbrk/src/nsJISx4051LineBreaker.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6
michael@0 7
michael@0 8 #include "nsJISx4051LineBreaker.h"
michael@0 9
michael@0 10 #include "jisx4051class.h"
michael@0 11 #include "nsComplexBreaker.h"
michael@0 12 #include "nsTArray.h"
michael@0 13
michael@0 14 /*
michael@0 15
michael@0 16 Simplification of Pair Table in JIS X 4051
michael@0 17
michael@0 18 1. The Original Table - in 4.1.3
michael@0 19
michael@0 20 In JIS X 4051, the pair table is defined as below
michael@0 21
michael@0 22 Class of
michael@0 23 Leading Class of Trailing Char Class
michael@0 24 Char
michael@0 25
michael@0 26 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
michael@0 27 * # * #
michael@0 28 1 X X X X X X X X X X X X X X X X X X X X X E
michael@0 29 2 X X X X X X
michael@0 30 3 X X X X X X
michael@0 31 4 X X X X X X
michael@0 32 5 X X X X X X
michael@0 33 6 X X X X X X
michael@0 34 7 X X X X X X X
michael@0 35 8 X X X X X X E
michael@0 36 9 X X X X X X
michael@0 37 10 X X X X X X
michael@0 38 11 X X X X X X
michael@0 39 12 X X X X X X
michael@0 40 13 X X X X X X X
michael@0 41 14 X X X X X X X
michael@0 42 15 X X X X X X X X X
michael@0 43 16 X X X X X X X X
michael@0 44 17 X X X X X E
michael@0 45 18 X X X X X X X X X
michael@0 46 19 X E E E E E X X X X X X X X X X X X E X E E
michael@0 47 20 X X X X X E
michael@0 48
michael@0 49 * Same Char
michael@0 50 # Other Char
michael@0 51
michael@0 52 X Cannot Break
michael@0 53
michael@0 54 The classes mean:
michael@0 55 1: Open parenthesis
michael@0 56 2: Close parenthesis
michael@0 57 3: Prohibit a line break before
michael@0 58 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
michael@0 59 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
michael@0 60 6: Full stop
michael@0 61 7: Non-breakable between same characters
michael@0 62 8: Prefix (e.g., "$", "NO.")
michael@0 63 9: Postfix (e.g., "%")
michael@0 64 10: Ideographic space
michael@0 65 11: Hiragana
michael@0 66 12: Japanese characters (except class 11)
michael@0 67 13: Subscript
michael@0 68 14: Ruby
michael@0 69 15: Numeric
michael@0 70 16: Alphabet
michael@0 71 17: Space for Western language
michael@0 72 18: Western characters (except class 17)
michael@0 73 19: Split line note (Warichu) begin quote
michael@0 74 20: Split line note (Warichu) end quote
michael@0 75
michael@0 76 2. Simplified by removing the classes which we do not care about
michael@0 77
michael@0 78 However, since we do not care about class 13 (Subscript), 14 (Ruby),
michael@0 79 16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
michael@0 80 quote), we can simplify this pair table into the following
michael@0 81
michael@0 82 Class of
michael@0 83 Leading Class of Trailing Char Class
michael@0 84 Char
michael@0 85
michael@0 86 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18
michael@0 87
michael@0 88 1 X X X X X X X X X X X X X X X
michael@0 89 2 X X X X X
michael@0 90 3 X X X X X
michael@0 91 4 X X X X X
michael@0 92 5 X X X X X
michael@0 93 6 X X X X X
michael@0 94 7 X X X X X X
michael@0 95 8 X X X X X X
michael@0 96 9 X X X X X
michael@0 97 10 X X X X X
michael@0 98 11 X X X X X
michael@0 99 12 X X X X X
michael@0 100 15 X X X X X X X X
michael@0 101 17 X X X X X
michael@0 102 18 X X X X X X X
michael@0 103
michael@0 104 3. Simplified by merging classes
michael@0 105
michael@0 106 After the second simplification, the pair table has some duplication
michael@0 107 a. classes 2, 3, 4, 5, 6 are the same - we can merge them
michael@0 108 b. classes 10, 11, 12, 17 are the same - we can merge them
michael@0 109
michael@0 110
michael@0 111 Class of
michael@0 112 Leading Class of Trailing Char Class
michael@0 113 Char
michael@0 114
michael@0 115 1 [a] 7 8 9 [b]15 18
michael@0 116
michael@0 117 1 X X X X X X X X
michael@0 118 [a] X
michael@0 119 7 X X
michael@0 120 8 X X
michael@0 121 9 X
michael@0 122 [b] X
michael@0 123 15 X X X X
michael@0 124 18 X X X
michael@0 125
michael@0 126
michael@0 127 4. We add COMPLEX characters and make them breakable with all other classes
michael@0 128 except after class 1 and before class [a]
michael@0 129
michael@0 130 Class of
michael@0 131 Leading Class of Trailing Char Class
michael@0 132 Char
michael@0 133
michael@0 134 1 [a] 7 8 9 [b]15 18 COMPLEX
michael@0 135
michael@0 136 1 X X X X X X X X X
michael@0 137 [a] X
michael@0 138 7 X X
michael@0 139 8 X X
michael@0 140 9 X
michael@0 141 [b] X
michael@0 142 15 X X X X
michael@0 143 18 X X X
michael@0 144 COMPLEX X T
michael@0 145
michael@0 146 T : need special handling
michael@0 147
michael@0 148
michael@0 149 5. However, we need two special classes for some punctuation/parentheses
michael@0 150 whose breaking rules are like those of character class (18), see bug 389056.
michael@0 151 We also need punctuation-like characters that behave the same as class 18
michael@0 152 but are not letters of any language (e.g., '_').
michael@0 153 [c]. Based on open parenthesis class (1), but it is not breakable after
michael@0 154 character class (18) or numeric class (15).
michael@0 155 [d]. Based on close parenthesis (or punctuation) class (2), but it is not
michael@0 156 breakable before character class (18) or numeric class (15).
michael@0 157
michael@0 158 Class of
michael@0 159 Leading Class of Trailing Char Class
michael@0 160 Char
michael@0 161
michael@0 162 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d]
michael@0 163
michael@0 164 1 X X X X X X X X X X X
michael@0 165 [a] X X X
michael@0 166 7 X X
michael@0 167 8 X X
michael@0 168 9 X
michael@0 169 [b] X X
michael@0 170 15 X X X X X X
michael@0 171 18 X X X X X
michael@0 172 COMPLEX X T
michael@0 173 [c] X X X X X X X X X X X
michael@0 174 [d] X X X X
michael@0 175
michael@0 176
michael@0 177 6. Unicode also has "NON-BREAK" characters. Lines should not be broken around
michael@0 178 them. JIS X 4051 has no such class; therefore, we create [e].
michael@0 179
michael@0 180 Class of
michael@0 181 Leading Class of Trailing Char Class
michael@0 182 Char
michael@0 183
michael@0 184 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
michael@0 185
michael@0 186 1 X X X X X X X X X X X X
michael@0 187 [a] X X X
michael@0 188 7 X X X
michael@0 189 8 X X X
michael@0 190 9 X X
michael@0 191 [b] X X X
michael@0 192 15 X X X X X X X
michael@0 193 18 X X X X X X
michael@0 194 COMPLEX X T X
michael@0 195 [c] X X X X X X X X X X X X
michael@0 196 [d] X X X X X
michael@0 197 [e] X X X X X X X X X X X X
michael@0 198
michael@0 199
michael@0 200 7. Now we use one bit to encode whether it is breakable, and use 2 bytes
michael@0 201 for one row, then the bit table will look like:
michael@0 202
michael@0 203 18 <- 1
michael@0 204
michael@0 205 1 0000 1111 1111 1111 = 0x0FFF
michael@0 206 [a] 0000 1100 0000 0010 = 0x0C02
michael@0 207 7 0000 1000 0000 0110 = 0x0806
michael@0 208 8 0000 1000 0100 0010 = 0x0842
michael@0 209 9 0000 1000 0000 0010 = 0x0802
michael@0 210 [b] 0000 1100 0000 0010 = 0x0C02
michael@0 211 15 0000 1110 1101 0010 = 0x0ED2
michael@0 212 18 0000 1110 1100 0010 = 0x0EC2
michael@0 213 COMPLEX 0000 1001 0000 0010 = 0x0902
michael@0 214 [c] 0000 1111 1111 1111 = 0x0FFF
michael@0 215 [d] 0000 1100 1100 0010 = 0x0CC2
michael@0 216 [e] 0000 1111 1111 1111 = 0x0FFF
michael@0 217 */
michael@0 218
// Number of merged line-breaking classes; also the number of rows in each
// pair table.
#define MAX_CLASSES 12

// Pair table used for normal breaking. Each row is indexed by the class of
// the leading character; bit N of the row corresponds to trailing class N
// (bit 0 is the rightmost column). A set bit means "cannot break here".
static const uint16_t gPair[MAX_CLASSES] = {
  0x0FFF,  // 0: class 1
  0x0C02,  // 1: [a]
  0x0806,  // 2: class 7
  0x0842,  // 3: class 8
  0x0802,  // 4: class 9
  0x0C02,  // 5: [b]
  0x0ED2,  // 6: class 15
  0x0EC2,  // 7: class 18
  0x0902,  // 8: COMPLEX
  0x0FFF,  // 9: [c]
  0x0CC2,  // A: [d]
  0x0FFF   // B: [e]
};
michael@0 235
michael@0 236
michael@0 237 /*
michael@0 238
michael@0 239 8. If the character is not far enough from the word start, the word end and
michael@0 240 another break point, we should not break in non-CJK languages.
michael@0 241 I.e., don't break around 15, 18, [c] and [d], but don't change
michael@0 242 that if they are related to [b].
michael@0 243
michael@0 244 Class of
michael@0 245 Leading Class of Trailing Char Class
michael@0 246 Char
michael@0 247
michael@0 248 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
michael@0 249
michael@0 250 1 X X X X X X X X X X X X
michael@0 251 [a] X X X X X X
michael@0 252 7 X X X X X X X
michael@0 253 8 X X X X X X
michael@0 254 9 X X X X X X
michael@0 255 [b] X X X
michael@0 256 15 X X X X X X X X X X X
michael@0 257 18 X X X X X X X X X X X
michael@0 258 COMPLEX X X X T X X X
michael@0 259 [c] X X X X X X X X X X X X
michael@0 260 [d] X X X X X X X X X X X
michael@0 261 [e] X X X X X X X X X X X X
michael@0 262
michael@0 263 18 <- 1
michael@0 264
michael@0 265 1 0000 1111 1111 1111 = 0x0FFF
michael@0 266 [a] 0000 1110 1100 0010 = 0x0EC2
michael@0 267 7 0000 1110 1100 0110 = 0x0EC6
michael@0 268 8 0000 1110 1100 0010 = 0x0EC2
michael@0 269 9 0000 1110 1100 0010 = 0x0EC2
michael@0 270 [b] 0000 1100 0000 0010 = 0x0C02
michael@0 271 15 0000 1111 1101 1111 = 0x0FDF
michael@0 272 18 0000 1111 1101 1111 = 0x0FDF
michael@0 273 COMPLEX 0000 1111 1100 0010 = 0x0FC2
michael@0 274 [c] 0000 1111 1111 1111 = 0x0FFF
michael@0 275 [d] 0000 1111 1101 1111 = 0x0FDF
michael@0 276 [e] 0000 1111 1111 1111 = 0x0FFF
michael@0 277 */
michael@0 278
michael@0 279 static const uint16_t gPairConservative[MAX_CLASSES] = {
michael@0 280 0x0FFF,
michael@0 281 0x0EC2,
michael@0 282 0x0EC6,
michael@0 283 0x0EC2,
michael@0 284 0x0EC2,
michael@0 285 0x0C02,
michael@0 286 0x0FDF,
michael@0 287 0x0FDF,
michael@0 288 0x0FC2,
michael@0 289 0x0FFF,
michael@0 290 0x0FDF,
michael@0 291 0x0FFF
michael@0 292 };
michael@0 293
michael@0 294
michael@0 295 /*
michael@0 296
michael@0 297 9. Now we map the class to number
michael@0 298
michael@0 299 0: 1
michael@0 300 1: [a]- 2, 3, 4, 5, 6
michael@0 301 2: 7
michael@0 302 3: 8
michael@0 303 4: 9
michael@0 304 5: [b]- 10, 11, 12, 17
michael@0 305 6: 15
michael@0 306 7: 18
michael@0 307 8: COMPLEX
michael@0 308 9: [c]
michael@0 309 A: [d]
michael@0 310 B: [e]
michael@0 311
michael@0 312 and they mean:
michael@0 313 0: Open parenthesis
michael@0 314 1: Punctuation that prohibits break before
michael@0 315 2: Non-breakable between same classes
michael@0 316 3: Prefix
michael@0 317 4: Postfix
michael@0 318 5: Breakable character (Spaces and Most Japanese characters)
michael@0 319 6: Numeric
michael@0 320 7: Characters
michael@0 321 8: Need special handling characters (E.g., Thai)
michael@0 322 9: Open parentheses like Character (See bug 389056)
michael@0 323 A: Close parentheses (or punctuation) like Character (See bug 389056)
michael@0 324 B: Non breakable (See bug 390920)
michael@0 325
michael@0 326 */
michael@0 327
// Sentinel meaning "no class seen yet"; deliberately outside 0..MAX_CLASSES-1.
#define CLASS_NONE                             INT8_MAX

// Merged line-breaking classes. The values are row/column indices into the
// pair tables, so they must stay in sync with those tables.
#define CLASS_OPEN                             0x00
#define CLASS_CLOSE                            0x01
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
#define CLASS_PREFIX                           0x03
#define CLASS_POSTFIX                          0x04
// Historical misspelling, kept as an alias so existing users keep compiling.
#define CLASS_POSTFFIX                         CLASS_POSTFIX
#define CLASS_BREAKABLE                        0x05
#define CLASS_NUMERIC                          0x06
#define CLASS_CHARACTER                        0x07
#define CLASS_COMPLEX                          0x08
#define CLASS_OPEN_LIKE_CHARACTER              0x09
#define CLASS_CLOSE_LIKE_CHARACTER             0x0A
#define CLASS_NON_BREAKABLE                    0x0B

// Code points that receive special treatment in this file.
#define U_NULL              char16_t(0x0000)
#define U_SLASH             char16_t('/')
#define U_SPACE             char16_t(' ')
#define U_HYPHEN            char16_t('-')
#define U_EQUAL             char16_t('=')
#define U_PERCENT           char16_t('%')
#define U_AMPERSAND         char16_t('&')
#define U_SEMICOLON         char16_t(';')
#define U_BACKSLASH         char16_t('\\')
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
#define U_OPEN_GUILLEMET    char16_t(0x00AB)

// True for characters whose class depends on their neighbours and therefore
// has to be decided by ContextualAnalysis() rather than by GetClass() alone.
// Note: the argument is evaluated more than once; pass a plain variable.
#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c)                 || \
                                     (c) == U_SLASH               || \
                                     (c) == U_PERCENT             || \
                                     (c) == U_AMPERSAND           || \
                                     (c) == U_SEMICOLON           || \
                                     (c) == U_BACKSLASH           || \
                                     (c) == U_OPEN_SINGLE_QUOTE   || \
                                     (c) == U_OPEN_DOUBLE_QUOTE   || \
                                     (c) == U_OPEN_GUILLEMET)

// True for U+0030..U+0039. The argument is evaluated twice.
#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
michael@0 367
// Reads the 4-bit entry for index |l| out of the packed table |t|.
// Each 32-bit word holds eight entries; entry 0 of a word is its lowest nibble.
static inline int
GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
{
  const uint32_t word = t[l >> 3];              // eight entries per word
  const unsigned shift = (l & 0x0007) << 2;     // four bits per entry
  return (word >> shift) & 0x000f;
}
michael@0 373
// Returns 1 when |u| lies in U+FF66..U+FF70 (inclusive), 0 otherwise.
// GetClass() maps characters in this range to CLASS_CLOSE.
static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
{
  if (u < 0xff66) {
    return 0;
  }
  return u <= 0xff70;
}
michael@0 379
// Returns non-zero when |u| falls in one of the code point ranges this file
// treats as CJK text:
//   U+1100..U+11FF, U+2E80..U+D7FF, U+F900..U+FAFF, U+FF00..U+FFEF.
static inline int
IS_CJK_CHAR(char16_t u)
{
  if (u >= 0x1100 && u <= 0x11ff) {
    return 1;
  }
  if (u >= 0x2e80 && u <= 0xd7ff) {
    return 1;
  }
  if (u >= 0xf900 && u <= 0xfaff) {
    return 1;
  }
  return u >= 0xff00 && u <= 0xffef;
}
michael@0 388
michael@0 389 static inline bool
michael@0 390 IS_NONBREAKABLE_SPACE(char16_t u)
michael@0 391 {
michael@0 392 return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
michael@0 393 }
michael@0 394
michael@0 395 static inline bool
michael@0 396 IS_HYPHEN(char16_t u)
michael@0 397 {
michael@0 398 return (u == U_HYPHEN ||
michael@0 399 u == 0x058A || // ARMENIAN HYPHEN
michael@0 400 u == 0x2010 || // HYPHEN
michael@0 401 u == 0x2012 || // FIGURE DASH
michael@0 402 u == 0x2013); // EN DASH
michael@0 403 }
michael@0 404
michael@0 405 static int8_t
michael@0 406 GetClass(char16_t u)
michael@0 407 {
michael@0 408 uint16_t h = u & 0xFF00;
michael@0 409 uint16_t l = u & 0x00ff;
michael@0 410 int8_t c;
michael@0 411
michael@0 412 // Handle 3 range table first
michael@0 413 if (0x0000 == h) {
michael@0 414 c = GETCLASSFROMTABLE(gLBClass00, l);
michael@0 415 } else if (0x1700 == h) {
michael@0 416 c = GETCLASSFROMTABLE(gLBClass17, l);
michael@0 417 } else if (NS_NeedsPlatformNativeHandling(u)) {
michael@0 418 c = CLASS_COMPLEX;
michael@0 419 } else if (0x0E00 == h) {
michael@0 420 c = GETCLASSFROMTABLE(gLBClass0E, l);
michael@0 421 } else if (0x2000 == h) {
michael@0 422 c = GETCLASSFROMTABLE(gLBClass20, l);
michael@0 423 } else if (0x2100 == h) {
michael@0 424 c = GETCLASSFROMTABLE(gLBClass21, l);
michael@0 425 } else if (0x3000 == h) {
michael@0 426 c = GETCLASSFROMTABLE(gLBClass30, l);
michael@0 427 } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi
michael@0 428 ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul
michael@0 429 ((0xf900 <= h) && (h <= 0xfaff))) {
michael@0 430 c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility
michael@0 431 } else if (0xff00 == h) {
michael@0 432 if (l < 0x0060) { // Fullwidth ASCII variant
michael@0 433 c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));
michael@0 434 } else if (l < 0x00a0) {
michael@0 435 switch (l) {
michael@0 436 case 0x61: c = GetClass(0x3002); break;
michael@0 437 case 0x62: c = GetClass(0x300c); break;
michael@0 438 case 0x63: c = GetClass(0x300d); break;
michael@0 439 case 0x64: c = GetClass(0x3001); break;
michael@0 440 case 0x65: c = GetClass(0x30fb); break;
michael@0 441 case 0x9e: c = GetClass(0x309b); break;
michael@0 442 case 0x9f: c = GetClass(0x309c); break;
michael@0 443 default:
michael@0 444 if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
michael@0 445 c = CLASS_CLOSE; // jis x4051 class 3
michael@0 446 else
michael@0 447 c = CLASS_BREAKABLE; // jis x4051 class 11
michael@0 448 break;
michael@0 449 }
michael@0 450 // Halfwidth Katakana variants
michael@0 451 } else if (l < 0x00e0) {
michael@0 452 c = CLASS_CHARACTER; // Halfwidth Hangul variants
michael@0 453 } else if (l < 0x00f0) {
michael@0 454 static char16_t NarrowFFEx[16] = {
michael@0 455 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
michael@0 456 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
michael@0 457 };
michael@0 458 c = GetClass(NarrowFFEx[l - 0x00e0]);
michael@0 459 } else {
michael@0 460 c = CLASS_CHARACTER;
michael@0 461 }
michael@0 462 } else if (0x3100 == h) {
michael@0 463 if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
michael@0 464 // XXX: This is per UAX #14, but UAX #14 may change
michael@0 465 // the line breaking rules about Kanbun and Bopomofo.
michael@0 466 c = CLASS_BREAKABLE;
michael@0 467 } else if (l >= 0xf0) { // Katakana small letters for Ainu
michael@0 468 c = CLASS_CLOSE;
michael@0 469 } else { // unassigned
michael@0 470 c = CLASS_CHARACTER;
michael@0 471 }
michael@0 472 } else if (0x0300 == h) {
michael@0 473 if (0x4F == l || (0x5C <= l && l <= 0x62))
michael@0 474 c = CLASS_NON_BREAKABLE;
michael@0 475 else
michael@0 476 c = CLASS_CHARACTER;
michael@0 477 } else if (0x0500 == h) {
michael@0 478 // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
michael@0 479 if (l == 0x8A)
michael@0 480 c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
michael@0 481 else
michael@0 482 c = CLASS_CHARACTER;
michael@0 483 } else if (0x0F00 == h) {
michael@0 484 if (0x08 == l || 0x0C == l || 0x12 == l)
michael@0 485 c = CLASS_NON_BREAKABLE;
michael@0 486 else
michael@0 487 c = CLASS_CHARACTER;
michael@0 488 } else if (0x1800 == h) {
michael@0 489 if (0x0E == l)
michael@0 490 c = CLASS_NON_BREAKABLE;
michael@0 491 else
michael@0 492 c = CLASS_CHARACTER;
michael@0 493 } else if (0x1600 == h) {
michael@0 494 if (0x80 == l) { // U+1680 OGHAM SPACE MARK
michael@0 495 c = CLASS_BREAKABLE;
michael@0 496 } else {
michael@0 497 c = CLASS_CHARACTER;
michael@0 498 }
michael@0 499 } else if (u == 0xfeff) {
michael@0 500 c = CLASS_NON_BREAKABLE;
michael@0 501 } else {
michael@0 502 c = CLASS_CHARACTER; // others
michael@0 503 }
michael@0 504 return c;
michael@0 505 }
michael@0 506
michael@0 507 static bool
michael@0 508 GetPair(int8_t c1, int8_t c2)
michael@0 509 {
michael@0 510 NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
michael@0 511 NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
michael@0 512
michael@0 513 return (0 == ((gPair[c1] >> c2) & 0x0001));
michael@0 514 }
michael@0 515
michael@0 516 static bool
michael@0 517 GetPairConservative(int8_t c1, int8_t c2)
michael@0 518 {
michael@0 519 NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
michael@0 520 NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
michael@0 521
michael@0 522 return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
michael@0 523 }
michael@0 524
michael@0 525 nsJISx4051LineBreaker::nsJISx4051LineBreaker()
michael@0 526 {
michael@0 527 }
michael@0 528
michael@0 529 nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
michael@0 530 {
michael@0 531 }
michael@0 532
michael@0 533 NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
michael@0 534
michael@0 535 class ContextState {
michael@0 536 public:
michael@0 537 ContextState(const char16_t* aText, uint32_t aLength) {
michael@0 538 mUniText = aText;
michael@0 539 mText = nullptr;
michael@0 540 mLength = aLength;
michael@0 541 Init();
michael@0 542 }
michael@0 543
michael@0 544 ContextState(const uint8_t* aText, uint32_t aLength) {
michael@0 545 mUniText = nullptr;
michael@0 546 mText = aText;
michael@0 547 mLength = aLength;
michael@0 548 Init();
michael@0 549 }
michael@0 550
michael@0 551 uint32_t Length() { return mLength; }
michael@0 552 uint32_t Index() { return mIndex; }
michael@0 553
michael@0 554 char16_t GetCharAt(uint32_t aIndex) {
michael@0 555 NS_ASSERTION(aIndex < mLength, "Out of range!");
michael@0 556 return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
michael@0 557 }
michael@0 558
michael@0 559 void AdvanceIndex() {
michael@0 560 ++mIndex;
michael@0 561 }
michael@0 562
michael@0 563 void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
michael@0 564
michael@0 565 // A word of western language should not be broken. But even if the word has
michael@0 566 // only ASCII characters, non-natural context words should be broken, e.g.,
michael@0 567 // URL and file path. For protecting the natural words, we should use
michael@0 568 // conservative breaking rules at following conditions:
michael@0 569 // 1. at near the start of word
michael@0 570 // 2. at near the end of word
michael@0 571 // 3. at near the latest broken point
michael@0 572 // CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
michael@0 573 #define CONSERVATIVE_BREAK_RANGE 6
michael@0 574
michael@0 575 bool UseConservativeBreaking(uint32_t aOffset = 0) {
michael@0 576 if (mHasCJKChar)
michael@0 577 return false;
michael@0 578 uint32_t index = mIndex + aOffset;
michael@0 579 bool result = (index < CONSERVATIVE_BREAK_RANGE ||
michael@0 580 mLength - index < CONSERVATIVE_BREAK_RANGE ||
michael@0 581 index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
michael@0 582 if (result || !mHasNonbreakableSpace)
michael@0 583 return result;
michael@0 584
michael@0 585 // This text has no-breakable space, we need to check whether the index
michael@0 586 // is near it.
michael@0 587
michael@0 588 // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.
michael@0 589 for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
michael@0 590 if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
michael@0 591 return true;
michael@0 592 }
michael@0 593 // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
michael@0 594 for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
michael@0 595 if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
michael@0 596 return true;
michael@0 597 }
michael@0 598 return false;
michael@0 599 }
michael@0 600
michael@0 601 bool HasPreviousEqualsSign() const {
michael@0 602 return mHasPreviousEqualsSign;
michael@0 603 }
michael@0 604 void NotifySeenEqualsSign() {
michael@0 605 mHasPreviousEqualsSign = true;
michael@0 606 }
michael@0 607
michael@0 608 bool HasPreviousSlash() const {
michael@0 609 return mHasPreviousSlash;
michael@0 610 }
michael@0 611 void NotifySeenSlash() {
michael@0 612 mHasPreviousSlash = true;
michael@0 613 }
michael@0 614
michael@0 615 bool HasPreviousBackslash() const {
michael@0 616 return mHasPreviousBackslash;
michael@0 617 }
michael@0 618 void NotifySeenBackslash() {
michael@0 619 mHasPreviousBackslash = true;
michael@0 620 }
michael@0 621
michael@0 622 char16_t GetPreviousNonHyphenCharacter() const {
michael@0 623 return mPreviousNonHyphenCharacter;
michael@0 624 }
michael@0 625 void NotifyNonHyphenCharacter(char16_t ch) {
michael@0 626 mPreviousNonHyphenCharacter = ch;
michael@0 627 }
michael@0 628
michael@0 629 private:
michael@0 630 void Init() {
michael@0 631 mIndex = 0;
michael@0 632 mLastBreakIndex = 0;
michael@0 633 mPreviousNonHyphenCharacter = U_NULL;
michael@0 634 mHasCJKChar = 0;
michael@0 635 mHasNonbreakableSpace = 0;
michael@0 636 mHasPreviousEqualsSign = false;
michael@0 637 mHasPreviousSlash = false;
michael@0 638 mHasPreviousBackslash = false;
michael@0 639
michael@0 640 for (uint32_t i = 0; i < mLength; ++i) {
michael@0 641 char16_t u = GetCharAt(i);
michael@0 642 if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
michael@0 643 mHasNonbreakableSpace = 1;
michael@0 644 else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
michael@0 645 mHasCJKChar = 1;
michael@0 646 }
michael@0 647 }
michael@0 648
michael@0 649 const char16_t* mUniText;
michael@0 650 const uint8_t* mText;
michael@0 651
michael@0 652 uint32_t mIndex;
michael@0 653 uint32_t mLength; // length of text
michael@0 654 uint32_t mLastBreakIndex;
michael@0 655 char16_t mPreviousNonHyphenCharacter; // The last character we have seen
michael@0 656 // which is not U_HYPHEN
michael@0 657 bool mHasCJKChar; // if the text has CJK character, this is true.
michael@0 658 bool mHasNonbreakableSpace; // if the text has no-breakable space,
michael@0 659 // this is true.
michael@0 660 bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
michael@0 661 bool mHasPreviousSlash; // True if we have seen a U_SLASH
michael@0 662 bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH
michael@0 663 };
michael@0 664
michael@0 665 static int8_t
michael@0 666 ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,
michael@0 667 ContextState &aState)
michael@0 668 {
michael@0 669 // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
michael@0 670
michael@0 671 if (IS_HYPHEN(cur)) {
michael@0 672 // If next character is hyphen, we don't need to break between them.
michael@0 673 if (IS_HYPHEN(next))
michael@0 674 return CLASS_CHARACTER;
michael@0 675 // If prev and next characters are numeric, it may be in Math context.
michael@0 676 // So, we should not break here.
michael@0 677 bool prevIsNum = IS_ASCII_DIGIT(prev);
michael@0 678 bool nextIsNum = IS_ASCII_DIGIT(next);
michael@0 679 if (prevIsNum && nextIsNum)
michael@0 680 return CLASS_NUMERIC;
michael@0 681 // If one side is numeric and the other is a character, or if both sides are
michael@0 682 // characters, the hyphen should be breakable.
michael@0 683 if (!aState.UseConservativeBreaking(1)) {
michael@0 684 char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
michael@0 685 if (prevOfHyphen && next) {
michael@0 686 int8_t prevClass = GetClass(prevOfHyphen);
michael@0 687 int8_t nextClass = GetClass(next);
michael@0 688 bool prevIsNumOrCharOrClose =
michael@0 689 prevIsNum ||
michael@0 690 (prevClass == CLASS_CHARACTER &&
michael@0 691 !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
michael@0 692 prevClass == CLASS_CLOSE ||
michael@0 693 prevClass == CLASS_CLOSE_LIKE_CHARACTER;
michael@0 694 bool nextIsNumOrCharOrOpen =
michael@0 695 nextIsNum ||
michael@0 696 (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
michael@0 697 nextClass == CLASS_OPEN ||
michael@0 698 nextClass == CLASS_OPEN_LIKE_CHARACTER ||
michael@0 699 next == U_OPEN_SINGLE_QUOTE ||
michael@0 700 next == U_OPEN_DOUBLE_QUOTE ||
michael@0 701 next == U_OPEN_GUILLEMET;
michael@0 702 if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
michael@0 703 return CLASS_CLOSE;
michael@0 704 }
michael@0 705 }
michael@0 706 }
michael@0 707 } else {
michael@0 708 aState.NotifyNonHyphenCharacter(cur);
michael@0 709 if (cur == U_SLASH || cur == U_BACKSLASH) {
michael@0 710 // If this is immediately after same char, we should not break here.
michael@0 711 if (prev == cur)
michael@0 712 return CLASS_CHARACTER;
michael@0 713 // If this text has two or more (BACK)SLASHs, this may be file path or URL.
michael@0 714 // Make sure to compute shouldReturn before we notify on this slash.
michael@0 715 bool shouldReturn = !aState.UseConservativeBreaking() &&
michael@0 716 (cur == U_SLASH ?
michael@0 717 aState.HasPreviousSlash() : aState.HasPreviousBackslash());
michael@0 718
michael@0 719 if (cur == U_SLASH) {
michael@0 720 aState.NotifySeenSlash();
michael@0 721 } else {
michael@0 722 aState.NotifySeenBackslash();
michael@0 723 }
michael@0 724
michael@0 725 if (shouldReturn)
michael@0 726 return CLASS_OPEN;
michael@0 727 } else if (cur == U_PERCENT) {
michael@0 728 // If this is a part of the param of URL, we should break before.
michael@0 729 if (!aState.UseConservativeBreaking()) {
michael@0 730 if (aState.Index() >= 3 &&
michael@0 731 aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
michael@0 732 return CLASS_OPEN;
michael@0 733 if (aState.Index() + 3 < aState.Length() &&
michael@0 734 aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
michael@0 735 return CLASS_OPEN;
michael@0 736 }
michael@0 737 } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
michael@0 738 // If this may be a separator of params of URL, we should break after.
michael@0 739 if (!aState.UseConservativeBreaking(1) &&
michael@0 740 aState.HasPreviousEqualsSign())
michael@0 741 return CLASS_CLOSE;
michael@0 742 } else if (cur == U_OPEN_SINGLE_QUOTE ||
michael@0 743 cur == U_OPEN_DOUBLE_QUOTE ||
michael@0 744 cur == U_OPEN_GUILLEMET) {
michael@0 745 // for CJK usage, we treat these as openers to allow a break before them,
michael@0 746 // but otherwise treat them as normal characters because quote mark usage
michael@0 747 // in various Western languages varies too much; see bug #450088 discussion.
michael@0 748 if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
michael@0 749 return CLASS_OPEN;
michael@0 750 } else {
michael@0 751 NS_ERROR("Forgot to handle the current character!");
michael@0 752 }
michael@0 753 }
michael@0 754 return GetClass(cur);
michael@0 755 }
michael@0 756
michael@0 757
michael@0 758 int32_t
michael@0 759 nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
michael@0 760 uint32_t aPos, int8_t aDirection)
michael@0 761 {
michael@0 762 bool textNeedsJISx4051 = false;
michael@0 763 int32_t begin, end;
michael@0 764
michael@0 765 for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
michael@0 766 if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
michael@0 767 textNeedsJISx4051 = true;
michael@0 768 }
michael@0 769 }
michael@0 770 for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
michael@0 771 if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
michael@0 772 textNeedsJISx4051 = true;
michael@0 773 }
michael@0 774 }
michael@0 775
michael@0 776 int32_t ret;
michael@0 777 nsAutoTArray<uint8_t, 2000> breakState;
michael@0 778 if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
michael@0 779 // No complex text character, do not try to do complex line break.
michael@0 780 // (This is required for serializers. See Bug #344816.)
michael@0 781 // Also fall back to this when out of memory.
michael@0 782 if (aDirection < 0) {
michael@0 783 ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
michael@0 784 } else {
michael@0 785 ret = end;
michael@0 786 }
michael@0 787 } else {
michael@0 788 GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
michael@0 789 breakState.Elements());
michael@0 790
michael@0 791 ret = aPos;
michael@0 792 do {
michael@0 793 ret += aDirection;
michael@0 794 } while (begin < ret && ret < end && !breakState[ret - begin]);
michael@0 795 }
michael@0 796
michael@0 797 return ret;
michael@0 798 }
michael@0 799
michael@0 800 int32_t
michael@0 801 nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
michael@0 802 uint32_t aPos)
michael@0 803 {
michael@0 804 NS_ASSERTION(aText, "aText shouldn't be null");
michael@0 805 NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
michael@0 806
michael@0 807 int32_t nextPos = WordMove(aText, aLen, aPos, 1);
michael@0 808 return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
michael@0 809 }
michael@0 810
michael@0 811 int32_t
michael@0 812 nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
michael@0 813 uint32_t aPos)
michael@0 814 {
michael@0 815 NS_ASSERTION(aText, "aText shouldn't be null");
michael@0 816 NS_ASSERTION(aLen >= aPos && aPos > 0,
michael@0 817 "Bad position passed to nsJISx4051LineBreaker::Prev");
michael@0 818
michael@0 819 int32_t prevPos = WordMove(aText, aLen, aPos, -1);
michael@0 820 return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
michael@0 821 }
michael@0 822
michael@0 823 void
michael@0 824 nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
michael@0 825 uint8_t aWordBreak,
michael@0 826 uint8_t* aBreakBefore)
michael@0 827 {
michael@0 828 uint32_t cur;
michael@0 829 int8_t lastClass = CLASS_NONE;
michael@0 830 ContextState state(aChars, aLength);
michael@0 831
michael@0 832 for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
michael@0 833 char16_t ch = aChars[cur];
michael@0 834 int8_t cl;
michael@0 835
michael@0 836 if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
michael@0 837 cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
michael@0 838 ch,
michael@0 839 cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
michael@0 840 state);
michael@0 841 } else {
michael@0 842 if (ch == U_EQUAL)
michael@0 843 state.NotifySeenEqualsSign();
michael@0 844 state.NotifyNonHyphenCharacter(ch);
michael@0 845 cl = GetClass(ch);
michael@0 846 }
michael@0 847
michael@0 848 bool allowBreak = false;
michael@0 849 if (cur > 0) {
michael@0 850 NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
michael@0 851 "Loop should have prevented adjacent complex chars here");
michael@0 852 if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
michael@0 853 allowBreak = (state.UseConservativeBreaking()) ?
michael@0 854 GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
michael@0 855 } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
michael@0 856 allowBreak = true;
michael@0 857 }
michael@0 858 }
michael@0 859 aBreakBefore[cur] = allowBreak;
michael@0 860 if (allowBreak)
michael@0 861 state.NotifyBreakBefore();
michael@0 862 lastClass = cl;
michael@0 863 if (CLASS_COMPLEX == cl) {
michael@0 864 uint32_t end = cur + 1;
michael@0 865
michael@0 866 while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
michael@0 867 ++end;
michael@0 868 }
michael@0 869
michael@0 870 NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
michael@0 871
michael@0 872 // We have to consider word-break value again for complex characters
michael@0 873 if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
michael@0 874 // Respect word-break property
michael@0 875 for (uint32_t i = cur; i < end; i++)
michael@0 876 aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
michael@0 877 }
michael@0 878
michael@0 879 // restore breakability at chunk begin, which was always set to false
michael@0 880 // by the complex line breaker
michael@0 881 aBreakBefore[cur] = allowBreak;
michael@0 882
michael@0 883 cur = end - 1;
michael@0 884 }
michael@0 885 }
michael@0 886 }
michael@0 887
michael@0 888 void
michael@0 889 nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
michael@0 890 uint8_t aWordBreak,
michael@0 891 uint8_t* aBreakBefore)
michael@0 892 {
michael@0 893 uint32_t cur;
michael@0 894 int8_t lastClass = CLASS_NONE;
michael@0 895 ContextState state(aChars, aLength);
michael@0 896
michael@0 897 for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
michael@0 898 char16_t ch = aChars[cur];
michael@0 899 int8_t cl;
michael@0 900
michael@0 901 if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
michael@0 902 cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
michael@0 903 ch,
michael@0 904 cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
michael@0 905 state);
michael@0 906 } else {
michael@0 907 if (ch == U_EQUAL)
michael@0 908 state.NotifySeenEqualsSign();
michael@0 909 state.NotifyNonHyphenCharacter(ch);
michael@0 910 cl = GetClass(ch);
michael@0 911 }
michael@0 912
michael@0 913 bool allowBreak = false;
michael@0 914 if (cur > 0) {
michael@0 915 if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
michael@0 916 allowBreak = (state.UseConservativeBreaking()) ?
michael@0 917 GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
michael@0 918 } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
michael@0 919 allowBreak = true;
michael@0 920 }
michael@0 921 }
michael@0 922 aBreakBefore[cur] = allowBreak;
michael@0 923 if (allowBreak)
michael@0 924 state.NotifyBreakBefore();
michael@0 925 lastClass = cl;
michael@0 926 }
michael@0 927 }

mercurial