1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/lwbrk/src/nsJISx4051LineBreaker.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,927 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 + 1.10 + 1.11 +#include "nsJISx4051LineBreaker.h" 1.12 + 1.13 +#include "jisx4051class.h" 1.14 +#include "nsComplexBreaker.h" 1.15 +#include "nsTArray.h" 1.16 + 1.17 +/* 1.18 + 1.19 + Simplification of Pair Table in JIS X 4051 1.20 + 1.21 + 1. The Origion Table - in 4.1.3 1.22 + 1.23 + In JIS x 4051. The pair table is defined as below 1.24 + 1.25 + Class of 1.26 + Leading Class of Trailing Char Class 1.27 + Char 1.28 + 1.29 + 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 1.30 + * # * # 1.31 + 1 X X X X X X X X X X X X X X X X X X X X X E 1.32 + 2 X X X X X X 1.33 + 3 X X X X X X 1.34 + 4 X X X X X X 1.35 + 5 X X X X X X 1.36 + 6 X X X X X X 1.37 + 7 X X X X X X X 1.38 + 8 X X X X X X E 1.39 + 9 X X X X X X 1.40 + 10 X X X X X X 1.41 + 11 X X X X X X 1.42 + 12 X X X X X X 1.43 + 13 X X X X X X X 1.44 + 14 X X X X X X X 1.45 + 15 X X X X X X X X X 1.46 + 16 X X X X X X X X 1.47 + 17 X X X X X E 1.48 + 18 X X X X X X X X X 1.49 + 19 X E E E E E X X X X X X X X X X X X E X E E 1.50 + 20 X X X X X E 1.51 + 1.52 + * Same Char 1.53 + # Other Char 1.54 + 1.55 + X Cannot Break 1.56 + 1.57 + The classes mean: 1.58 + 1: Open parenthesis 1.59 + 2: Close parenthesis 1.60 + 3: Prohibit a line break before 1.61 + 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?") 1.62 + 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) 1.63 + 6: Full stop 1.64 + 7: Non-breakable between same characters 1.65 + 8: Prefix (e.g., "$", "NO.") 1.66 + 9: Postfix (e.g., "%") 1.67 + 10: Ideographic space 1.68 + 11: Hiragana 1.69 + 12: Japanese characters (except class 11) 1.70 + 13: Subscript 1.71 + 14: Ruby 1.72 + 15: Numeric 1.73 + 16: Alphabet 1.74 + 17: Space for Western language 1.75 + 18: Western characters (except class 17) 1.76 + 19: Split line note (Warichu) begin quote 1.77 + 20: Split line note (Warichu) end quote 1.78 + 1.79 + 2. Simplified by remove the class which we do not care 1.80 + 1.81 + However, since we do not care about class 13(Subscript), 14(Ruby), 1.82 + 16 (Aphabet), 19(split line note begin quote), and 20(split line note end 1.83 + quote) we can simplify this par table into the following 1.84 + 1.85 + Class of 1.86 + Leading Class of Trailing Char Class 1.87 + Char 1.88 + 1.89 + 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 1.90 + 1.91 + 1 X X X X X X X X X X X X X X X 1.92 + 2 X X X X X 1.93 + 3 X X X X X 1.94 + 4 X X X X X 1.95 + 5 X X X X X 1.96 + 6 X X X X X 1.97 + 7 X X X X X X 1.98 + 8 X X X X X X 1.99 + 9 X X X X X 1.100 + 10 X X X X X 1.101 + 11 X X X X X 1.102 + 12 X X X X X 1.103 + 15 X X X X X X X X 1.104 + 17 X X X X X 1.105 + 18 X X X X X X X 1.106 + 1.107 + 3. Simplified by merged classes 1.108 + 1.109 + After the 2 simplification, the pair table have some duplication 1.110 + a. class 2, 3, 4, 5, 6, are the same- we can merged them 1.111 + b. class 10, 11, 12, 17 are the same- we can merged them 1.112 + 1.113 + 1.114 + Class of 1.115 + Leading Class of Trailing Char Class 1.116 + Char 1.117 + 1.118 + 1 [a] 7 8 9 [b]15 18 1.119 + 1.120 + 1 X X X X X X X X 1.121 + [a] X 1.122 + 7 X X 1.123 + 8 X X 1.124 + 9 X 1.125 + [b] X 1.126 + 15 X X X X 1.127 + 18 X X X 1.128 + 1.129 + 1.130 + 4. We add COMPLEX characters and make it breakable w/ all ther class 1.131 + except after class 1 and before class [a] 1.132 + 1.133 + Class of 1.134 + Leading Class of Trailing Char Class 1.135 + Char 1.136 + 1.137 + 1 [a] 7 8 9 [b]15 18 COMPLEX 1.138 + 1.139 + 1 X X X X X X X X X 1.140 + [a] X 1.141 + 7 X X 1.142 + 8 X X 1.143 + 9 X 1.144 + [b] X 1.145 + 15 X X X X 1.146 + 18 X X X 1.147 + COMPLEX X T 1.148 + 1.149 + T : need special handling 1.150 + 1.151 + 1.152 + 5. However, we need two special class for some punctuations/parentheses, 1.153 + theirs breaking rules like character class (18), see bug 389056. 1.154 + And also we need character like punctuation that is same behavior with 18, 1.155 + but the characters are not letters of all languages. (e.g., '_') 1.156 + [c]. Based on open parenthesis class (1), but it is not breakable after 1.157 + character class (18) or numeric class (15). 1.158 + [d]. Based on close parenthesis (or punctuation) class (2), but it is not 1.159 + breakable before character class (18) or numeric class (15). 1.160 + 1.161 + Class of 1.162 + Leading Class of Trailing Char Class 1.163 + Char 1.164 + 1.165 + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] 1.166 + 1.167 + 1 X X X X X X X X X X X 1.168 + [a] X X X 1.169 + 7 X X 1.170 + 8 X X 1.171 + 9 X 1.172 + [b] X X 1.173 + 15 X X X X X X 1.174 + 18 X X X X X 1.175 + COMPLEX X T 1.176 + [c] X X X X X X X X X X X 1.177 + [d] X X X X 1.178 + 1.179 + 1.180 + 6. And Unicode has "NON-BREAK" characters. The lines should be broken around 1.181 + them. But in JIS X 4051, such class is not, therefore, we create [e]. 1.182 + 1.183 + Class of 1.184 + Leading Class of Trailing Char Class 1.185 + Char 1.186 + 1.187 + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] 1.188 + 1.189 + 1 X X X X X X X X X X X X 1.190 + [a] X X X 1.191 + 7 X X X 1.192 + 8 X X X 1.193 + 9 X X 1.194 + [b] X X X 1.195 + 15 X X X X X X X 1.196 + 18 X X X X X X 1.197 + COMPLEX X T X 1.198 + [c] X X X X X X X X X X X X 1.199 + [d] X X X X X 1.200 + [e] X X X X X X X X X X X X 1.201 + 1.202 + 1.203 + 7. Now we use one bit to encode weather it is breakable, and use 2 bytes 1.204 + for one row, then the bit table will look like: 1.205 + 1.206 + 18 <- 1 1.207 + 1.208 + 1 0000 1111 1111 1111 = 0x0FFF 1.209 + [a] 0000 1100 0000 0010 = 0x0C02 1.210 + 7 0000 1000 0000 0110 = 0x0806 1.211 + 8 0000 1000 0100 0010 = 0x0842 1.212 + 9 0000 1000 0000 0010 = 0x0802 1.213 + [b] 0000 1100 0000 0010 = 0x0C02 1.214 + 15 0000 1110 1101 0010 = 0x0ED2 1.215 + 18 0000 1110 1100 0010 = 0x0EC2 1.216 + COMPLEX 0000 1001 0000 0010 = 0x0902 1.217 + [c] 0000 1111 1111 1111 = 0x0FFF 1.218 + [d] 0000 1100 1100 0010 = 0x0CC2 1.219 + [e] 0000 1111 1111 1111 = 0x0FFF 1.220 +*/ 1.221 + 1.222 +#define MAX_CLASSES 12 1.223 + 1.224 +static const uint16_t gPair[MAX_CLASSES] = { 1.225 + 0x0FFF, 1.226 + 0x0C02, 1.227 + 0x0806, 1.228 + 0x0842, 1.229 + 0x0802, 1.230 + 0x0C02, 1.231 + 0x0ED2, 1.232 + 0x0EC2, 1.233 + 0x0902, 1.234 + 0x0FFF, 1.235 + 0x0CC2, 1.236 + 0x0FFF 1.237 +}; 1.238 + 1.239 + 1.240 +/* 1.241 + 1.242 + 8. And if the character is not enough far from word start, word end and 1.243 + another break point, we should not break in non-CJK languages. 1.244 + I.e., Don't break around 15, 18, [c] and [d], but don't change 1.245 + that if they are related to [b]. 1.246 + 1.247 + Class of 1.248 + Leading Class of Trailing Char Class 1.249 + Char 1.250 + 1.251 + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] 1.252 + 1.253 + 1 X X X X X X X X X X X X 1.254 + [a] X X X X X X 1.255 + 7 X X X X X X X 1.256 + 8 X X X X X X 1.257 + 9 X X X X X X 1.258 + [b] X X X 1.259 + 15 X X X X X X X X X X X 1.260 + 18 X X X X X X X X X X X 1.261 + COMPLEX X X X T X X X 1.262 + [c] X X X X X X X X X X X X 1.263 + [d] X X X X X X X X X X X 1.264 + [e] X X X X X X X X X X X X 1.265 + 1.266 + 18 <- 1 1.267 + 1.268 + 1 0000 1111 1111 1111 = 0x0FFF 1.269 + [a] 0000 1110 1100 0010 = 0x0EC2 1.270 + 7 0000 1110 1100 0110 = 0x0EC6 1.271 + 8 0000 1110 1100 0010 = 0x0EC2 1.272 + 9 0000 1110 1100 0010 = 0x0EC2 1.273 + [b] 0000 1100 0000 0010 = 0x0C02 1.274 + 15 0000 1111 1101 1111 = 0x0FDF 1.275 + 18 0000 1111 1101 1111 = 0x0FDF 1.276 + COMPLEX 0000 1111 1100 0010 = 0x0FC2 1.277 + [c] 0000 1111 1111 1111 = 0x0FFF 1.278 + [d] 0000 1111 1101 1111 = 0x0FDF 1.279 + [e] 0000 1111 1111 1111 = 0x0FFF 1.280 +*/ 1.281 + 1.282 +static const uint16_t gPairConservative[MAX_CLASSES] = { 1.283 + 0x0FFF, 1.284 + 0x0EC2, 1.285 + 0x0EC6, 1.286 + 0x0EC2, 1.287 + 0x0EC2, 1.288 + 0x0C02, 1.289 + 0x0FDF, 1.290 + 0x0FDF, 1.291 + 0x0FC2, 1.292 + 0x0FFF, 1.293 + 0x0FDF, 1.294 + 0x0FFF 1.295 +}; 1.296 + 1.297 + 1.298 +/* 1.299 + 1.300 + 9. Now we map the class to number 1.301 + 1.302 + 0: 1 1.303 + 1: [a]- 2, 3, 4, 5, 6 1.304 + 2: 7 1.305 + 3: 8 1.306 + 4: 9 1.307 + 5: [b]- 10, 11, 12, 17 1.308 + 6: 15 1.309 + 7: 18 1.310 + 8: COMPLEX 1.311 + 9: [c] 1.312 + A: [d] 1.313 + B: [e] 1.314 + 1.315 + and they mean: 1.316 + 0: Open parenthesis 1.317 + 1: Punctuation that prohibits break before 1.318 + 2: Non-breakable between same classes 1.319 + 3: Prefix 1.320 + 4: Postfix 1.321 + 5: Breakable character (Spaces and Most Japanese characters) 1.322 + 6: Numeric 1.323 + 7: Characters 1.324 + 8: Need special handling characters (E.g., Thai) 1.325 + 9: Open parentheses like Character (See bug 389056) 1.326 + A: Close parenthese (or punctuations) like Character (See bug 389056) 1.327 + B: Non breakable (See bug 390920) 1.328 + 1.329 +*/ 1.330 + 1.331 +#define CLASS_NONE INT8_MAX 1.332 + 1.333 +#define CLASS_OPEN 0x00 1.334 +#define CLASS_CLOSE 0x01 1.335 +#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02 1.336 +#define CLASS_PREFIX 0x03 1.337 +#define CLASS_POSTFFIX 0x04 1.338 +#define CLASS_BREAKABLE 0x05 1.339 +#define CLASS_NUMERIC 0x06 1.340 +#define CLASS_CHARACTER 0x07 1.341 +#define CLASS_COMPLEX 0x08 1.342 +#define CLASS_OPEN_LIKE_CHARACTER 0x09 1.343 +#define CLASS_CLOSE_LIKE_CHARACTER 0x0A 1.344 +#define CLASS_NON_BREAKABLE 0x0B 1.345 + 1.346 +#define U_NULL char16_t(0x0000) 1.347 +#define U_SLASH char16_t('/') 1.348 +#define U_SPACE char16_t(' ') 1.349 +#define U_HYPHEN char16_t('-') 1.350 +#define U_EQUAL char16_t('=') 1.351 +#define U_PERCENT char16_t('%') 1.352 +#define U_AMPERSAND char16_t('&') 1.353 +#define U_SEMICOLON char16_t(';') 1.354 +#define U_BACKSLASH char16_t('\\') 1.355 +#define U_OPEN_SINGLE_QUOTE char16_t(0x2018) 1.356 +#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C) 1.357 +#define U_OPEN_GUILLEMET char16_t(0x00AB) 1.358 + 1.359 +#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \ 1.360 + (c) == U_SLASH || \ 1.361 + (c) == U_PERCENT || \ 1.362 + (c) == U_AMPERSAND || \ 1.363 + (c) == U_SEMICOLON || \ 1.364 + (c) == U_BACKSLASH || \ 1.365 + (c) == U_OPEN_SINGLE_QUOTE || \ 1.366 + (c) == U_OPEN_DOUBLE_QUOTE || \ 1.367 + (c) == U_OPEN_GUILLEMET) 1.368 + 1.369 +#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) 1.370 + 1.371 +static inline int 1.372 +GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) 1.373 +{ 1.374 + return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f); 1.375 +} 1.376 + 1.377 +static inline int 1.378 +IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) 1.379 +{ 1.380 + return ((0xff66 <= (u)) && ((u) <= 0xff70)); 1.381 +} 1.382 + 1.383 +static inline int 1.384 +IS_CJK_CHAR(char16_t u) 1.385 +{ 1.386 + return ((0x1100 <= (u) && (u) <= 0x11ff) || 1.387 + (0x2e80 <= (u) && (u) <= 0xd7ff) || 1.388 + (0xf900 <= (u) && (u) <= 0xfaff) || 1.389 + (0xff00 <= (u) && (u) <= 0xffef) ); 1.390 +} 1.391 + 1.392 +static inline bool 1.393 +IS_NONBREAKABLE_SPACE(char16_t u) 1.394 +{ 1.395 + return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE 1.396 +} 1.397 + 1.398 +static inline bool 1.399 +IS_HYPHEN(char16_t u) 1.400 +{ 1.401 + return (u == U_HYPHEN || 1.402 + u == 0x058A || // ARMENIAN HYPHEN 1.403 + u == 0x2010 || // HYPHEN 1.404 + u == 0x2012 || // FIGURE DASH 1.405 + u == 0x2013); // EN DASH 1.406 +} 1.407 + 1.408 +static int8_t 1.409 +GetClass(char16_t u) 1.410 +{ 1.411 + uint16_t h = u & 0xFF00; 1.412 + uint16_t l = u & 0x00ff; 1.413 + int8_t c; 1.414 + 1.415 + // Handle 3 range table first 1.416 + if (0x0000 == h) { 1.417 + c = GETCLASSFROMTABLE(gLBClass00, l); 1.418 + } else if (0x1700 == h) { 1.419 + c = GETCLASSFROMTABLE(gLBClass17, l); 1.420 + } else if (NS_NeedsPlatformNativeHandling(u)) { 1.421 + c = CLASS_COMPLEX; 1.422 + } else if (0x0E00 == h) { 1.423 + c = GETCLASSFROMTABLE(gLBClass0E, l); 1.424 + } else if (0x2000 == h) { 1.425 + c = GETCLASSFROMTABLE(gLBClass20, l); 1.426 + } else if (0x2100 == h) { 1.427 + c = GETCLASSFROMTABLE(gLBClass21, l); 1.428 + } else if (0x3000 == h) { 1.429 + c = GETCLASSFROMTABLE(gLBClass30, l); 1.430 + } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi 1.431 + ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul 1.432 + ((0xf900 <= h) && (h <= 0xfaff))) { 1.433 + c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility 1.434 + } else if (0xff00 == h) { 1.435 + if (l < 0x0060) { // Fullwidth ASCII variant 1.436 + c = GETCLASSFROMTABLE(gLBClass00, (l+0x20)); 1.437 + } else if (l < 0x00a0) { 1.438 + switch (l) { 1.439 + case 0x61: c = GetClass(0x3002); break; 1.440 + case 0x62: c = GetClass(0x300c); break; 1.441 + case 0x63: c = GetClass(0x300d); break; 1.442 + case 0x64: c = GetClass(0x3001); break; 1.443 + case 0x65: c = GetClass(0x30fb); break; 1.444 + case 0x9e: c = GetClass(0x309b); break; 1.445 + case 0x9f: c = GetClass(0x309c); break; 1.446 + default: 1.447 + if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) 1.448 + c = CLASS_CLOSE; // jis x4051 class 3 1.449 + else 1.450 + c = CLASS_BREAKABLE; // jis x4051 class 11 1.451 + break; 1.452 + } 1.453 + // Halfwidth Katakana variants 1.454 + } else if (l < 0x00e0) { 1.455 + c = CLASS_CHARACTER; // Halfwidth Hangul variants 1.456 + } else if (l < 0x00f0) { 1.457 + static char16_t NarrowFFEx[16] = { 1.458 + 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, 1.459 + 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000 1.460 + }; 1.461 + c = GetClass(NarrowFFEx[l - 0x00e0]); 1.462 + } else { 1.463 + c = CLASS_CHARACTER; 1.464 + } 1.465 + } else if (0x3100 == h) { 1.466 + if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun 1.467 + // XXX: This is per UAX #14, but UAX #14 may change 1.468 + // the line breaking rules about Kanbun and Bopomofo. 1.469 + c = CLASS_BREAKABLE; 1.470 + } else if (l >= 0xf0) { // Katakana small letters for Ainu 1.471 + c = CLASS_CLOSE; 1.472 + } else { // unassigned 1.473 + c = CLASS_CHARACTER; 1.474 + } 1.475 + } else if (0x0300 == h) { 1.476 + if (0x4F == l || (0x5C <= l && l <= 0x62)) 1.477 + c = CLASS_NON_BREAKABLE; 1.478 + else 1.479 + c = CLASS_CHARACTER; 1.480 + } else if (0x0500 == h) { 1.481 + // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) 1.482 + if (l == 0x8A) 1.483 + c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); 1.484 + else 1.485 + c = CLASS_CHARACTER; 1.486 + } else if (0x0F00 == h) { 1.487 + if (0x08 == l || 0x0C == l || 0x12 == l) 1.488 + c = CLASS_NON_BREAKABLE; 1.489 + else 1.490 + c = CLASS_CHARACTER; 1.491 + } else if (0x1800 == h) { 1.492 + if (0x0E == l) 1.493 + c = CLASS_NON_BREAKABLE; 1.494 + else 1.495 + c = CLASS_CHARACTER; 1.496 + } else if (0x1600 == h) { 1.497 + if (0x80 == l) { // U+1680 OGHAM SPACE MARK 1.498 + c = CLASS_BREAKABLE; 1.499 + } else { 1.500 + c = CLASS_CHARACTER; 1.501 + } 1.502 + } else if (u == 0xfeff) { 1.503 + c = CLASS_NON_BREAKABLE; 1.504 + } else { 1.505 + c = CLASS_CHARACTER; // others 1.506 + } 1.507 + return c; 1.508 +} 1.509 + 1.510 +static bool 1.511 +GetPair(int8_t c1, int8_t c2) 1.512 +{ 1.513 + NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); 1.514 + NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); 1.515 + 1.516 + return (0 == ((gPair[c1] >> c2) & 0x0001)); 1.517 +} 1.518 + 1.519 +static bool 1.520 +GetPairConservative(int8_t c1, int8_t c2) 1.521 +{ 1.522 + NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); 1.523 + NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); 1.524 + 1.525 + return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); 1.526 +} 1.527 + 1.528 +nsJISx4051LineBreaker::nsJISx4051LineBreaker() 1.529 +{ 1.530 +} 1.531 + 1.532 +nsJISx4051LineBreaker::~nsJISx4051LineBreaker() 1.533 +{ 1.534 +} 1.535 + 1.536 +NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker) 1.537 + 1.538 +class ContextState { 1.539 +public: 1.540 + ContextState(const char16_t* aText, uint32_t aLength) { 1.541 + mUniText = aText; 1.542 + mText = nullptr; 1.543 + mLength = aLength; 1.544 + Init(); 1.545 + } 1.546 + 1.547 + ContextState(const uint8_t* aText, uint32_t aLength) { 1.548 + mUniText = nullptr; 1.549 + mText = aText; 1.550 + mLength = aLength; 1.551 + Init(); 1.552 + } 1.553 + 1.554 + uint32_t Length() { return mLength; } 1.555 + uint32_t Index() { return mIndex; } 1.556 + 1.557 + char16_t GetCharAt(uint32_t aIndex) { 1.558 + NS_ASSERTION(aIndex < mLength, "Out of range!"); 1.559 + return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); 1.560 + } 1.561 + 1.562 + void AdvanceIndex() { 1.563 + ++mIndex; 1.564 + } 1.565 + 1.566 + void NotifyBreakBefore() { mLastBreakIndex = mIndex; } 1.567 + 1.568 +// A word of western language should not be broken. But even if the word has 1.569 +// only ASCII characters, non-natural context words should be broken, e.g., 1.570 +// URL and file path. For protecting the natural words, we should use 1.571 +// conservative breaking rules at following conditions: 1.572 +// 1. at near the start of word 1.573 +// 2. at near the end of word 1.574 +// 3. at near the latest broken point 1.575 +// CONSERVATIVE_BREAK_RANGE define the 'near' in characters. 1.576 +#define CONSERVATIVE_BREAK_RANGE 6 1.577 + 1.578 + bool UseConservativeBreaking(uint32_t aOffset = 0) { 1.579 + if (mHasCJKChar) 1.580 + return false; 1.581 + uint32_t index = mIndex + aOffset; 1.582 + bool result = (index < CONSERVATIVE_BREAK_RANGE || 1.583 + mLength - index < CONSERVATIVE_BREAK_RANGE || 1.584 + index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE); 1.585 + if (result || !mHasNonbreakableSpace) 1.586 + return result; 1.587 + 1.588 + // This text has no-breakable space, we need to check whether the index 1.589 + // is near it. 1.590 + 1.591 + // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here. 1.592 + for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) { 1.593 + if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1))) 1.594 + return true; 1.595 + } 1.596 + // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE. 1.597 + for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) { 1.598 + if (IS_NONBREAKABLE_SPACE(GetCharAt(i))) 1.599 + return true; 1.600 + } 1.601 + return false; 1.602 + } 1.603 + 1.604 + bool HasPreviousEqualsSign() const { 1.605 + return mHasPreviousEqualsSign; 1.606 + } 1.607 + void NotifySeenEqualsSign() { 1.608 + mHasPreviousEqualsSign = true; 1.609 + } 1.610 + 1.611 + bool HasPreviousSlash() const { 1.612 + return mHasPreviousSlash; 1.613 + } 1.614 + void NotifySeenSlash() { 1.615 + mHasPreviousSlash = true; 1.616 + } 1.617 + 1.618 + bool HasPreviousBackslash() const { 1.619 + return mHasPreviousBackslash; 1.620 + } 1.621 + void NotifySeenBackslash() { 1.622 + mHasPreviousBackslash = true; 1.623 + } 1.624 + 1.625 + char16_t GetPreviousNonHyphenCharacter() const { 1.626 + return mPreviousNonHyphenCharacter; 1.627 + } 1.628 + void NotifyNonHyphenCharacter(char16_t ch) { 1.629 + mPreviousNonHyphenCharacter = ch; 1.630 + } 1.631 + 1.632 +private: 1.633 + void Init() { 1.634 + mIndex = 0; 1.635 + mLastBreakIndex = 0; 1.636 + mPreviousNonHyphenCharacter = U_NULL; 1.637 + mHasCJKChar = 0; 1.638 + mHasNonbreakableSpace = 0; 1.639 + mHasPreviousEqualsSign = false; 1.640 + mHasPreviousSlash = false; 1.641 + mHasPreviousBackslash = false; 1.642 + 1.643 + for (uint32_t i = 0; i < mLength; ++i) { 1.644 + char16_t u = GetCharAt(i); 1.645 + if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) 1.646 + mHasNonbreakableSpace = 1; 1.647 + else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u)) 1.648 + mHasCJKChar = 1; 1.649 + } 1.650 + } 1.651 + 1.652 + const char16_t* mUniText; 1.653 + const uint8_t* mText; 1.654 + 1.655 + uint32_t mIndex; 1.656 + uint32_t mLength; // length of text 1.657 + uint32_t mLastBreakIndex; 1.658 + char16_t mPreviousNonHyphenCharacter; // The last character we have seen 1.659 + // which is not U_HYPHEN 1.660 + bool mHasCJKChar; // if the text has CJK character, this is true. 1.661 + bool mHasNonbreakableSpace; // if the text has no-breakable space, 1.662 + // this is true. 1.663 + bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL 1.664 + bool mHasPreviousSlash; // True if we have seen a U_SLASH 1.665 + bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH 1.666 +}; 1.667 + 1.668 +static int8_t 1.669 +ContextualAnalysis(char16_t prev, char16_t cur, char16_t next, 1.670 + ContextState &aState) 1.671 +{ 1.672 + // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. 1.673 + 1.674 + if (IS_HYPHEN(cur)) { 1.675 + // If next character is hyphen, we don't need to break between them. 1.676 + if (IS_HYPHEN(next)) 1.677 + return CLASS_CHARACTER; 1.678 + // If prev and next characters are numeric, it may be in Math context. 1.679 + // So, we should not break here. 1.680 + bool prevIsNum = IS_ASCII_DIGIT(prev); 1.681 + bool nextIsNum = IS_ASCII_DIGIT(next); 1.682 + if (prevIsNum && nextIsNum) 1.683 + return CLASS_NUMERIC; 1.684 + // If one side is numeric and the other is a character, or if both sides are 1.685 + // characters, the hyphen should be breakable. 1.686 + if (!aState.UseConservativeBreaking(1)) { 1.687 + char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); 1.688 + if (prevOfHyphen && next) { 1.689 + int8_t prevClass = GetClass(prevOfHyphen); 1.690 + int8_t nextClass = GetClass(next); 1.691 + bool prevIsNumOrCharOrClose = 1.692 + prevIsNum || 1.693 + (prevClass == CLASS_CHARACTER && 1.694 + !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) || 1.695 + prevClass == CLASS_CLOSE || 1.696 + prevClass == CLASS_CLOSE_LIKE_CHARACTER; 1.697 + bool nextIsNumOrCharOrOpen = 1.698 + nextIsNum || 1.699 + (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) || 1.700 + nextClass == CLASS_OPEN || 1.701 + nextClass == CLASS_OPEN_LIKE_CHARACTER || 1.702 + next == U_OPEN_SINGLE_QUOTE || 1.703 + next == U_OPEN_DOUBLE_QUOTE || 1.704 + next == U_OPEN_GUILLEMET; 1.705 + if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) { 1.706 + return CLASS_CLOSE; 1.707 + } 1.708 + } 1.709 + } 1.710 + } else { 1.711 + aState.NotifyNonHyphenCharacter(cur); 1.712 + if (cur == U_SLASH || cur == U_BACKSLASH) { 1.713 + // If this is immediately after same char, we should not break here. 1.714 + if (prev == cur) 1.715 + return CLASS_CHARACTER; 1.716 + // If this text has two or more (BACK)SLASHs, this may be file path or URL. 1.717 + // Make sure to compute shouldReturn before we notify on this slash. 1.718 + bool shouldReturn = !aState.UseConservativeBreaking() && 1.719 + (cur == U_SLASH ? 1.720 + aState.HasPreviousSlash() : aState.HasPreviousBackslash()); 1.721 + 1.722 + if (cur == U_SLASH) { 1.723 + aState.NotifySeenSlash(); 1.724 + } else { 1.725 + aState.NotifySeenBackslash(); 1.726 + } 1.727 + 1.728 + if (shouldReturn) 1.729 + return CLASS_OPEN; 1.730 + } else if (cur == U_PERCENT) { 1.731 + // If this is a part of the param of URL, we should break before. 1.732 + if (!aState.UseConservativeBreaking()) { 1.733 + if (aState.Index() >= 3 && 1.734 + aState.GetCharAt(aState.Index() - 3) == U_PERCENT) 1.735 + return CLASS_OPEN; 1.736 + if (aState.Index() + 3 < aState.Length() && 1.737 + aState.GetCharAt(aState.Index() + 3) == U_PERCENT) 1.738 + return CLASS_OPEN; 1.739 + } 1.740 + } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) { 1.741 + // If this may be a separator of params of URL, we should break after. 1.742 + if (!aState.UseConservativeBreaking(1) && 1.743 + aState.HasPreviousEqualsSign()) 1.744 + return CLASS_CLOSE; 1.745 + } else if (cur == U_OPEN_SINGLE_QUOTE || 1.746 + cur == U_OPEN_DOUBLE_QUOTE || 1.747 + cur == U_OPEN_GUILLEMET) { 1.748 + // for CJK usage, we treat these as openers to allow a break before them, 1.749 + // but otherwise treat them as normal characters because quote mark usage 1.750 + // in various Western languages varies too much; see bug #450088 discussion. 1.751 + if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next)) 1.752 + return CLASS_OPEN; 1.753 + } else { 1.754 + NS_ERROR("Forgot to handle the current character!"); 1.755 + } 1.756 + } 1.757 + return GetClass(cur); 1.758 +} 1.759 + 1.760 + 1.761 +int32_t 1.762 +nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen, 1.763 + uint32_t aPos, int8_t aDirection) 1.764 +{ 1.765 + bool textNeedsJISx4051 = false; 1.766 + int32_t begin, end; 1.767 + 1.768 + for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { 1.769 + if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) { 1.770 + textNeedsJISx4051 = true; 1.771 + } 1.772 + } 1.773 + for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) { 1.774 + if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) { 1.775 + textNeedsJISx4051 = true; 1.776 + } 1.777 + } 1.778 + 1.779 + int32_t ret; 1.780 + nsAutoTArray<uint8_t, 2000> breakState; 1.781 + if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) { 1.782 + // No complex text character, do not try to do complex line break. 1.783 + // (This is required for serializers. See Bug #344816.) 1.784 + // Also fall back to this when out of memory. 1.785 + if (aDirection < 0) { 1.786 + ret = (begin == int32_t(aPos)) ? begin - 1 : begin; 1.787 + } else { 1.788 + ret = end; 1.789 + } 1.790 + } else { 1.791 + GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal, 1.792 + breakState.Elements()); 1.793 + 1.794 + ret = aPos; 1.795 + do { 1.796 + ret += aDirection; 1.797 + } while (begin < ret && ret < end && !breakState[ret - begin]); 1.798 + } 1.799 + 1.800 + return ret; 1.801 +} 1.802 + 1.803 +int32_t 1.804 +nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen, 1.805 + uint32_t aPos) 1.806 +{ 1.807 + NS_ASSERTION(aText, "aText shouldn't be null"); 1.808 + NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next"); 1.809 + 1.810 + int32_t nextPos = WordMove(aText, aLen, aPos, 1); 1.811 + return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT; 1.812 +} 1.813 + 1.814 +int32_t 1.815 +nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen, 1.816 + uint32_t aPos) 1.817 +{ 1.818 + NS_ASSERTION(aText, "aText shouldn't be null"); 1.819 + NS_ASSERTION(aLen >= aPos && aPos > 0, 1.820 + "Bad position passed to nsJISx4051LineBreaker::Prev"); 1.821 + 1.822 + int32_t prevPos = WordMove(aText, aLen, aPos, -1); 1.823 + return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT; 1.824 +} 1.825 + 1.826 +void 1.827 +nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength, 1.828 + uint8_t aWordBreak, 1.829 + uint8_t* aBreakBefore) 1.830 +{ 1.831 + uint32_t cur; 1.832 + int8_t lastClass = CLASS_NONE; 1.833 + ContextState state(aChars, aLength); 1.834 + 1.835 + for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { 1.836 + char16_t ch = aChars[cur]; 1.837 + int8_t cl; 1.838 + 1.839 + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { 1.840 + cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, 1.841 + ch, 1.842 + cur + 1 < aLength ? aChars[cur + 1] : U_NULL, 1.843 + state); 1.844 + } else { 1.845 + if (ch == U_EQUAL) 1.846 + state.NotifySeenEqualsSign(); 1.847 + state.NotifyNonHyphenCharacter(ch); 1.848 + cl = GetClass(ch); 1.849 + } 1.850 + 1.851 + bool allowBreak = false; 1.852 + if (cur > 0) { 1.853 + NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, 1.854 + "Loop should have prevented adjacent complex chars here"); 1.855 + if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { 1.856 + allowBreak = (state.UseConservativeBreaking()) ? 1.857 + GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); 1.858 + } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { 1.859 + allowBreak = true; 1.860 + } 1.861 + } 1.862 + aBreakBefore[cur] = allowBreak; 1.863 + if (allowBreak) 1.864 + state.NotifyBreakBefore(); 1.865 + lastClass = cl; 1.866 + if (CLASS_COMPLEX == cl) { 1.867 + uint32_t end = cur + 1; 1.868 + 1.869 + while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) { 1.870 + ++end; 1.871 + } 1.872 + 1.873 + NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur); 1.874 + 1.875 + // We have to consider word-break value again for complex characters 1.876 + if (aWordBreak != nsILineBreaker::kWordBreak_Normal) { 1.877 + // Respect word-break property 1.878 + for (uint32_t i = cur; i < end; i++) 1.879 + aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll); 1.880 + } 1.881 + 1.882 + // restore breakability at chunk begin, which was always set to false 1.883 + // by the complex line breaker 1.884 + aBreakBefore[cur] = allowBreak; 1.885 + 1.886 + cur = end - 1; 1.887 + } 1.888 + } 1.889 +} 1.890 + 1.891 +void 1.892 +nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength, 1.893 + uint8_t aWordBreak, 1.894 + uint8_t* aBreakBefore) 1.895 +{ 1.896 + uint32_t cur; 1.897 + int8_t lastClass = CLASS_NONE; 1.898 + ContextState state(aChars, aLength); 1.899 + 1.900 + for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { 1.901 + char16_t ch = aChars[cur]; 1.902 + int8_t cl; 1.903 + 1.904 + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { 1.905 + cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, 1.906 + ch, 1.907 + cur + 1 < aLength ? aChars[cur + 1] : U_NULL, 1.908 + state); 1.909 + } else { 1.910 + if (ch == U_EQUAL) 1.911 + state.NotifySeenEqualsSign(); 1.912 + state.NotifyNonHyphenCharacter(ch); 1.913 + cl = GetClass(ch); 1.914 + } 1.915 + 1.916 + bool allowBreak = false; 1.917 + if (cur > 0) { 1.918 + if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { 1.919 + allowBreak = (state.UseConservativeBreaking()) ? 1.920 + GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); 1.921 + } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { 1.922 + allowBreak = true; 1.923 + } 1.924 + } 1.925 + aBreakBefore[cur] = allowBreak; 1.926 + if (allowBreak) 1.927 + state.NotifyBreakBefore(); 1.928 + lastClass = cl; 1.929 + } 1.930 +}