|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 |
|
7 |
|
8 #include "nsJISx4051LineBreaker.h" |
|
9 |
|
10 #include "jisx4051class.h" |
|
11 #include "nsComplexBreaker.h" |
|
12 #include "nsTArray.h" |
|
13 |
|
14 /* |
|
15 |
|
16 Simplification of Pair Table in JIS X 4051 |
|
17 |
|
18 1. The Original Table - in 4.1.3 |
|
19 |
|
20 In JIS x 4051. The pair table is defined as below |
|
21 |
|
22 Class of |
|
23 Leading Class of Trailing Char Class |
|
24 Char |
|
25 |
|
26 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 |
|
27 * # * # |
|
28 1 X X X X X X X X X X X X X X X X X X X X X E |
|
29 2 X X X X X X |
|
30 3 X X X X X X |
|
31 4 X X X X X X |
|
32 5 X X X X X X |
|
33 6 X X X X X X |
|
34 7 X X X X X X X |
|
35 8 X X X X X X E |
|
36 9 X X X X X X |
|
37 10 X X X X X X |
|
38 11 X X X X X X |
|
39 12 X X X X X X |
|
40 13 X X X X X X X |
|
41 14 X X X X X X X |
|
42 15 X X X X X X X X X |
|
43 16 X X X X X X X X |
|
44 17 X X X X X E |
|
45 18 X X X X X X X X X |
|
46 19 X E E E E E X X X X X X X X X X X X E X E E |
|
47 20 X X X X X E |
|
48 |
|
49 * Same Char |
|
50 # Other Char |
|
51 |
|
52 X Cannot Break |
|
53 |
|
54 The classes mean: |
|
55 1: Open parenthesis |
|
56 2: Close parenthesis |
|
57 3: Prohibit a line break before |
|
58 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?") |
|
59 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) |
|
60 6: Full stop |
|
61 7: Non-breakable between same characters |
|
62 8: Prefix (e.g., "$", "NO.") |
|
63 9: Postfix (e.g., "%") |
|
64 10: Ideographic space |
|
65 11: Hiragana |
|
66 12: Japanese characters (except class 11) |
|
67 13: Subscript |
|
68 14: Ruby |
|
69 15: Numeric |
|
70 16: Alphabet |
|
71 17: Space for Western language |
|
72 18: Western characters (except class 17) |
|
73 19: Split line note (Warichu) begin quote |
|
74 20: Split line note (Warichu) end quote |
|
75 |
|
76 2. Simplified by remove the class which we do not care |
|
77 |
|
78 However, since we do not care about class 13(Subscript), 14(Ruby), |
|
79 16 (Alphabet), 19(split line note begin quote), and 20(split line note end |
|
80 quote) we can simplify this pair table into the following |
|
81 |
|
82 Class of |
|
83 Leading Class of Trailing Char Class |
|
84 Char |
|
85 |
|
86 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 |
|
87 |
|
88 1 X X X X X X X X X X X X X X X |
|
89 2 X X X X X |
|
90 3 X X X X X |
|
91 4 X X X X X |
|
92 5 X X X X X |
|
93 6 X X X X X |
|
94 7 X X X X X X |
|
95 8 X X X X X X |
|
96 9 X X X X X |
|
97 10 X X X X X |
|
98 11 X X X X X |
|
99 12 X X X X X |
|
100 15 X X X X X X X X |
|
101 17 X X X X X |
|
102 18 X X X X X X X |
|
103 |
|
104 3. Simplified by merged classes |
|
105 |
|
106 After the 2nd simplification, the pair table has some duplication |
|
107 a. class 2, 3, 4, 5, 6, are the same- we can merged them |
|
108 b. class 10, 11, 12, 17 are the same- we can merged them |
|
109 |
|
110 |
|
111 Class of |
|
112 Leading Class of Trailing Char Class |
|
113 Char |
|
114 |
|
115 1 [a] 7 8 9 [b]15 18 |
|
116 |
|
117 1 X X X X X X X X |
|
118 [a] X |
|
119 7 X X |
|
120 8 X X |
|
121 9 X |
|
122 [b] X |
|
123 15 X X X X |
|
124 18 X X X |
|
125 |
|
126 |
|
127 4. We add COMPLEX characters and make them breakable with all other classes |
|
128 except after class 1 and before class [a] |
|
129 |
|
130 Class of |
|
131 Leading Class of Trailing Char Class |
|
132 Char |
|
133 |
|
134 1 [a] 7 8 9 [b]15 18 COMPLEX |
|
135 |
|
136 1 X X X X X X X X X |
|
137 [a] X |
|
138 7 X X |
|
139 8 X X |
|
140 9 X |
|
141 [b] X |
|
142 15 X X X X |
|
143 18 X X X |
|
144 COMPLEX X T |
|
145 |
|
146 T : need special handling |
|
147 |
|
148 |
|
149 5. However, we need two special class for some punctuations/parentheses, |
|
150 theirs breaking rules like character class (18), see bug 389056. |
|
151 And also we need character like punctuation that is same behavior with 18, |
|
152 but the characters are not letters of all languages. (e.g., '_') |
|
153 [c]. Based on open parenthesis class (1), but it is not breakable after |
|
154 character class (18) or numeric class (15). |
|
155 [d]. Based on close parenthesis (or punctuation) class (2), but it is not |
|
156 breakable before character class (18) or numeric class (15). |
|
157 |
|
158 Class of |
|
159 Leading Class of Trailing Char Class |
|
160 Char |
|
161 |
|
162 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] |
|
163 |
|
164 1 X X X X X X X X X X X |
|
165 [a] X X X |
|
166 7 X X |
|
167 8 X X |
|
168 9 X |
|
169 [b] X X |
|
170 15 X X X X X X |
|
171 18 X X X X X |
|
172 COMPLEX X T |
|
173 [c] X X X X X X X X X X X |
|
174 [d] X X X X |
|
175 |
|
176 |
|
177 6. And Unicode has "NON-BREAK" characters. The lines should be broken around |
|
178 them. But in JIS X 4051, such class is not, therefore, we create [e]. |
|
179 |
|
180 Class of |
|
181 Leading Class of Trailing Char Class |
|
182 Char |
|
183 |
|
184 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] |
|
185 |
|
186 1 X X X X X X X X X X X X |
|
187 [a] X X X |
|
188 7 X X X |
|
189 8 X X X |
|
190 9 X X |
|
191 [b] X X X |
|
192 15 X X X X X X X |
|
193 18 X X X X X X |
|
194 COMPLEX X T X |
|
195 [c] X X X X X X X X X X X X |
|
196 [d] X X X X X |
|
197 [e] X X X X X X X X X X X X |
|
198 |
|
199 |
|
200 7. Now we use one bit to encode whether it is breakable, and use 2 bytes |
|
201 for one row, then the bit table will look like: |
|
202 |
|
203 18 <- 1 |
|
204 |
|
205 1 0000 1111 1111 1111 = 0x0FFF |
|
206 [a] 0000 1100 0000 0010 = 0x0C02 |
|
207 7 0000 1000 0000 0110 = 0x0806 |
|
208 8 0000 1000 0100 0010 = 0x0842 |
|
209 9 0000 1000 0000 0010 = 0x0802 |
|
210 [b] 0000 1100 0000 0010 = 0x0C02 |
|
211 15 0000 1110 1101 0010 = 0x0ED2 |
|
212 18 0000 1110 1100 0010 = 0x0EC2 |
|
213 COMPLEX 0000 1001 0000 0010 = 0x0902 |
|
214 [c] 0000 1111 1111 1111 = 0x0FFF |
|
215 [d] 0000 1100 1100 0010 = 0x0CC2 |
|
216 [e] 0000 1111 1111 1111 = 0x0FFF |
|
217 */ |
|
218 |
|
// Number of line-break classes, i.e. the number of rows in each pair table.
#define MAX_CLASSES 12

// Normal pair table (see step 7 in the comment above).  There is one 16-bit
// row per leading class; bit N of a row is set when a break is NOT allowed
// before a trailing character whose class number is N.
static const uint16_t gPair[MAX_CLASSES] = {
  0x0FFF,  // 0: 1
  0x0C02,  // 1: [a]
  0x0806,  // 2: 7
  0x0842,  // 3: 8
  0x0802,  // 4: 9
  0x0C02,  // 5: [b]
  0x0ED2,  // 6: 15
  0x0EC2,  // 7: 18
  0x0902,  // 8: COMPLEX
  0x0FFF,  // 9: [c]
  0x0CC2,  // A: [d]
  0x0FFF   // B: [e]
};
|
235 |
|
236 |
|
237 /* |
|
238 |
|
239 8. And if the character is not enough far from word start, word end and |
|
240 another break point, we should not break in non-CJK languages. |
|
241 I.e., Don't break around 15, 18, [c] and [d], but don't change |
|
242 that if they are related to [b]. |
|
243 |
|
244 Class of |
|
245 Leading Class of Trailing Char Class |
|
246 Char |
|
247 |
|
248 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] |
|
249 |
|
250 1 X X X X X X X X X X X X |
|
251 [a] X X X X X X |
|
252 7 X X X X X X X |
|
253 8 X X X X X X |
|
254 9 X X X X X X |
|
255 [b] X X X |
|
256 15 X X X X X X X X X X X |
|
257 18 X X X X X X X X X X X |
|
258 COMPLEX X X X T X X X |
|
259 [c] X X X X X X X X X X X X |
|
260 [d] X X X X X X X X X X X |
|
261 [e] X X X X X X X X X X X X |
|
262 |
|
263 18 <- 1 |
|
264 |
|
265 1 0000 1111 1111 1111 = 0x0FFF |
|
266 [a] 0000 1110 1100 0010 = 0x0EC2 |
|
267 7 0000 1110 1100 0110 = 0x0EC6 |
|
268 8 0000 1110 1100 0010 = 0x0EC2 |
|
269 9 0000 1110 1100 0010 = 0x0EC2 |
|
270 [b] 0000 1100 0000 0010 = 0x0C02 |
|
271 15 0000 1111 1101 1111 = 0x0FDF |
|
272 18 0000 1111 1101 1111 = 0x0FDF |
|
273 COMPLEX 0000 1111 1100 0010 = 0x0FC2 |
|
274 [c] 0000 1111 1111 1111 = 0x0FFF |
|
275 [d] 0000 1111 1101 1111 = 0x0FDF |
|
276 [e] 0000 1111 1111 1111 = 0x0FFF |
|
277 */ |
|
278 |
|
279 static const uint16_t gPairConservative[MAX_CLASSES] = { |
|
280 0x0FFF, |
|
281 0x0EC2, |
|
282 0x0EC6, |
|
283 0x0EC2, |
|
284 0x0EC2, |
|
285 0x0C02, |
|
286 0x0FDF, |
|
287 0x0FDF, |
|
288 0x0FC2, |
|
289 0x0FFF, |
|
290 0x0FDF, |
|
291 0x0FFF |
|
292 }; |
|
293 |
|
294 |
|
295 /* |
|
296 |
|
297 9. Now we map the class to number |
|
298 |
|
299 0: 1 |
|
300 1: [a]- 2, 3, 4, 5, 6 |
|
301 2: 7 |
|
302 3: 8 |
|
303 4: 9 |
|
304 5: [b]- 10, 11, 12, 17 |
|
305 6: 15 |
|
306 7: 18 |
|
307 8: COMPLEX |
|
308 9: [c] |
|
309 A: [d] |
|
310 B: [e] |
|
311 |
|
312 and they mean: |
|
313 0: Open parenthesis |
|
314 1: Punctuation that prohibits break before |
|
315 2: Non-breakable between same classes |
|
316 3: Prefix |
|
317 4: Postfix |
|
318 5: Breakable character (Spaces and Most Japanese characters) |
|
319 6: Numeric |
|
320 7: Characters |
|
321 8: Need special handling characters (E.g., Thai) |
|
322 9: Open parentheses like Character (See bug 389056) |
|
323 A: Close parentheses (or punctuations) like Character (See bug 389056) |
|
324 B: Non breakable (See bug 390920) |
|
325 |
|
326 */ |
|
327 |
|
// Sentinel class value; it is larger than every real class number below.
#define CLASS_NONE                             INT8_MAX

// Line-break class numbers.  The numeric value is both the row index and the
// bit position used in the pair tables.
#define CLASS_OPEN                             0x00
#define CLASS_CLOSE                            0x01
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
#define CLASS_PREFIX                           0x03
#define CLASS_POSTFFIX                         0x04  // (sic) spelling kept as-is
#define CLASS_BREAKABLE                        0x05
#define CLASS_NUMERIC                          0x06
#define CLASS_CHARACTER                        0x07
#define CLASS_COMPLEX                          0x08
#define CLASS_OPEN_LIKE_CHARACTER              0x09
#define CLASS_CLOSE_LIKE_CHARACTER             0x0A
#define CLASS_NON_BREAKABLE                    0x0B

// Individual UTF-16 code units that get special treatment.
#define U_NULL              char16_t(0x0000)
#define U_SLASH             char16_t('/')
#define U_SPACE             char16_t(' ')
#define U_HYPHEN            char16_t('-')
#define U_EQUAL             char16_t('=')
#define U_PERCENT           char16_t('%')
#define U_AMPERSAND         char16_t('&')
#define U_SEMICOLON         char16_t(';')
#define U_BACKSLASH         char16_t('\\')
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
#define U_OPEN_GUILLEMET    char16_t(0x00AB)

// True for characters whose class depends on their neighbours.
// Note: the argument is evaluated more than once.
#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c)                 || \
                                     (c) == U_SLASH               || \
                                     (c) == U_PERCENT             || \
                                     (c) == U_AMPERSAND           || \
                                     (c) == U_SEMICOLON           || \
                                     (c) == U_BACKSLASH           || \
                                     (c) == U_OPEN_SINGLE_QUOTE   || \
                                     (c) == U_OPEN_DOUBLE_QUOTE   || \
                                     (c) == U_OPEN_GUILLEMET)

// True for U+0030..U+0039.  Note: the argument is evaluated twice.
#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
|
367 |
|
// Read one 4-bit entry from a packed class table: each 32-bit word of |t|
// holds eight entries, lowest nibble first, and |l| is the entry index.
static inline int
GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
{
  const uint32_t packedWord = t[l >> 3];
  const unsigned nibbleShift = (l & 0x0007) << 2;
  return (packedWord >> nibbleShift) & 0x000f;
}
|
373 |
|
// Non-zero when |u| lies in U+FF66..U+FF70 (inclusive).
static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
{
  if (u < 0xff66) {
    return 0;
  }
  return (u <= 0xff70) ? 1 : 0;
}
|
379 |
|
// Non-zero when |u| falls in one of the ranges this file treats as CJK:
// U+1100..U+11FF, U+2E80..U+D7FF, U+F900..U+FAFF or U+FF00..U+FFEF.
static inline int
IS_CJK_CHAR(char16_t u)
{
  // The ranges are disjoint and ascending, so walk them from the bottom up.
  if (u < 0x1100) {
    return 0;
  }
  if (u <= 0x11ff) {
    return 1;
  }
  if (u < 0x2e80) {
    return 0;
  }
  if (u <= 0xd7ff) {
    return 1;
  }
  if (u < 0xf900) {
    return 0;
  }
  if (u <= 0xfaff) {
    return 1;
  }
  return (0xff00 <= u && u <= 0xffef) ? 1 : 0;
}
|
388 |
|
// True for the two space characters that must not be broken at.
static inline bool
IS_NONBREAKABLE_SPACE(char16_t u)
{
  switch (u) {
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2007:  // FIGURE SPACE
      return true;
    default:
      return false;
  }
}
|
394 |
|
395 static inline bool |
|
396 IS_HYPHEN(char16_t u) |
|
397 { |
|
398 return (u == U_HYPHEN || |
|
399 u == 0x058A || // ARMENIAN HYPHEN |
|
400 u == 0x2010 || // HYPHEN |
|
401 u == 0x2012 || // FIGURE DASH |
|
402 u == 0x2013); // EN DASH |
|
403 } |
|
404 |
|
405 static int8_t |
|
406 GetClass(char16_t u) |
|
407 { |
|
408 uint16_t h = u & 0xFF00; |
|
409 uint16_t l = u & 0x00ff; |
|
410 int8_t c; |
|
411 |
|
412 // Handle 3 range table first |
|
413 if (0x0000 == h) { |
|
414 c = GETCLASSFROMTABLE(gLBClass00, l); |
|
415 } else if (0x1700 == h) { |
|
416 c = GETCLASSFROMTABLE(gLBClass17, l); |
|
417 } else if (NS_NeedsPlatformNativeHandling(u)) { |
|
418 c = CLASS_COMPLEX; |
|
419 } else if (0x0E00 == h) { |
|
420 c = GETCLASSFROMTABLE(gLBClass0E, l); |
|
421 } else if (0x2000 == h) { |
|
422 c = GETCLASSFROMTABLE(gLBClass20, l); |
|
423 } else if (0x2100 == h) { |
|
424 c = GETCLASSFROMTABLE(gLBClass21, l); |
|
425 } else if (0x3000 == h) { |
|
426 c = GETCLASSFROMTABLE(gLBClass30, l); |
|
427 } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi |
|
428 ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul |
|
429 ((0xf900 <= h) && (h <= 0xfaff))) { |
|
430 c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility |
|
431 } else if (0xff00 == h) { |
|
432 if (l < 0x0060) { // Fullwidth ASCII variant |
|
433 c = GETCLASSFROMTABLE(gLBClass00, (l+0x20)); |
|
434 } else if (l < 0x00a0) { |
|
435 switch (l) { |
|
436 case 0x61: c = GetClass(0x3002); break; |
|
437 case 0x62: c = GetClass(0x300c); break; |
|
438 case 0x63: c = GetClass(0x300d); break; |
|
439 case 0x64: c = GetClass(0x3001); break; |
|
440 case 0x65: c = GetClass(0x30fb); break; |
|
441 case 0x9e: c = GetClass(0x309b); break; |
|
442 case 0x9f: c = GetClass(0x309c); break; |
|
443 default: |
|
444 if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) |
|
445 c = CLASS_CLOSE; // jis x4051 class 3 |
|
446 else |
|
447 c = CLASS_BREAKABLE; // jis x4051 class 11 |
|
448 break; |
|
449 } |
|
450 // Halfwidth Katakana variants |
|
451 } else if (l < 0x00e0) { |
|
452 c = CLASS_CHARACTER; // Halfwidth Hangul variants |
|
453 } else if (l < 0x00f0) { |
|
454 static char16_t NarrowFFEx[16] = { |
|
455 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, |
|
456 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000 |
|
457 }; |
|
458 c = GetClass(NarrowFFEx[l - 0x00e0]); |
|
459 } else { |
|
460 c = CLASS_CHARACTER; |
|
461 } |
|
462 } else if (0x3100 == h) { |
|
463 if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun |
|
464 // XXX: This is per UAX #14, but UAX #14 may change |
|
465 // the line breaking rules about Kanbun and Bopomofo. |
|
466 c = CLASS_BREAKABLE; |
|
467 } else if (l >= 0xf0) { // Katakana small letters for Ainu |
|
468 c = CLASS_CLOSE; |
|
469 } else { // unassigned |
|
470 c = CLASS_CHARACTER; |
|
471 } |
|
472 } else if (0x0300 == h) { |
|
473 if (0x4F == l || (0x5C <= l && l <= 0x62)) |
|
474 c = CLASS_NON_BREAKABLE; |
|
475 else |
|
476 c = CLASS_CHARACTER; |
|
477 } else if (0x0500 == h) { |
|
478 // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) |
|
479 if (l == 0x8A) |
|
480 c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); |
|
481 else |
|
482 c = CLASS_CHARACTER; |
|
483 } else if (0x0F00 == h) { |
|
484 if (0x08 == l || 0x0C == l || 0x12 == l) |
|
485 c = CLASS_NON_BREAKABLE; |
|
486 else |
|
487 c = CLASS_CHARACTER; |
|
488 } else if (0x1800 == h) { |
|
489 if (0x0E == l) |
|
490 c = CLASS_NON_BREAKABLE; |
|
491 else |
|
492 c = CLASS_CHARACTER; |
|
493 } else if (0x1600 == h) { |
|
494 if (0x80 == l) { // U+1680 OGHAM SPACE MARK |
|
495 c = CLASS_BREAKABLE; |
|
496 } else { |
|
497 c = CLASS_CHARACTER; |
|
498 } |
|
499 } else if (u == 0xfeff) { |
|
500 c = CLASS_NON_BREAKABLE; |
|
501 } else { |
|
502 c = CLASS_CHARACTER; // others |
|
503 } |
|
504 return c; |
|
505 } |
|
506 |
|
507 static bool |
|
508 GetPair(int8_t c1, int8_t c2) |
|
509 { |
|
510 NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); |
|
511 NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); |
|
512 |
|
513 return (0 == ((gPair[c1] >> c2) & 0x0001)); |
|
514 } |
|
515 |
|
516 static bool |
|
517 GetPairConservative(int8_t c1, int8_t c2) |
|
518 { |
|
519 NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); |
|
520 NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); |
|
521 |
|
522 return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); |
|
523 } |
|
524 |
|
525 nsJISx4051LineBreaker::nsJISx4051LineBreaker() |
|
526 { |
|
527 } |
|
528 |
|
529 nsJISx4051LineBreaker::~nsJISx4051LineBreaker() |
|
530 { |
|
531 } |
|
532 |
|
533 NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker) |
|
534 |
|
535 class ContextState { |
|
536 public: |
|
537 ContextState(const char16_t* aText, uint32_t aLength) { |
|
538 mUniText = aText; |
|
539 mText = nullptr; |
|
540 mLength = aLength; |
|
541 Init(); |
|
542 } |
|
543 |
|
544 ContextState(const uint8_t* aText, uint32_t aLength) { |
|
545 mUniText = nullptr; |
|
546 mText = aText; |
|
547 mLength = aLength; |
|
548 Init(); |
|
549 } |
|
550 |
|
551 uint32_t Length() { return mLength; } |
|
552 uint32_t Index() { return mIndex; } |
|
553 |
|
554 char16_t GetCharAt(uint32_t aIndex) { |
|
555 NS_ASSERTION(aIndex < mLength, "Out of range!"); |
|
556 return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); |
|
557 } |
|
558 |
|
559 void AdvanceIndex() { |
|
560 ++mIndex; |
|
561 } |
|
562 |
|
563 void NotifyBreakBefore() { mLastBreakIndex = mIndex; } |
|
564 |
|
565 // A word of western language should not be broken. But even if the word has |
|
566 // only ASCII characters, non-natural context words should be broken, e.g., |
|
567 // URL and file path. For protecting the natural words, we should use |
|
568 // conservative breaking rules at following conditions: |
|
569 // 1. at near the start of word |
|
570 // 2. at near the end of word |
|
571 // 3. at near the latest broken point |
|
572 // CONSERVATIVE_BREAK_RANGE define the 'near' in characters. |
|
573 #define CONSERVATIVE_BREAK_RANGE 6 |
|
574 |
|
575 bool UseConservativeBreaking(uint32_t aOffset = 0) { |
|
576 if (mHasCJKChar) |
|
577 return false; |
|
578 uint32_t index = mIndex + aOffset; |
|
579 bool result = (index < CONSERVATIVE_BREAK_RANGE || |
|
580 mLength - index < CONSERVATIVE_BREAK_RANGE || |
|
581 index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE); |
|
582 if (result || !mHasNonbreakableSpace) |
|
583 return result; |
|
584 |
|
585 // This text has no-breakable space, we need to check whether the index |
|
586 // is near it. |
|
587 |
|
588 // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here. |
|
589 for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) { |
|
590 if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1))) |
|
591 return true; |
|
592 } |
|
593 // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE. |
|
594 for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) { |
|
595 if (IS_NONBREAKABLE_SPACE(GetCharAt(i))) |
|
596 return true; |
|
597 } |
|
598 return false; |
|
599 } |
|
600 |
|
601 bool HasPreviousEqualsSign() const { |
|
602 return mHasPreviousEqualsSign; |
|
603 } |
|
604 void NotifySeenEqualsSign() { |
|
605 mHasPreviousEqualsSign = true; |
|
606 } |
|
607 |
|
608 bool HasPreviousSlash() const { |
|
609 return mHasPreviousSlash; |
|
610 } |
|
611 void NotifySeenSlash() { |
|
612 mHasPreviousSlash = true; |
|
613 } |
|
614 |
|
615 bool HasPreviousBackslash() const { |
|
616 return mHasPreviousBackslash; |
|
617 } |
|
618 void NotifySeenBackslash() { |
|
619 mHasPreviousBackslash = true; |
|
620 } |
|
621 |
|
622 char16_t GetPreviousNonHyphenCharacter() const { |
|
623 return mPreviousNonHyphenCharacter; |
|
624 } |
|
625 void NotifyNonHyphenCharacter(char16_t ch) { |
|
626 mPreviousNonHyphenCharacter = ch; |
|
627 } |
|
628 |
|
629 private: |
|
630 void Init() { |
|
631 mIndex = 0; |
|
632 mLastBreakIndex = 0; |
|
633 mPreviousNonHyphenCharacter = U_NULL; |
|
634 mHasCJKChar = 0; |
|
635 mHasNonbreakableSpace = 0; |
|
636 mHasPreviousEqualsSign = false; |
|
637 mHasPreviousSlash = false; |
|
638 mHasPreviousBackslash = false; |
|
639 |
|
640 for (uint32_t i = 0; i < mLength; ++i) { |
|
641 char16_t u = GetCharAt(i); |
|
642 if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) |
|
643 mHasNonbreakableSpace = 1; |
|
644 else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u)) |
|
645 mHasCJKChar = 1; |
|
646 } |
|
647 } |
|
648 |
|
649 const char16_t* mUniText; |
|
650 const uint8_t* mText; |
|
651 |
|
652 uint32_t mIndex; |
|
653 uint32_t mLength; // length of text |
|
654 uint32_t mLastBreakIndex; |
|
655 char16_t mPreviousNonHyphenCharacter; // The last character we have seen |
|
656 // which is not U_HYPHEN |
|
657 bool mHasCJKChar; // if the text has CJK character, this is true. |
|
658 bool mHasNonbreakableSpace; // if the text has no-breakable space, |
|
659 // this is true. |
|
660 bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL |
|
661 bool mHasPreviousSlash; // True if we have seen a U_SLASH |
|
662 bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH |
|
663 }; |
|
664 |
|
665 static int8_t |
|
666 ContextualAnalysis(char16_t prev, char16_t cur, char16_t next, |
|
667 ContextState &aState) |
|
668 { |
|
669 // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. |
|
670 |
|
671 if (IS_HYPHEN(cur)) { |
|
672 // If next character is hyphen, we don't need to break between them. |
|
673 if (IS_HYPHEN(next)) |
|
674 return CLASS_CHARACTER; |
|
675 // If prev and next characters are numeric, it may be in Math context. |
|
676 // So, we should not break here. |
|
677 bool prevIsNum = IS_ASCII_DIGIT(prev); |
|
678 bool nextIsNum = IS_ASCII_DIGIT(next); |
|
679 if (prevIsNum && nextIsNum) |
|
680 return CLASS_NUMERIC; |
|
681 // If one side is numeric and the other is a character, or if both sides are |
|
682 // characters, the hyphen should be breakable. |
|
683 if (!aState.UseConservativeBreaking(1)) { |
|
684 char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); |
|
685 if (prevOfHyphen && next) { |
|
686 int8_t prevClass = GetClass(prevOfHyphen); |
|
687 int8_t nextClass = GetClass(next); |
|
688 bool prevIsNumOrCharOrClose = |
|
689 prevIsNum || |
|
690 (prevClass == CLASS_CHARACTER && |
|
691 !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) || |
|
692 prevClass == CLASS_CLOSE || |
|
693 prevClass == CLASS_CLOSE_LIKE_CHARACTER; |
|
694 bool nextIsNumOrCharOrOpen = |
|
695 nextIsNum || |
|
696 (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) || |
|
697 nextClass == CLASS_OPEN || |
|
698 nextClass == CLASS_OPEN_LIKE_CHARACTER || |
|
699 next == U_OPEN_SINGLE_QUOTE || |
|
700 next == U_OPEN_DOUBLE_QUOTE || |
|
701 next == U_OPEN_GUILLEMET; |
|
702 if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) { |
|
703 return CLASS_CLOSE; |
|
704 } |
|
705 } |
|
706 } |
|
707 } else { |
|
708 aState.NotifyNonHyphenCharacter(cur); |
|
709 if (cur == U_SLASH || cur == U_BACKSLASH) { |
|
710 // If this is immediately after same char, we should not break here. |
|
711 if (prev == cur) |
|
712 return CLASS_CHARACTER; |
|
713 // If this text has two or more (BACK)SLASHs, this may be file path or URL. |
|
714 // Make sure to compute shouldReturn before we notify on this slash. |
|
715 bool shouldReturn = !aState.UseConservativeBreaking() && |
|
716 (cur == U_SLASH ? |
|
717 aState.HasPreviousSlash() : aState.HasPreviousBackslash()); |
|
718 |
|
719 if (cur == U_SLASH) { |
|
720 aState.NotifySeenSlash(); |
|
721 } else { |
|
722 aState.NotifySeenBackslash(); |
|
723 } |
|
724 |
|
725 if (shouldReturn) |
|
726 return CLASS_OPEN; |
|
727 } else if (cur == U_PERCENT) { |
|
728 // If this is a part of the param of URL, we should break before. |
|
729 if (!aState.UseConservativeBreaking()) { |
|
730 if (aState.Index() >= 3 && |
|
731 aState.GetCharAt(aState.Index() - 3) == U_PERCENT) |
|
732 return CLASS_OPEN; |
|
733 if (aState.Index() + 3 < aState.Length() && |
|
734 aState.GetCharAt(aState.Index() + 3) == U_PERCENT) |
|
735 return CLASS_OPEN; |
|
736 } |
|
737 } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) { |
|
738 // If this may be a separator of params of URL, we should break after. |
|
739 if (!aState.UseConservativeBreaking(1) && |
|
740 aState.HasPreviousEqualsSign()) |
|
741 return CLASS_CLOSE; |
|
742 } else if (cur == U_OPEN_SINGLE_QUOTE || |
|
743 cur == U_OPEN_DOUBLE_QUOTE || |
|
744 cur == U_OPEN_GUILLEMET) { |
|
745 // for CJK usage, we treat these as openers to allow a break before them, |
|
746 // but otherwise treat them as normal characters because quote mark usage |
|
747 // in various Western languages varies too much; see bug #450088 discussion. |
|
748 if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next)) |
|
749 return CLASS_OPEN; |
|
750 } else { |
|
751 NS_ERROR("Forgot to handle the current character!"); |
|
752 } |
|
753 } |
|
754 return GetClass(cur); |
|
755 } |
|
756 |
|
757 |
|
758 int32_t |
|
759 nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen, |
|
760 uint32_t aPos, int8_t aDirection) |
|
761 { |
|
762 bool textNeedsJISx4051 = false; |
|
763 int32_t begin, end; |
|
764 |
|
765 for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { |
|
766 if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) { |
|
767 textNeedsJISx4051 = true; |
|
768 } |
|
769 } |
|
770 for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) { |
|
771 if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) { |
|
772 textNeedsJISx4051 = true; |
|
773 } |
|
774 } |
|
775 |
|
776 int32_t ret; |
|
777 nsAutoTArray<uint8_t, 2000> breakState; |
|
778 if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) { |
|
779 // No complex text character, do not try to do complex line break. |
|
780 // (This is required for serializers. See Bug #344816.) |
|
781 // Also fall back to this when out of memory. |
|
782 if (aDirection < 0) { |
|
783 ret = (begin == int32_t(aPos)) ? begin - 1 : begin; |
|
784 } else { |
|
785 ret = end; |
|
786 } |
|
787 } else { |
|
788 GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal, |
|
789 breakState.Elements()); |
|
790 |
|
791 ret = aPos; |
|
792 do { |
|
793 ret += aDirection; |
|
794 } while (begin < ret && ret < end && !breakState[ret - begin]); |
|
795 } |
|
796 |
|
797 return ret; |
|
798 } |
|
799 |
|
800 int32_t |
|
801 nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen, |
|
802 uint32_t aPos) |
|
803 { |
|
804 NS_ASSERTION(aText, "aText shouldn't be null"); |
|
805 NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next"); |
|
806 |
|
807 int32_t nextPos = WordMove(aText, aLen, aPos, 1); |
|
808 return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT; |
|
809 } |
|
810 |
|
811 int32_t |
|
812 nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen, |
|
813 uint32_t aPos) |
|
814 { |
|
815 NS_ASSERTION(aText, "aText shouldn't be null"); |
|
816 NS_ASSERTION(aLen >= aPos && aPos > 0, |
|
817 "Bad position passed to nsJISx4051LineBreaker::Prev"); |
|
818 |
|
819 int32_t prevPos = WordMove(aText, aLen, aPos, -1); |
|
820 return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT; |
|
821 } |
|
822 |
|
823 void |
|
824 nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength, |
|
825 uint8_t aWordBreak, |
|
826 uint8_t* aBreakBefore) |
|
827 { |
|
828 uint32_t cur; |
|
829 int8_t lastClass = CLASS_NONE; |
|
830 ContextState state(aChars, aLength); |
|
831 |
|
832 for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { |
|
833 char16_t ch = aChars[cur]; |
|
834 int8_t cl; |
|
835 |
|
836 if (NEED_CONTEXTUAL_ANALYSIS(ch)) { |
|
837 cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, |
|
838 ch, |
|
839 cur + 1 < aLength ? aChars[cur + 1] : U_NULL, |
|
840 state); |
|
841 } else { |
|
842 if (ch == U_EQUAL) |
|
843 state.NotifySeenEqualsSign(); |
|
844 state.NotifyNonHyphenCharacter(ch); |
|
845 cl = GetClass(ch); |
|
846 } |
|
847 |
|
848 bool allowBreak = false; |
|
849 if (cur > 0) { |
|
850 NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, |
|
851 "Loop should have prevented adjacent complex chars here"); |
|
852 if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { |
|
853 allowBreak = (state.UseConservativeBreaking()) ? |
|
854 GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); |
|
855 } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { |
|
856 allowBreak = true; |
|
857 } |
|
858 } |
|
859 aBreakBefore[cur] = allowBreak; |
|
860 if (allowBreak) |
|
861 state.NotifyBreakBefore(); |
|
862 lastClass = cl; |
|
863 if (CLASS_COMPLEX == cl) { |
|
864 uint32_t end = cur + 1; |
|
865 |
|
866 while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) { |
|
867 ++end; |
|
868 } |
|
869 |
|
870 NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur); |
|
871 |
|
872 // We have to consider word-break value again for complex characters |
|
873 if (aWordBreak != nsILineBreaker::kWordBreak_Normal) { |
|
874 // Respect word-break property |
|
875 for (uint32_t i = cur; i < end; i++) |
|
876 aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll); |
|
877 } |
|
878 |
|
879 // restore breakability at chunk begin, which was always set to false |
|
880 // by the complex line breaker |
|
881 aBreakBefore[cur] = allowBreak; |
|
882 |
|
883 cur = end - 1; |
|
884 } |
|
885 } |
|
886 } |
|
887 |
|
888 void |
|
889 nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength, |
|
890 uint8_t aWordBreak, |
|
891 uint8_t* aBreakBefore) |
|
892 { |
|
893 uint32_t cur; |
|
894 int8_t lastClass = CLASS_NONE; |
|
895 ContextState state(aChars, aLength); |
|
896 |
|
897 for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { |
|
898 char16_t ch = aChars[cur]; |
|
899 int8_t cl; |
|
900 |
|
901 if (NEED_CONTEXTUAL_ANALYSIS(ch)) { |
|
902 cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, |
|
903 ch, |
|
904 cur + 1 < aLength ? aChars[cur + 1] : U_NULL, |
|
905 state); |
|
906 } else { |
|
907 if (ch == U_EQUAL) |
|
908 state.NotifySeenEqualsSign(); |
|
909 state.NotifyNonHyphenCharacter(ch); |
|
910 cl = GetClass(ch); |
|
911 } |
|
912 |
|
913 bool allowBreak = false; |
|
914 if (cur > 0) { |
|
915 if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { |
|
916 allowBreak = (state.UseConservativeBreaking()) ? |
|
917 GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); |
|
918 } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { |
|
919 allowBreak = true; |
|
920 } |
|
921 } |
|
922 aBreakBefore[cur] = allowBreak; |
|
923 if (allowBreak) |
|
924 state.NotifyBreakBefore(); |
|
925 lastClass = cl; |
|
926 } |
|
927 } |