intl/lwbrk/src/nsJISx4051LineBreaker.cpp

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:f55328cfbfc7
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6
7
8 #include "nsJISx4051LineBreaker.h"
9
10 #include "jisx4051class.h"
11 #include "nsComplexBreaker.h"
12 #include "nsTArray.h"
13
14 /*
15
16 Simplification of Pair Table in JIS X 4051
17
18 1. The Original Table - in 4.1.3
19
20 In JIS X 4051, the pair table is defined as below
21
22 Class of
23 Leading Class of Trailing Char Class
24 Char
25
26 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
27 * # * #
28 1 X X X X X X X X X X X X X X X X X X X X X E
29 2 X X X X X X
30 3 X X X X X X
31 4 X X X X X X
32 5 X X X X X X
33 6 X X X X X X
34 7 X X X X X X X
35 8 X X X X X X E
36 9 X X X X X X
37 10 X X X X X X
38 11 X X X X X X
39 12 X X X X X X
40 13 X X X X X X X
41 14 X X X X X X X
42 15 X X X X X X X X X
43 16 X X X X X X X X
44 17 X X X X X E
45 18 X X X X X X X X X
46 19 X E E E E E X X X X X X X X X X X X E X E E
47 20 X X X X X E
48
49 * Same Char
50 # Other Char
51
52 X Cannot Break
53
54 The classes mean:
55 1: Open parenthesis
56 2: Close parenthesis
57 3: Prohibit a line break before
58 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
59 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
60 6: Full stop
61 7: Non-breakable between same characters
62 8: Prefix (e.g., "$", "NO.")
63 9: Postfix (e.g., "%")
64 10: Ideographic space
65 11: Hiragana
66 12: Japanese characters (except class 11)
67 13: Subscript
68 14: Ruby
69 15: Numeric
70 16: Alphabet
71 17: Space for Western language
72 18: Western characters (except class 17)
73 19: Split line note (Warichu) begin quote
74 20: Split line note (Warichu) end quote
75
76 2. Simplified by removing the classes which we do not care about
77
78 However, since we do not care about class 13 (Subscript), 14 (Ruby),
79 16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
80 quote), we can simplify this pair table into the following
81
82 Class of
83 Leading Class of Trailing Char Class
84 Char
85
86 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18
87
88 1 X X X X X X X X X X X X X X X
89 2 X X X X X
90 3 X X X X X
91 4 X X X X X
92 5 X X X X X
93 6 X X X X X
94 7 X X X X X X
95 8 X X X X X X
96 9 X X X X X
97 10 X X X X X
98 11 X X X X X
99 12 X X X X X
100 15 X X X X X X X X
101 17 X X X X X
102 18 X X X X X X X
103
104 3. Simplified by merging classes
105
106 After the second simplification, the pair table has some duplication
107 a. classes 2, 3, 4, 5, 6 are the same - we can merge them
108 b. classes 10, 11, 12, 17 are the same - we can merge them
109
110
111 Class of
112 Leading Class of Trailing Char Class
113 Char
114
115 1 [a] 7 8 9 [b]15 18
116
117 1 X X X X X X X X
118 [a] X
119 7 X X
120 8 X X
121 9 X
122 [b] X
123 15 X X X X
124 18 X X X
125
126
127 4. We add COMPLEX characters and make them breakable with all other classes
128 except after class 1 and before class [a]
129
130 Class of
131 Leading Class of Trailing Char Class
132 Char
133
134 1 [a] 7 8 9 [b]15 18 COMPLEX
135
136 1 X X X X X X X X X
137 [a] X
138 7 X X
139 8 X X
140 9 X
141 [b] X
142 15 X X X X
143 18 X X X
144 COMPLEX X T
145
146 T : need special handling
147
148
149 5. However, we need two special classes for some punctuation/parentheses
150 whose breaking rules are like the character class (18), see bug 389056.
151 We also need punctuation-like characters that behave the same as 18
152 but are not letters in any language (e.g., '_').
153 [c]. Based on open parenthesis class (1), but it is not breakable after
154 character class (18) or numeric class (15).
155 [d]. Based on close parenthesis (or punctuation) class (2), but it is not
156 breakable before character class (18) or numeric class (15).
157
158 Class of
159 Leading Class of Trailing Char Class
160 Char
161
162 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d]
163
164 1 X X X X X X X X X X X
165 [a] X X X
166 7 X X
167 8 X X
168 9 X
169 [b] X X
170 15 X X X X X X
171 18 X X X X X
172 COMPLEX X T
173 [c] X X X X X X X X X X X
174 [d] X X X X
175
176
177 6. Unicode also has "NON-BREAK" characters. Lines should not be broken around
178 them. JIS X 4051 has no such class, therefore we create [e].
179
180 Class of
181 Leading Class of Trailing Char Class
182 Char
183
184 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
185
186 1 X X X X X X X X X X X X
187 [a] X X X
188 7 X X X
189 8 X X X
190 9 X X
191 [b] X X X
192 15 X X X X X X X
193 18 X X X X X X
194 COMPLEX X T X
195 [c] X X X X X X X X X X X X
196 [d] X X X X X
197 [e] X X X X X X X X X X X X
198
199
200 7. Now we use one bit to encode whether it is breakable, and use 2 bytes
201 for one row, then the bit table will look like:
202
203 18 <- 1
204
205 1 0000 1111 1111 1111 = 0x0FFF
206 [a] 0000 1100 0000 0010 = 0x0C02
207 7 0000 1000 0000 0110 = 0x0806
208 8 0000 1000 0100 0010 = 0x0842
209 9 0000 1000 0000 0010 = 0x0802
210 [b] 0000 1100 0000 0010 = 0x0C02
211 15 0000 1110 1101 0010 = 0x0ED2
212 18 0000 1110 1100 0010 = 0x0EC2
213 COMPLEX 0000 1001 0000 0010 = 0x0902
214 [c] 0000 1111 1111 1111 = 0x0FFF
215 [d] 0000 1100 1100 0010 = 0x0CC2
216 [e] 0000 1111 1111 1111 = 0x0FFF
217 */
218
// Number of line-break classes. Each row of a pair table holds one bit per
// trailing class, so a row fits in the low MAX_CLASSES bits of a uint16_t.
#define MAX_CLASSES 12

// Pair table used for normal breaking.
// The row index is the class of the leading character; bit N of the row is
// set when a break is NOT allowed before a trailing character of class N.
static const uint16_t gPair[MAX_CLASSES] = {
  0x0FFF,  // 0: 1 (open parenthesis)
  0x0C02,  // 1: [a]
  0x0806,  // 2: 7
  0x0842,  // 3: 8
  0x0802,  // 4: 9
  0x0C02,  // 5: [b]
  0x0ED2,  // 6: 15
  0x0EC2,  // 7: 18
  0x0902,  // 8: COMPLEX
  0x0FFF,  // 9: [c]
  0x0CC2,  // A: [d]
  0x0FFF   // B: [e]
};
235
236
237 /*
238
239 8. If the character is not far enough from the word start, the word end, or
240 another break point, we should not break in non-CJK languages.
241 I.e., don't break around 15, 18, [c] and [d], but don't change
242 that if they are related to [b].
243
244 Class of
245 Leading Class of Trailing Char Class
246 Char
247
248 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
249
250 1 X X X X X X X X X X X X
251 [a] X X X X X X
252 7 X X X X X X X
253 8 X X X X X X
254 9 X X X X X X
255 [b] X X X
256 15 X X X X X X X X X X X
257 18 X X X X X X X X X X X
258 COMPLEX X X X T X X X
259 [c] X X X X X X X X X X X X
260 [d] X X X X X X X X X X X
261 [e] X X X X X X X X X X X X
262
263 18 <- 1
264
265 1 0000 1111 1111 1111 = 0x0FFF
266 [a] 0000 1110 1100 0010 = 0x0EC2
267 7 0000 1110 1100 0110 = 0x0EC6
268 8 0000 1110 1100 0010 = 0x0EC2
269 9 0000 1110 1100 0010 = 0x0EC2
270 [b] 0000 1100 0000 0010 = 0x0C02
271 15 0000 1111 1101 1111 = 0x0FDF
272 18 0000 1111 1101 1111 = 0x0FDF
273 COMPLEX 0000 1111 1100 0010 = 0x0FC2
274 [c] 0000 1111 1111 1111 = 0x0FFF
275 [d] 0000 1111 1101 1111 = 0x0FDF
276 [e] 0000 1111 1111 1111 = 0x0FFF
277 */
278
279 static const uint16_t gPairConservative[MAX_CLASSES] = {
280 0x0FFF,
281 0x0EC2,
282 0x0EC6,
283 0x0EC2,
284 0x0EC2,
285 0x0C02,
286 0x0FDF,
287 0x0FDF,
288 0x0FC2,
289 0x0FFF,
290 0x0FDF,
291 0x0FFF
292 };
293
294
295 /*
296
297 9. Now we map the class to number
298
299 0: 1
300 1: [a]- 2, 3, 4, 5, 6
301 2: 7
302 3: 8
303 4: 9
304 5: [b]- 10, 11, 12, 17
305 6: 15
306 7: 18
307 8: COMPLEX
308 9: [c]
309 A: [d]
310 B: [e]
311
312 and they mean:
313 0: Open parenthesis
314 1: Punctuation that prohibits break before
315 2: Non-breakable between same classes
316 3: Prefix
317 4: Postfix
318 5: Breakable character (Spaces and Most Japanese characters)
319 6: Numeric
320 7: Characters
321 8: Need special handling characters (E.g., Thai)
322 9: Open parentheses like Character (See bug 389056)
323 A: Close parenthese (or punctuations) like Character (See bug 389056)
324 B: Non breakable (See bug 390920)
325
326 */
327
// Placeholder used before any character has been classified; it is larger
// than every real class value below.
#define CLASS_NONE                             INT8_MAX

// Line-break classes. The values are the row/bit indices of the pair tables.
#define CLASS_OPEN                             0x00
#define CLASS_CLOSE                            0x01
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
#define CLASS_PREFIX                           0x03
#define CLASS_POSTFFIX                         0x04
#define CLASS_BREAKABLE                        0x05
#define CLASS_NUMERIC                          0x06
#define CLASS_CHARACTER                        0x07
#define CLASS_COMPLEX                          0x08
#define CLASS_OPEN_LIKE_CHARACTER              0x09
#define CLASS_CLOSE_LIKE_CHARACTER             0x0A
#define CLASS_NON_BREAKABLE                    0x0B

// Individual characters that receive special treatment.
#define U_NULL              char16_t(0x0000)
#define U_SLASH             char16_t('/')
#define U_SPACE             char16_t(' ')
#define U_HYPHEN            char16_t('-')
#define U_EQUAL             char16_t('=')
#define U_PERCENT           char16_t('%')
#define U_AMPERSAND         char16_t('&')
#define U_SEMICOLON         char16_t(';')
#define U_BACKSLASH         char16_t('\\')
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
#define U_OPEN_GUILLEMET    char16_t(0x00AB)

// True for characters whose class depends on their neighbours and therefore
// has to be decided by ContextualAnalysis() instead of GetClass().
#define NEED_CONTEXTUAL_ANALYSIS(c) \
  (IS_HYPHEN(c)                  || \
   (c) == U_SLASH                || \
   (c) == U_PERCENT              || \
   (c) == U_AMPERSAND            || \
   (c) == U_SEMICOLON            || \
   (c) == U_BACKSLASH            || \
   (c) == U_OPEN_SINGLE_QUOTE    || \
   (c) == U_OPEN_DOUBLE_QUOTE    || \
   (c) == U_OPEN_GUILLEMET)

// True for U+0030..U+0039.
#define IS_ASCII_DIGIT(u) ((u) >= 0x0030 && (u) <= 0x0039)
367
// Looks up entry |l| of a packed class table.
// Each 32-bit word of |t| holds eight 4-bit entries, lowest nibble first.
static inline int
GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
{
  const uint32_t word = t[l >> 3];                // eight entries per word
  const unsigned shift = (l & 0x0007) << 2;       // four bits per entry
  return (word >> shift) & 0x000f;
}
373
// Returns non-zero when |u| lies in U+FF66..U+FF70, the halfwidth characters
// this file treats as JIS X 4051 class 3.
static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
{
  if (u < 0xff66) {
    return 0;
  }
  return u <= 0xff70;
}
379
// Returns non-zero when |u| falls in one of the code point ranges this file
// treats as CJK.
static inline int
IS_CJK_CHAR(char16_t u)
{
  if (u >= 0x1100 && u <= 0x11ff) {
    return 1;
  }
  if (u >= 0x2e80 && u <= 0xd7ff) {
    return 1;
  }
  if (u >= 0xf900 && u <= 0xfaff) {
    return 1;
  }
  if (u >= 0xff00 && u <= 0xffef) {
    return 1;
  }
  return 0;
}
388
// True for the two space characters that must not be broken at.
static inline bool
IS_NONBREAKABLE_SPACE(char16_t u)
{
  switch (u) {
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2007:  // FIGURE SPACE
      return true;
    default:
      return false;
  }
}
394
395 static inline bool
396 IS_HYPHEN(char16_t u)
397 {
398 return (u == U_HYPHEN ||
399 u == 0x058A || // ARMENIAN HYPHEN
400 u == 0x2010 || // HYPHEN
401 u == 0x2012 || // FIGURE DASH
402 u == 0x2013); // EN DASH
403 }
404
405 static int8_t
406 GetClass(char16_t u)
407 {
408 uint16_t h = u & 0xFF00;
409 uint16_t l = u & 0x00ff;
410 int8_t c;
411
412 // Handle 3 range table first
413 if (0x0000 == h) {
414 c = GETCLASSFROMTABLE(gLBClass00, l);
415 } else if (0x1700 == h) {
416 c = GETCLASSFROMTABLE(gLBClass17, l);
417 } else if (NS_NeedsPlatformNativeHandling(u)) {
418 c = CLASS_COMPLEX;
419 } else if (0x0E00 == h) {
420 c = GETCLASSFROMTABLE(gLBClass0E, l);
421 } else if (0x2000 == h) {
422 c = GETCLASSFROMTABLE(gLBClass20, l);
423 } else if (0x2100 == h) {
424 c = GETCLASSFROMTABLE(gLBClass21, l);
425 } else if (0x3000 == h) {
426 c = GETCLASSFROMTABLE(gLBClass30, l);
427 } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi
428 ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul
429 ((0xf900 <= h) && (h <= 0xfaff))) {
430 c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility
431 } else if (0xff00 == h) {
432 if (l < 0x0060) { // Fullwidth ASCII variant
433 c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));
434 } else if (l < 0x00a0) {
435 switch (l) {
436 case 0x61: c = GetClass(0x3002); break;
437 case 0x62: c = GetClass(0x300c); break;
438 case 0x63: c = GetClass(0x300d); break;
439 case 0x64: c = GetClass(0x3001); break;
440 case 0x65: c = GetClass(0x30fb); break;
441 case 0x9e: c = GetClass(0x309b); break;
442 case 0x9f: c = GetClass(0x309c); break;
443 default:
444 if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
445 c = CLASS_CLOSE; // jis x4051 class 3
446 else
447 c = CLASS_BREAKABLE; // jis x4051 class 11
448 break;
449 }
450 // Halfwidth Katakana variants
451 } else if (l < 0x00e0) {
452 c = CLASS_CHARACTER; // Halfwidth Hangul variants
453 } else if (l < 0x00f0) {
454 static char16_t NarrowFFEx[16] = {
455 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
456 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
457 };
458 c = GetClass(NarrowFFEx[l - 0x00e0]);
459 } else {
460 c = CLASS_CHARACTER;
461 }
462 } else if (0x3100 == h) {
463 if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
464 // XXX: This is per UAX #14, but UAX #14 may change
465 // the line breaking rules about Kanbun and Bopomofo.
466 c = CLASS_BREAKABLE;
467 } else if (l >= 0xf0) { // Katakana small letters for Ainu
468 c = CLASS_CLOSE;
469 } else { // unassigned
470 c = CLASS_CHARACTER;
471 }
472 } else if (0x0300 == h) {
473 if (0x4F == l || (0x5C <= l && l <= 0x62))
474 c = CLASS_NON_BREAKABLE;
475 else
476 c = CLASS_CHARACTER;
477 } else if (0x0500 == h) {
478 // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
479 if (l == 0x8A)
480 c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
481 else
482 c = CLASS_CHARACTER;
483 } else if (0x0F00 == h) {
484 if (0x08 == l || 0x0C == l || 0x12 == l)
485 c = CLASS_NON_BREAKABLE;
486 else
487 c = CLASS_CHARACTER;
488 } else if (0x1800 == h) {
489 if (0x0E == l)
490 c = CLASS_NON_BREAKABLE;
491 else
492 c = CLASS_CHARACTER;
493 } else if (0x1600 == h) {
494 if (0x80 == l) { // U+1680 OGHAM SPACE MARK
495 c = CLASS_BREAKABLE;
496 } else {
497 c = CLASS_CHARACTER;
498 }
499 } else if (u == 0xfeff) {
500 c = CLASS_NON_BREAKABLE;
501 } else {
502 c = CLASS_CHARACTER; // others
503 }
504 return c;
505 }
506
507 static bool
508 GetPair(int8_t c1, int8_t c2)
509 {
510 NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
511 NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
512
513 return (0 == ((gPair[c1] >> c2) & 0x0001));
514 }
515
516 static bool
517 GetPairConservative(int8_t c1, int8_t c2)
518 {
519 NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
520 NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
521
522 return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
523 }
524
525 nsJISx4051LineBreaker::nsJISx4051LineBreaker()
526 {
527 }
528
529 nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
530 {
531 }
532
533 NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
534
535 class ContextState {
536 public:
537 ContextState(const char16_t* aText, uint32_t aLength) {
538 mUniText = aText;
539 mText = nullptr;
540 mLength = aLength;
541 Init();
542 }
543
544 ContextState(const uint8_t* aText, uint32_t aLength) {
545 mUniText = nullptr;
546 mText = aText;
547 mLength = aLength;
548 Init();
549 }
550
551 uint32_t Length() { return mLength; }
552 uint32_t Index() { return mIndex; }
553
554 char16_t GetCharAt(uint32_t aIndex) {
555 NS_ASSERTION(aIndex < mLength, "Out of range!");
556 return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
557 }
558
559 void AdvanceIndex() {
560 ++mIndex;
561 }
562
563 void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
564
565 // A word of western language should not be broken. But even if the word has
566 // only ASCII characters, non-natural context words should be broken, e.g.,
567 // URL and file path. For protecting the natural words, we should use
568 // conservative breaking rules at following conditions:
569 // 1. at near the start of word
570 // 2. at near the end of word
571 // 3. at near the latest broken point
572 // CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
573 #define CONSERVATIVE_BREAK_RANGE 6
574
575 bool UseConservativeBreaking(uint32_t aOffset = 0) {
576 if (mHasCJKChar)
577 return false;
578 uint32_t index = mIndex + aOffset;
579 bool result = (index < CONSERVATIVE_BREAK_RANGE ||
580 mLength - index < CONSERVATIVE_BREAK_RANGE ||
581 index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
582 if (result || !mHasNonbreakableSpace)
583 return result;
584
585 // This text has no-breakable space, we need to check whether the index
586 // is near it.
587
588 // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.
589 for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
590 if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
591 return true;
592 }
593 // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
594 for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
595 if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
596 return true;
597 }
598 return false;
599 }
600
601 bool HasPreviousEqualsSign() const {
602 return mHasPreviousEqualsSign;
603 }
604 void NotifySeenEqualsSign() {
605 mHasPreviousEqualsSign = true;
606 }
607
608 bool HasPreviousSlash() const {
609 return mHasPreviousSlash;
610 }
611 void NotifySeenSlash() {
612 mHasPreviousSlash = true;
613 }
614
615 bool HasPreviousBackslash() const {
616 return mHasPreviousBackslash;
617 }
618 void NotifySeenBackslash() {
619 mHasPreviousBackslash = true;
620 }
621
622 char16_t GetPreviousNonHyphenCharacter() const {
623 return mPreviousNonHyphenCharacter;
624 }
625 void NotifyNonHyphenCharacter(char16_t ch) {
626 mPreviousNonHyphenCharacter = ch;
627 }
628
629 private:
630 void Init() {
631 mIndex = 0;
632 mLastBreakIndex = 0;
633 mPreviousNonHyphenCharacter = U_NULL;
634 mHasCJKChar = 0;
635 mHasNonbreakableSpace = 0;
636 mHasPreviousEqualsSign = false;
637 mHasPreviousSlash = false;
638 mHasPreviousBackslash = false;
639
640 for (uint32_t i = 0; i < mLength; ++i) {
641 char16_t u = GetCharAt(i);
642 if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
643 mHasNonbreakableSpace = 1;
644 else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
645 mHasCJKChar = 1;
646 }
647 }
648
649 const char16_t* mUniText;
650 const uint8_t* mText;
651
652 uint32_t mIndex;
653 uint32_t mLength; // length of text
654 uint32_t mLastBreakIndex;
655 char16_t mPreviousNonHyphenCharacter; // The last character we have seen
656 // which is not U_HYPHEN
657 bool mHasCJKChar; // if the text has CJK character, this is true.
658 bool mHasNonbreakableSpace; // if the text has no-breakable space,
659 // this is true.
660 bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
661 bool mHasPreviousSlash; // True if we have seen a U_SLASH
662 bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH
663 };
664
665 static int8_t
666 ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,
667 ContextState &aState)
668 {
669 // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
670
671 if (IS_HYPHEN(cur)) {
672 // If next character is hyphen, we don't need to break between them.
673 if (IS_HYPHEN(next))
674 return CLASS_CHARACTER;
675 // If prev and next characters are numeric, it may be in Math context.
676 // So, we should not break here.
677 bool prevIsNum = IS_ASCII_DIGIT(prev);
678 bool nextIsNum = IS_ASCII_DIGIT(next);
679 if (prevIsNum && nextIsNum)
680 return CLASS_NUMERIC;
681 // If one side is numeric and the other is a character, or if both sides are
682 // characters, the hyphen should be breakable.
683 if (!aState.UseConservativeBreaking(1)) {
684 char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
685 if (prevOfHyphen && next) {
686 int8_t prevClass = GetClass(prevOfHyphen);
687 int8_t nextClass = GetClass(next);
688 bool prevIsNumOrCharOrClose =
689 prevIsNum ||
690 (prevClass == CLASS_CHARACTER &&
691 !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
692 prevClass == CLASS_CLOSE ||
693 prevClass == CLASS_CLOSE_LIKE_CHARACTER;
694 bool nextIsNumOrCharOrOpen =
695 nextIsNum ||
696 (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
697 nextClass == CLASS_OPEN ||
698 nextClass == CLASS_OPEN_LIKE_CHARACTER ||
699 next == U_OPEN_SINGLE_QUOTE ||
700 next == U_OPEN_DOUBLE_QUOTE ||
701 next == U_OPEN_GUILLEMET;
702 if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
703 return CLASS_CLOSE;
704 }
705 }
706 }
707 } else {
708 aState.NotifyNonHyphenCharacter(cur);
709 if (cur == U_SLASH || cur == U_BACKSLASH) {
710 // If this is immediately after same char, we should not break here.
711 if (prev == cur)
712 return CLASS_CHARACTER;
713 // If this text has two or more (BACK)SLASHs, this may be file path or URL.
714 // Make sure to compute shouldReturn before we notify on this slash.
715 bool shouldReturn = !aState.UseConservativeBreaking() &&
716 (cur == U_SLASH ?
717 aState.HasPreviousSlash() : aState.HasPreviousBackslash());
718
719 if (cur == U_SLASH) {
720 aState.NotifySeenSlash();
721 } else {
722 aState.NotifySeenBackslash();
723 }
724
725 if (shouldReturn)
726 return CLASS_OPEN;
727 } else if (cur == U_PERCENT) {
728 // If this is a part of the param of URL, we should break before.
729 if (!aState.UseConservativeBreaking()) {
730 if (aState.Index() >= 3 &&
731 aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
732 return CLASS_OPEN;
733 if (aState.Index() + 3 < aState.Length() &&
734 aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
735 return CLASS_OPEN;
736 }
737 } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
738 // If this may be a separator of params of URL, we should break after.
739 if (!aState.UseConservativeBreaking(1) &&
740 aState.HasPreviousEqualsSign())
741 return CLASS_CLOSE;
742 } else if (cur == U_OPEN_SINGLE_QUOTE ||
743 cur == U_OPEN_DOUBLE_QUOTE ||
744 cur == U_OPEN_GUILLEMET) {
745 // for CJK usage, we treat these as openers to allow a break before them,
746 // but otherwise treat them as normal characters because quote mark usage
747 // in various Western languages varies too much; see bug #450088 discussion.
748 if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
749 return CLASS_OPEN;
750 } else {
751 NS_ERROR("Forgot to handle the current character!");
752 }
753 }
754 return GetClass(cur);
755 }
756
757
758 int32_t
759 nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
760 uint32_t aPos, int8_t aDirection)
761 {
762 bool textNeedsJISx4051 = false;
763 int32_t begin, end;
764
765 for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
766 if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
767 textNeedsJISx4051 = true;
768 }
769 }
770 for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
771 if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
772 textNeedsJISx4051 = true;
773 }
774 }
775
776 int32_t ret;
777 nsAutoTArray<uint8_t, 2000> breakState;
778 if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
779 // No complex text character, do not try to do complex line break.
780 // (This is required for serializers. See Bug #344816.)
781 // Also fall back to this when out of memory.
782 if (aDirection < 0) {
783 ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
784 } else {
785 ret = end;
786 }
787 } else {
788 GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
789 breakState.Elements());
790
791 ret = aPos;
792 do {
793 ret += aDirection;
794 } while (begin < ret && ret < end && !breakState[ret - begin]);
795 }
796
797 return ret;
798 }
799
800 int32_t
801 nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
802 uint32_t aPos)
803 {
804 NS_ASSERTION(aText, "aText shouldn't be null");
805 NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
806
807 int32_t nextPos = WordMove(aText, aLen, aPos, 1);
808 return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
809 }
810
811 int32_t
812 nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
813 uint32_t aPos)
814 {
815 NS_ASSERTION(aText, "aText shouldn't be null");
816 NS_ASSERTION(aLen >= aPos && aPos > 0,
817 "Bad position passed to nsJISx4051LineBreaker::Prev");
818
819 int32_t prevPos = WordMove(aText, aLen, aPos, -1);
820 return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
821 }
822
823 void
824 nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
825 uint8_t aWordBreak,
826 uint8_t* aBreakBefore)
827 {
828 uint32_t cur;
829 int8_t lastClass = CLASS_NONE;
830 ContextState state(aChars, aLength);
831
832 for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
833 char16_t ch = aChars[cur];
834 int8_t cl;
835
836 if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
837 cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
838 ch,
839 cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
840 state);
841 } else {
842 if (ch == U_EQUAL)
843 state.NotifySeenEqualsSign();
844 state.NotifyNonHyphenCharacter(ch);
845 cl = GetClass(ch);
846 }
847
848 bool allowBreak = false;
849 if (cur > 0) {
850 NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
851 "Loop should have prevented adjacent complex chars here");
852 if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
853 allowBreak = (state.UseConservativeBreaking()) ?
854 GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
855 } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
856 allowBreak = true;
857 }
858 }
859 aBreakBefore[cur] = allowBreak;
860 if (allowBreak)
861 state.NotifyBreakBefore();
862 lastClass = cl;
863 if (CLASS_COMPLEX == cl) {
864 uint32_t end = cur + 1;
865
866 while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
867 ++end;
868 }
869
870 NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
871
872 // We have to consider word-break value again for complex characters
873 if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
874 // Respect word-break property
875 for (uint32_t i = cur; i < end; i++)
876 aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
877 }
878
879 // restore breakability at chunk begin, which was always set to false
880 // by the complex line breaker
881 aBreakBefore[cur] = allowBreak;
882
883 cur = end - 1;
884 }
885 }
886 }
887
888 void
889 nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
890 uint8_t aWordBreak,
891 uint8_t* aBreakBefore)
892 {
893 uint32_t cur;
894 int8_t lastClass = CLASS_NONE;
895 ContextState state(aChars, aLength);
896
897 for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
898 char16_t ch = aChars[cur];
899 int8_t cl;
900
901 if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
902 cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
903 ch,
904 cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
905 state);
906 } else {
907 if (ch == U_EQUAL)
908 state.NotifySeenEqualsSign();
909 state.NotifyNonHyphenCharacter(ch);
910 cl = GetClass(ch);
911 }
912
913 bool allowBreak = false;
914 if (cur > 0) {
915 if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
916 allowBreak = (state.UseConservativeBreaking()) ?
917 GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
918 } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
919 allowBreak = true;
920 }
921 }
922 aBreakBefore[cur] = allowBreak;
923 if (allowBreak)
924 state.NotifyBreakBefore();
925 lastClass = cl;
926 }
927 }

mercurial