michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: michael@0: michael@0: #include "nsJISx4051LineBreaker.h" michael@0: michael@0: #include "jisx4051class.h" michael@0: #include "nsComplexBreaker.h" michael@0: #include "nsTArray.h" michael@0: michael@0: /* michael@0: michael@0: Simplification of Pair Table in JIS X 4051 michael@0: michael@0: 1. The Origion Table - in 4.1.3 michael@0: michael@0: In JIS x 4051. The pair table is defined as below michael@0: michael@0: Class of michael@0: Leading Class of Trailing Char Class michael@0: Char michael@0: michael@0: 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 michael@0: * # * # michael@0: 1 X X X X X X X X X X X X X X X X X X X X X E michael@0: 2 X X X X X X michael@0: 3 X X X X X X michael@0: 4 X X X X X X michael@0: 5 X X X X X X michael@0: 6 X X X X X X michael@0: 7 X X X X X X X michael@0: 8 X X X X X X E michael@0: 9 X X X X X X michael@0: 10 X X X X X X michael@0: 11 X X X X X X michael@0: 12 X X X X X X michael@0: 13 X X X X X X X michael@0: 14 X X X X X X X michael@0: 15 X X X X X X X X X michael@0: 16 X X X X X X X X michael@0: 17 X X X X X E michael@0: 18 X X X X X X X X X michael@0: 19 X E E E E E X X X X X X X X X X X X E X E E michael@0: 20 X X X X X E michael@0: michael@0: * Same Char michael@0: # Other Char michael@0: michael@0: X Cannot Break michael@0: michael@0: The classes mean: michael@0: 1: Open parenthesis michael@0: 2: Close parenthesis michael@0: 3: Prohibit a line break before michael@0: 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?") michael@0: 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) michael@0: 6: Full stop michael@0: 7: Non-breakable between same characters michael@0: 8: Prefix (e.g., "$", "NO.") michael@0: 9: Postfix (e.g., "%") michael@0: 10: Ideographic space michael@0: 11: Hiragana michael@0: 12: Japanese characters (except class 11) michael@0: 13: Subscript michael@0: 14: Ruby michael@0: 15: Numeric michael@0: 16: Alphabet michael@0: 17: Space for Western language michael@0: 18: Western characters (except class 17) michael@0: 19: Split line note (Warichu) begin quote michael@0: 20: Split line note (Warichu) end quote michael@0: michael@0: 2. Simplified by remove the class which we do not care michael@0: michael@0: However, since we do not care about class 13(Subscript), 14(Ruby), michael@0: 16 (Aphabet), 19(split line note begin quote), and 20(split line note end michael@0: quote) we can simplify this par table into the following michael@0: michael@0: Class of michael@0: Leading Class of Trailing Char Class michael@0: Char michael@0: michael@0: 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 michael@0: michael@0: 1 X X X X X X X X X X X X X X X michael@0: 2 X X X X X michael@0: 3 X X X X X michael@0: 4 X X X X X michael@0: 5 X X X X X michael@0: 6 X X X X X michael@0: 7 X X X X X X michael@0: 8 X X X X X X michael@0: 9 X X X X X michael@0: 10 X X X X X michael@0: 11 X X X X X michael@0: 12 X X X X X michael@0: 15 X X X X X X X X michael@0: 17 X X X X X michael@0: 18 X X X X X X X michael@0: michael@0: 3. Simplified by merged classes michael@0: michael@0: After the 2 simplification, the pair table have some duplication michael@0: a. class 2, 3, 4, 5, 6, are the same- we can merged them michael@0: b. class 10, 11, 12, 17 are the same- we can merged them michael@0: michael@0: michael@0: Class of michael@0: Leading Class of Trailing Char Class michael@0: Char michael@0: michael@0: 1 [a] 7 8 9 [b]15 18 michael@0: michael@0: 1 X X X X X X X X michael@0: [a] X michael@0: 7 X X michael@0: 8 X X michael@0: 9 X michael@0: [b] X michael@0: 15 X X X X michael@0: 18 X X X michael@0: michael@0: michael@0: 4. We add COMPLEX characters and make it breakable w/ all ther class michael@0: except after class 1 and before class [a] michael@0: michael@0: Class of michael@0: Leading Class of Trailing Char Class michael@0: Char michael@0: michael@0: 1 [a] 7 8 9 [b]15 18 COMPLEX michael@0: michael@0: 1 X X X X X X X X X michael@0: [a] X michael@0: 7 X X michael@0: 8 X X michael@0: 9 X michael@0: [b] X michael@0: 15 X X X X michael@0: 18 X X X michael@0: COMPLEX X T michael@0: michael@0: T : need special handling michael@0: michael@0: michael@0: 5. However, we need two special class for some punctuations/parentheses, michael@0: theirs breaking rules like character class (18), see bug 389056. michael@0: And also we need character like punctuation that is same behavior with 18, michael@0: but the characters are not letters of all languages. (e.g., '_') michael@0: [c]. Based on open parenthesis class (1), but it is not breakable after michael@0: character class (18) or numeric class (15). michael@0: [d]. Based on close parenthesis (or punctuation) class (2), but it is not michael@0: breakable before character class (18) or numeric class (15). michael@0: michael@0: Class of michael@0: Leading Class of Trailing Char Class michael@0: Char michael@0: michael@0: 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] michael@0: michael@0: 1 X X X X X X X X X X X michael@0: [a] X X X michael@0: 7 X X michael@0: 8 X X michael@0: 9 X michael@0: [b] X X michael@0: 15 X X X X X X michael@0: 18 X X X X X michael@0: COMPLEX X T michael@0: [c] X X X X X X X X X X X michael@0: [d] X X X X michael@0: michael@0: michael@0: 6. And Unicode has "NON-BREAK" characters. The lines should be broken around michael@0: them. But in JIS X 4051, such class is not, therefore, we create [e]. michael@0: michael@0: Class of michael@0: Leading Class of Trailing Char Class michael@0: Char michael@0: michael@0: 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] michael@0: michael@0: 1 X X X X X X X X X X X X michael@0: [a] X X X michael@0: 7 X X X michael@0: 8 X X X michael@0: 9 X X michael@0: [b] X X X michael@0: 15 X X X X X X X michael@0: 18 X X X X X X michael@0: COMPLEX X T X michael@0: [c] X X X X X X X X X X X X michael@0: [d] X X X X X michael@0: [e] X X X X X X X X X X X X michael@0: michael@0: michael@0: 7. Now we use one bit to encode weather it is breakable, and use 2 bytes michael@0: for one row, then the bit table will look like: michael@0: michael@0: 18 <- 1 michael@0: michael@0: 1 0000 1111 1111 1111 = 0x0FFF michael@0: [a] 0000 1100 0000 0010 = 0x0C02 michael@0: 7 0000 1000 0000 0110 = 0x0806 michael@0: 8 0000 1000 0100 0010 = 0x0842 michael@0: 9 0000 1000 0000 0010 = 0x0802 michael@0: [b] 0000 1100 0000 0010 = 0x0C02 michael@0: 15 0000 1110 1101 0010 = 0x0ED2 michael@0: 18 0000 1110 1100 0010 = 0x0EC2 michael@0: COMPLEX 0000 1001 0000 0010 = 0x0902 michael@0: [c] 0000 1111 1111 1111 = 0x0FFF michael@0: [d] 0000 1100 1100 0010 = 0x0CC2 michael@0: [e] 0000 1111 1111 1111 = 0x0FFF michael@0: */ michael@0: michael@0: #define MAX_CLASSES 12 michael@0: michael@0: static const uint16_t gPair[MAX_CLASSES] = { michael@0: 0x0FFF, michael@0: 0x0C02, michael@0: 0x0806, michael@0: 0x0842, michael@0: 0x0802, michael@0: 0x0C02, michael@0: 0x0ED2, michael@0: 0x0EC2, michael@0: 0x0902, michael@0: 0x0FFF, michael@0: 0x0CC2, michael@0: 0x0FFF michael@0: }; michael@0: michael@0: michael@0: /* michael@0: michael@0: 8. And if the character is not enough far from word start, word end and michael@0: another break point, we should not break in non-CJK languages. michael@0: I.e., Don't break around 15, 18, [c] and [d], but don't change michael@0: that if they are related to [b]. michael@0: michael@0: Class of michael@0: Leading Class of Trailing Char Class michael@0: Char michael@0: michael@0: 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] michael@0: michael@0: 1 X X X X X X X X X X X X michael@0: [a] X X X X X X michael@0: 7 X X X X X X X michael@0: 8 X X X X X X michael@0: 9 X X X X X X michael@0: [b] X X X michael@0: 15 X X X X X X X X X X X michael@0: 18 X X X X X X X X X X X michael@0: COMPLEX X X X T X X X michael@0: [c] X X X X X X X X X X X X michael@0: [d] X X X X X X X X X X X michael@0: [e] X X X X X X X X X X X X michael@0: michael@0: 18 <- 1 michael@0: michael@0: 1 0000 1111 1111 1111 = 0x0FFF michael@0: [a] 0000 1110 1100 0010 = 0x0EC2 michael@0: 7 0000 1110 1100 0110 = 0x0EC6 michael@0: 8 0000 1110 1100 0010 = 0x0EC2 michael@0: 9 0000 1110 1100 0010 = 0x0EC2 michael@0: [b] 0000 1100 0000 0010 = 0x0C02 michael@0: 15 0000 1111 1101 1111 = 0x0FDF michael@0: 18 0000 1111 1101 1111 = 0x0FDF michael@0: COMPLEX 0000 1111 1100 0010 = 0x0FC2 michael@0: [c] 0000 1111 1111 1111 = 0x0FFF michael@0: [d] 0000 1111 1101 1111 = 0x0FDF michael@0: [e] 0000 1111 1111 1111 = 0x0FFF michael@0: */ michael@0: michael@0: static const uint16_t gPairConservative[MAX_CLASSES] = { michael@0: 0x0FFF, michael@0: 0x0EC2, michael@0: 0x0EC6, michael@0: 0x0EC2, michael@0: 0x0EC2, michael@0: 0x0C02, michael@0: 0x0FDF, michael@0: 0x0FDF, michael@0: 0x0FC2, michael@0: 0x0FFF, michael@0: 0x0FDF, michael@0: 0x0FFF michael@0: }; michael@0: michael@0: michael@0: /* michael@0: michael@0: 9. Now we map the class to number michael@0: michael@0: 0: 1 michael@0: 1: [a]- 2, 3, 4, 5, 6 michael@0: 2: 7 michael@0: 3: 8 michael@0: 4: 9 michael@0: 5: [b]- 10, 11, 12, 17 michael@0: 6: 15 michael@0: 7: 18 michael@0: 8: COMPLEX michael@0: 9: [c] michael@0: A: [d] michael@0: B: [e] michael@0: michael@0: and they mean: michael@0: 0: Open parenthesis michael@0: 1: Punctuation that prohibits break before michael@0: 2: Non-breakable between same classes michael@0: 3: Prefix michael@0: 4: Postfix michael@0: 5: Breakable character (Spaces and Most Japanese characters) michael@0: 6: Numeric michael@0: 7: Characters michael@0: 8: Need special handling characters (E.g., Thai) michael@0: 9: Open parentheses like Character (See bug 389056) michael@0: A: Close parenthese (or punctuations) like Character (See bug 389056) michael@0: B: Non breakable (See bug 390920) michael@0: michael@0: */ michael@0: michael@0: #define CLASS_NONE INT8_MAX michael@0: michael@0: #define CLASS_OPEN 0x00 michael@0: #define CLASS_CLOSE 0x01 michael@0: #define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02 michael@0: #define CLASS_PREFIX 0x03 michael@0: #define CLASS_POSTFFIX 0x04 michael@0: #define CLASS_BREAKABLE 0x05 michael@0: #define CLASS_NUMERIC 0x06 michael@0: #define CLASS_CHARACTER 0x07 michael@0: #define CLASS_COMPLEX 0x08 michael@0: #define CLASS_OPEN_LIKE_CHARACTER 0x09 michael@0: #define CLASS_CLOSE_LIKE_CHARACTER 0x0A michael@0: #define CLASS_NON_BREAKABLE 0x0B michael@0: michael@0: #define U_NULL char16_t(0x0000) michael@0: #define U_SLASH char16_t('/') michael@0: #define U_SPACE char16_t(' ') michael@0: #define U_HYPHEN char16_t('-') michael@0: #define U_EQUAL char16_t('=') michael@0: #define U_PERCENT char16_t('%') michael@0: #define U_AMPERSAND char16_t('&') michael@0: #define U_SEMICOLON char16_t(';') michael@0: #define U_BACKSLASH char16_t('\\') michael@0: #define U_OPEN_SINGLE_QUOTE char16_t(0x2018) michael@0: #define U_OPEN_DOUBLE_QUOTE char16_t(0x201C) michael@0: #define U_OPEN_GUILLEMET char16_t(0x00AB) michael@0: michael@0: #define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \ michael@0: (c) == U_SLASH || \ michael@0: (c) == U_PERCENT || \ michael@0: (c) == U_AMPERSAND || \ michael@0: (c) == U_SEMICOLON || \ michael@0: (c) == U_BACKSLASH || \ michael@0: (c) == U_OPEN_SINGLE_QUOTE || \ michael@0: (c) == U_OPEN_DOUBLE_QUOTE || \ michael@0: (c) == U_OPEN_GUILLEMET) michael@0: michael@0: #define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) michael@0: michael@0: static inline int michael@0: GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) michael@0: { michael@0: return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f); michael@0: } michael@0: michael@0: static inline int michael@0: IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) michael@0: { michael@0: return ((0xff66 <= (u)) && ((u) <= 0xff70)); michael@0: } michael@0: michael@0: static inline int michael@0: IS_CJK_CHAR(char16_t u) michael@0: { michael@0: return ((0x1100 <= (u) && (u) <= 0x11ff) || michael@0: (0x2e80 <= (u) && (u) <= 0xd7ff) || michael@0: (0xf900 <= (u) && (u) <= 0xfaff) || michael@0: (0xff00 <= (u) && (u) <= 0xffef) ); michael@0: } michael@0: michael@0: static inline bool michael@0: IS_NONBREAKABLE_SPACE(char16_t u) michael@0: { michael@0: return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE michael@0: } michael@0: michael@0: static inline bool michael@0: IS_HYPHEN(char16_t u) michael@0: { michael@0: return (u == U_HYPHEN || michael@0: u == 0x058A || // ARMENIAN HYPHEN michael@0: u == 0x2010 || // HYPHEN michael@0: u == 0x2012 || // FIGURE DASH michael@0: u == 0x2013); // EN DASH michael@0: } michael@0: michael@0: static int8_t michael@0: GetClass(char16_t u) michael@0: { michael@0: uint16_t h = u & 0xFF00; michael@0: uint16_t l = u & 0x00ff; michael@0: int8_t c; michael@0: michael@0: // Handle 3 range table first michael@0: if (0x0000 == h) { michael@0: c = GETCLASSFROMTABLE(gLBClass00, l); michael@0: } else if (0x1700 == h) { michael@0: c = GETCLASSFROMTABLE(gLBClass17, l); michael@0: } else if (NS_NeedsPlatformNativeHandling(u)) { michael@0: c = CLASS_COMPLEX; michael@0: } else if (0x0E00 == h) { michael@0: c = GETCLASSFROMTABLE(gLBClass0E, l); michael@0: } else if (0x2000 == h) { michael@0: c = GETCLASSFROMTABLE(gLBClass20, l); michael@0: } else if (0x2100 == h) { michael@0: c = GETCLASSFROMTABLE(gLBClass21, l); michael@0: } else if (0x3000 == h) { michael@0: c = GETCLASSFROMTABLE(gLBClass30, l); michael@0: } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi michael@0: ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul michael@0: ((0xf900 <= h) && (h <= 0xfaff))) { michael@0: c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility michael@0: } else if (0xff00 == h) { michael@0: if (l < 0x0060) { // Fullwidth ASCII variant michael@0: c = GETCLASSFROMTABLE(gLBClass00, (l+0x20)); michael@0: } else if (l < 0x00a0) { michael@0: switch (l) { michael@0: case 0x61: c = GetClass(0x3002); break; michael@0: case 0x62: c = GetClass(0x300c); break; michael@0: case 0x63: c = GetClass(0x300d); break; michael@0: case 0x64: c = GetClass(0x3001); break; michael@0: case 0x65: c = GetClass(0x30fb); break; michael@0: case 0x9e: c = GetClass(0x309b); break; michael@0: case 0x9f: c = GetClass(0x309c); break; michael@0: default: michael@0: if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) michael@0: c = CLASS_CLOSE; // jis x4051 class 3 michael@0: else michael@0: c = CLASS_BREAKABLE; // jis x4051 class 11 michael@0: break; michael@0: } michael@0: // Halfwidth Katakana variants michael@0: } else if (l < 0x00e0) { michael@0: c = CLASS_CHARACTER; // Halfwidth Hangul variants michael@0: } else if (l < 0x00f0) { michael@0: static char16_t NarrowFFEx[16] = { michael@0: 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, michael@0: 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000 michael@0: }; michael@0: c = GetClass(NarrowFFEx[l - 0x00e0]); michael@0: } else { michael@0: c = CLASS_CHARACTER; michael@0: } michael@0: } else if (0x3100 == h) { michael@0: if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun michael@0: // XXX: This is per UAX #14, but UAX #14 may change michael@0: // the line breaking rules about Kanbun and Bopomofo. michael@0: c = CLASS_BREAKABLE; michael@0: } else if (l >= 0xf0) { // Katakana small letters for Ainu michael@0: c = CLASS_CLOSE; michael@0: } else { // unassigned michael@0: c = CLASS_CHARACTER; michael@0: } michael@0: } else if (0x0300 == h) { michael@0: if (0x4F == l || (0x5C <= l && l <= 0x62)) michael@0: c = CLASS_NON_BREAKABLE; michael@0: else michael@0: c = CLASS_CHARACTER; michael@0: } else if (0x0500 == h) { michael@0: // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) michael@0: if (l == 0x8A) michael@0: c = GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); michael@0: else michael@0: c = CLASS_CHARACTER; michael@0: } else if (0x0F00 == h) { michael@0: if (0x08 == l || 0x0C == l || 0x12 == l) michael@0: c = CLASS_NON_BREAKABLE; michael@0: else michael@0: c = CLASS_CHARACTER; michael@0: } else if (0x1800 == h) { michael@0: if (0x0E == l) michael@0: c = CLASS_NON_BREAKABLE; michael@0: else michael@0: c = CLASS_CHARACTER; michael@0: } else if (0x1600 == h) { michael@0: if (0x80 == l) { // U+1680 OGHAM SPACE MARK michael@0: c = CLASS_BREAKABLE; michael@0: } else { michael@0: c = CLASS_CHARACTER; michael@0: } michael@0: } else if (u == 0xfeff) { michael@0: c = CLASS_NON_BREAKABLE; michael@0: } else { michael@0: c = CLASS_CHARACTER; // others michael@0: } michael@0: return c; michael@0: } michael@0: michael@0: static bool michael@0: GetPair(int8_t c1, int8_t c2) michael@0: { michael@0: NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); michael@0: NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); michael@0: michael@0: return (0 == ((gPair[c1] >> c2) & 0x0001)); michael@0: } michael@0: michael@0: static bool michael@0: GetPairConservative(int8_t c1, int8_t c2) michael@0: { michael@0: NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); michael@0: NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); michael@0: michael@0: return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); michael@0: } michael@0: michael@0: nsJISx4051LineBreaker::nsJISx4051LineBreaker() michael@0: { michael@0: } michael@0: michael@0: nsJISx4051LineBreaker::~nsJISx4051LineBreaker() michael@0: { michael@0: } michael@0: michael@0: NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker) michael@0: michael@0: class ContextState { michael@0: public: michael@0: ContextState(const char16_t* aText, uint32_t aLength) { michael@0: mUniText = aText; michael@0: mText = nullptr; michael@0: mLength = aLength; michael@0: Init(); michael@0: } michael@0: michael@0: ContextState(const uint8_t* aText, uint32_t aLength) { michael@0: mUniText = nullptr; michael@0: mText = aText; michael@0: mLength = aLength; michael@0: Init(); michael@0: } michael@0: michael@0: uint32_t Length() { return mLength; } michael@0: uint32_t Index() { return mIndex; } michael@0: michael@0: char16_t GetCharAt(uint32_t aIndex) { michael@0: NS_ASSERTION(aIndex < mLength, "Out of range!"); michael@0: return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); michael@0: } michael@0: michael@0: void AdvanceIndex() { michael@0: ++mIndex; michael@0: } michael@0: michael@0: void NotifyBreakBefore() { mLastBreakIndex = mIndex; } michael@0: michael@0: // A word of western language should not be broken. But even if the word has michael@0: // only ASCII characters, non-natural context words should be broken, e.g., michael@0: // URL and file path. For protecting the natural words, we should use michael@0: // conservative breaking rules at following conditions: michael@0: // 1. at near the start of word michael@0: // 2. at near the end of word michael@0: // 3. at near the latest broken point michael@0: // CONSERVATIVE_BREAK_RANGE define the 'near' in characters. michael@0: #define CONSERVATIVE_BREAK_RANGE 6 michael@0: michael@0: bool UseConservativeBreaking(uint32_t aOffset = 0) { michael@0: if (mHasCJKChar) michael@0: return false; michael@0: uint32_t index = mIndex + aOffset; michael@0: bool result = (index < CONSERVATIVE_BREAK_RANGE || michael@0: mLength - index < CONSERVATIVE_BREAK_RANGE || michael@0: index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE); michael@0: if (result || !mHasNonbreakableSpace) michael@0: return result; michael@0: michael@0: // This text has no-breakable space, we need to check whether the index michael@0: // is near it. michael@0: michael@0: // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here. michael@0: for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) { michael@0: if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1))) michael@0: return true; michael@0: } michael@0: // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE. michael@0: for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) { michael@0: if (IS_NONBREAKABLE_SPACE(GetCharAt(i))) michael@0: return true; michael@0: } michael@0: return false; michael@0: } michael@0: michael@0: bool HasPreviousEqualsSign() const { michael@0: return mHasPreviousEqualsSign; michael@0: } michael@0: void NotifySeenEqualsSign() { michael@0: mHasPreviousEqualsSign = true; michael@0: } michael@0: michael@0: bool HasPreviousSlash() const { michael@0: return mHasPreviousSlash; michael@0: } michael@0: void NotifySeenSlash() { michael@0: mHasPreviousSlash = true; michael@0: } michael@0: michael@0: bool HasPreviousBackslash() const { michael@0: return mHasPreviousBackslash; michael@0: } michael@0: void NotifySeenBackslash() { michael@0: mHasPreviousBackslash = true; michael@0: } michael@0: michael@0: char16_t GetPreviousNonHyphenCharacter() const { michael@0: return mPreviousNonHyphenCharacter; michael@0: } michael@0: void NotifyNonHyphenCharacter(char16_t ch) { michael@0: mPreviousNonHyphenCharacter = ch; michael@0: } michael@0: michael@0: private: michael@0: void Init() { michael@0: mIndex = 0; michael@0: mLastBreakIndex = 0; michael@0: mPreviousNonHyphenCharacter = U_NULL; michael@0: mHasCJKChar = 0; michael@0: mHasNonbreakableSpace = 0; michael@0: mHasPreviousEqualsSign = false; michael@0: mHasPreviousSlash = false; michael@0: mHasPreviousBackslash = false; michael@0: michael@0: for (uint32_t i = 0; i < mLength; ++i) { michael@0: char16_t u = GetCharAt(i); michael@0: if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) michael@0: mHasNonbreakableSpace = 1; michael@0: else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u)) michael@0: mHasCJKChar = 1; michael@0: } michael@0: } michael@0: michael@0: const char16_t* mUniText; michael@0: const uint8_t* mText; michael@0: michael@0: uint32_t mIndex; michael@0: uint32_t mLength; // length of text michael@0: uint32_t mLastBreakIndex; michael@0: char16_t mPreviousNonHyphenCharacter; // The last character we have seen michael@0: // which is not U_HYPHEN michael@0: bool mHasCJKChar; // if the text has CJK character, this is true. michael@0: bool mHasNonbreakableSpace; // if the text has no-breakable space, michael@0: // this is true. michael@0: bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL michael@0: bool mHasPreviousSlash; // True if we have seen a U_SLASH michael@0: bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH michael@0: }; michael@0: michael@0: static int8_t michael@0: ContextualAnalysis(char16_t prev, char16_t cur, char16_t next, michael@0: ContextState &aState) michael@0: { michael@0: // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. michael@0: michael@0: if (IS_HYPHEN(cur)) { michael@0: // If next character is hyphen, we don't need to break between them. michael@0: if (IS_HYPHEN(next)) michael@0: return CLASS_CHARACTER; michael@0: // If prev and next characters are numeric, it may be in Math context. michael@0: // So, we should not break here. michael@0: bool prevIsNum = IS_ASCII_DIGIT(prev); michael@0: bool nextIsNum = IS_ASCII_DIGIT(next); michael@0: if (prevIsNum && nextIsNum) michael@0: return CLASS_NUMERIC; michael@0: // If one side is numeric and the other is a character, or if both sides are michael@0: // characters, the hyphen should be breakable. michael@0: if (!aState.UseConservativeBreaking(1)) { michael@0: char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); michael@0: if (prevOfHyphen && next) { michael@0: int8_t prevClass = GetClass(prevOfHyphen); michael@0: int8_t nextClass = GetClass(next); michael@0: bool prevIsNumOrCharOrClose = michael@0: prevIsNum || michael@0: (prevClass == CLASS_CHARACTER && michael@0: !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) || michael@0: prevClass == CLASS_CLOSE || michael@0: prevClass == CLASS_CLOSE_LIKE_CHARACTER; michael@0: bool nextIsNumOrCharOrOpen = michael@0: nextIsNum || michael@0: (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) || michael@0: nextClass == CLASS_OPEN || michael@0: nextClass == CLASS_OPEN_LIKE_CHARACTER || michael@0: next == U_OPEN_SINGLE_QUOTE || michael@0: next == U_OPEN_DOUBLE_QUOTE || michael@0: next == U_OPEN_GUILLEMET; michael@0: if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) { michael@0: return CLASS_CLOSE; michael@0: } michael@0: } michael@0: } michael@0: } else { michael@0: aState.NotifyNonHyphenCharacter(cur); michael@0: if (cur == U_SLASH || cur == U_BACKSLASH) { michael@0: // If this is immediately after same char, we should not break here. michael@0: if (prev == cur) michael@0: return CLASS_CHARACTER; michael@0: // If this text has two or more (BACK)SLASHs, this may be file path or URL. michael@0: // Make sure to compute shouldReturn before we notify on this slash. michael@0: bool shouldReturn = !aState.UseConservativeBreaking() && michael@0: (cur == U_SLASH ? michael@0: aState.HasPreviousSlash() : aState.HasPreviousBackslash()); michael@0: michael@0: if (cur == U_SLASH) { michael@0: aState.NotifySeenSlash(); michael@0: } else { michael@0: aState.NotifySeenBackslash(); michael@0: } michael@0: michael@0: if (shouldReturn) michael@0: return CLASS_OPEN; michael@0: } else if (cur == U_PERCENT) { michael@0: // If this is a part of the param of URL, we should break before. michael@0: if (!aState.UseConservativeBreaking()) { michael@0: if (aState.Index() >= 3 && michael@0: aState.GetCharAt(aState.Index() - 3) == U_PERCENT) michael@0: return CLASS_OPEN; michael@0: if (aState.Index() + 3 < aState.Length() && michael@0: aState.GetCharAt(aState.Index() + 3) == U_PERCENT) michael@0: return CLASS_OPEN; michael@0: } michael@0: } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) { michael@0: // If this may be a separator of params of URL, we should break after. michael@0: if (!aState.UseConservativeBreaking(1) && michael@0: aState.HasPreviousEqualsSign()) michael@0: return CLASS_CLOSE; michael@0: } else if (cur == U_OPEN_SINGLE_QUOTE || michael@0: cur == U_OPEN_DOUBLE_QUOTE || michael@0: cur == U_OPEN_GUILLEMET) { michael@0: // for CJK usage, we treat these as openers to allow a break before them, michael@0: // but otherwise treat them as normal characters because quote mark usage michael@0: // in various Western languages varies too much; see bug #450088 discussion. michael@0: if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next)) michael@0: return CLASS_OPEN; michael@0: } else { michael@0: NS_ERROR("Forgot to handle the current character!"); michael@0: } michael@0: } michael@0: return GetClass(cur); michael@0: } michael@0: michael@0: michael@0: int32_t michael@0: nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen, michael@0: uint32_t aPos, int8_t aDirection) michael@0: { michael@0: bool textNeedsJISx4051 = false; michael@0: int32_t begin, end; michael@0: michael@0: for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { michael@0: if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) { michael@0: textNeedsJISx4051 = true; michael@0: } michael@0: } michael@0: for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) { michael@0: if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) { michael@0: textNeedsJISx4051 = true; michael@0: } michael@0: } michael@0: michael@0: int32_t ret; michael@0: nsAutoTArray breakState; michael@0: if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) { michael@0: // No complex text character, do not try to do complex line break. michael@0: // (This is required for serializers. See Bug #344816.) michael@0: // Also fall back to this when out of memory. michael@0: if (aDirection < 0) { michael@0: ret = (begin == int32_t(aPos)) ? begin - 1 : begin; michael@0: } else { michael@0: ret = end; michael@0: } michael@0: } else { michael@0: GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal, michael@0: breakState.Elements()); michael@0: michael@0: ret = aPos; michael@0: do { michael@0: ret += aDirection; michael@0: } while (begin < ret && ret < end && !breakState[ret - begin]); michael@0: } michael@0: michael@0: return ret; michael@0: } michael@0: michael@0: int32_t michael@0: nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen, michael@0: uint32_t aPos) michael@0: { michael@0: NS_ASSERTION(aText, "aText shouldn't be null"); michael@0: NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next"); michael@0: michael@0: int32_t nextPos = WordMove(aText, aLen, aPos, 1); michael@0: return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT; michael@0: } michael@0: michael@0: int32_t michael@0: nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen, michael@0: uint32_t aPos) michael@0: { michael@0: NS_ASSERTION(aText, "aText shouldn't be null"); michael@0: NS_ASSERTION(aLen >= aPos && aPos > 0, michael@0: "Bad position passed to nsJISx4051LineBreaker::Prev"); michael@0: michael@0: int32_t prevPos = WordMove(aText, aLen, aPos, -1); michael@0: return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT; michael@0: } michael@0: michael@0: void michael@0: nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength, michael@0: uint8_t aWordBreak, michael@0: uint8_t* aBreakBefore) michael@0: { michael@0: uint32_t cur; michael@0: int8_t lastClass = CLASS_NONE; michael@0: ContextState state(aChars, aLength); michael@0: michael@0: for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { michael@0: char16_t ch = aChars[cur]; michael@0: int8_t cl; michael@0: michael@0: if (NEED_CONTEXTUAL_ANALYSIS(ch)) { michael@0: cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, michael@0: ch, michael@0: cur + 1 < aLength ? aChars[cur + 1] : U_NULL, michael@0: state); michael@0: } else { michael@0: if (ch == U_EQUAL) michael@0: state.NotifySeenEqualsSign(); michael@0: state.NotifyNonHyphenCharacter(ch); michael@0: cl = GetClass(ch); michael@0: } michael@0: michael@0: bool allowBreak = false; michael@0: if (cur > 0) { michael@0: NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, michael@0: "Loop should have prevented adjacent complex chars here"); michael@0: if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { michael@0: allowBreak = (state.UseConservativeBreaking()) ? michael@0: GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); michael@0: } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { michael@0: allowBreak = true; michael@0: } michael@0: } michael@0: aBreakBefore[cur] = allowBreak; michael@0: if (allowBreak) michael@0: state.NotifyBreakBefore(); michael@0: lastClass = cl; michael@0: if (CLASS_COMPLEX == cl) { michael@0: uint32_t end = cur + 1; michael@0: michael@0: while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) { michael@0: ++end; michael@0: } michael@0: michael@0: NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur); michael@0: michael@0: // We have to consider word-break value again for complex characters michael@0: if (aWordBreak != nsILineBreaker::kWordBreak_Normal) { michael@0: // Respect word-break property michael@0: for (uint32_t i = cur; i < end; i++) michael@0: aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll); michael@0: } michael@0: michael@0: // restore breakability at chunk begin, which was always set to false michael@0: // by the complex line breaker michael@0: aBreakBefore[cur] = allowBreak; michael@0: michael@0: cur = end - 1; michael@0: } michael@0: } michael@0: } michael@0: michael@0: void michael@0: nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength, michael@0: uint8_t aWordBreak, michael@0: uint8_t* aBreakBefore) michael@0: { michael@0: uint32_t cur; michael@0: int8_t lastClass = CLASS_NONE; michael@0: ContextState state(aChars, aLength); michael@0: michael@0: for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { michael@0: char16_t ch = aChars[cur]; michael@0: int8_t cl; michael@0: michael@0: if (NEED_CONTEXTUAL_ANALYSIS(ch)) { michael@0: cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, michael@0: ch, michael@0: cur + 1 < aLength ? aChars[cur + 1] : U_NULL, michael@0: state); michael@0: } else { michael@0: if (ch == U_EQUAL) michael@0: state.NotifySeenEqualsSign(); michael@0: state.NotifyNonHyphenCharacter(ch); michael@0: cl = GetClass(ch); michael@0: } michael@0: michael@0: bool allowBreak = false; michael@0: if (cur > 0) { michael@0: if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { michael@0: allowBreak = (state.UseConservativeBreaking()) ? michael@0: GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); michael@0: } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { michael@0: allowBreak = true; michael@0: } michael@0: } michael@0: aBreakBefore[cur] = allowBreak; michael@0: if (allowBreak) michael@0: state.NotifyBreakBefore(); michael@0: lastClass = cl; michael@0: } michael@0: }