Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | #include "mozInlineSpellWordUtil.h" |
michael@0 | 7 | #include "nsDebug.h" |
michael@0 | 8 | #include "nsIAtom.h" |
michael@0 | 9 | #include "nsComponentManagerUtils.h" |
michael@0 | 10 | #include "nsIDOMCSSStyleDeclaration.h" |
michael@0 | 11 | #include "nsIDOMElement.h" |
michael@0 | 12 | #include "nsIDOMRange.h" |
michael@0 | 13 | #include "nsIEditor.h" |
michael@0 | 14 | #include "nsIDOMNode.h" |
michael@0 | 15 | #include "nsIDOMHTMLBRElement.h" |
michael@0 | 16 | #include "nsUnicharUtilCIID.h" |
michael@0 | 17 | #include "nsUnicodeProperties.h" |
michael@0 | 18 | #include "nsServiceManagerUtils.h" |
michael@0 | 19 | #include "nsIContent.h" |
michael@0 | 20 | #include "nsTextFragment.h" |
michael@0 | 21 | #include "mozilla/dom/Element.h" |
michael@0 | 22 | #include "nsRange.h" |
michael@0 | 23 | #include "nsContentUtils.h" |
michael@0 | 24 | #include "nsIFrame.h" |
michael@0 | 25 | #include <algorithm> |
michael@0 | 26 | |
michael@0 | 27 | using namespace mozilla; |
michael@0 | 28 | |
// IsIgnorableCharacter
//
//    Returns true for the characters that should be ignored in input: the
//    two soft-hyphen code points listed below. Every other character yields
//    false.
//
//    Declared static so that this file-local helper has internal linkage,
//    like the other helpers in this file.

static inline bool
IsIgnorableCharacter(char16_t ch)
{
  return (ch == 0xAD ||    // SOFT HYPHEN
          ch == 0x1806);   // MONGOLIAN TODO SOFT HYPHEN
}
michael@0 | 38 | |
// IsConditionalPunctuation
//
//    Some characters (like apostrophes) require characters on each side to be
//    part of a word, and are otherwise punctuation. Returns true for exactly
//    those characters: the ASCII apostrophe and the two code points listed
//    below.
//
//    Declared static so that this file-local helper has internal linkage,
//    like the other helpers in this file.

static inline bool
IsConditionalPunctuation(char16_t ch)
{
  return (ch == '\'' ||    // APOSTROPHE
          ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
          ch == 0x00B7);   // MIDDLE DOT
}
michael@0 | 50 | |
michael@0 | 51 | // mozInlineSpellWordUtil::Init |
michael@0 | 52 | |
michael@0 | 53 | nsresult |
michael@0 | 54 | mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor) |
michael@0 | 55 | { |
michael@0 | 56 | nsresult rv; |
michael@0 | 57 | |
michael@0 | 58 | // getting the editor can fail commonly because the editor was detached, so |
michael@0 | 59 | // don't assert |
michael@0 | 60 | nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv); |
michael@0 | 61 | if (NS_FAILED(rv)) |
michael@0 | 62 | return rv; |
michael@0 | 63 | |
michael@0 | 64 | nsCOMPtr<nsIDOMDocument> domDoc; |
michael@0 | 65 | rv = editor->GetDocument(getter_AddRefs(domDoc)); |
michael@0 | 66 | NS_ENSURE_SUCCESS(rv, rv); |
michael@0 | 67 | NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER); |
michael@0 | 68 | |
michael@0 | 69 | mDOMDocument = domDoc; |
michael@0 | 70 | mDocument = do_QueryInterface(domDoc); |
michael@0 | 71 | |
michael@0 | 72 | // Find the root node for the editor. For contenteditable we'll need something |
michael@0 | 73 | // cleverer here. |
michael@0 | 74 | nsCOMPtr<nsIDOMElement> rootElt; |
michael@0 | 75 | rv = editor->GetRootElement(getter_AddRefs(rootElt)); |
michael@0 | 76 | NS_ENSURE_SUCCESS(rv, rv); |
michael@0 | 77 | |
michael@0 | 78 | nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt); |
michael@0 | 79 | mRootNode = rootNode; |
michael@0 | 80 | NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!"); |
michael@0 | 81 | return NS_OK; |
michael@0 | 82 | } |
michael@0 | 83 | |
michael@0 | 84 | static inline bool |
michael@0 | 85 | IsTextNode(nsINode* aNode) |
michael@0 | 86 | { |
michael@0 | 87 | return aNode->IsNodeOfType(nsINode::eTEXT); |
michael@0 | 88 | } |
michael@0 | 89 | |
michael@0 | 90 | typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure); |
michael@0 | 91 | |
michael@0 | 92 | // Find the next node in the DOM tree in preorder. |
michael@0 | 93 | // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is |
michael@0 | 94 | // why we can't just use GetNextNode here, sadly. |
michael@0 | 95 | static nsINode* |
michael@0 | 96 | FindNextNode(nsINode* aNode, nsINode* aRoot, |
michael@0 | 97 | OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) |
michael@0 | 98 | { |
michael@0 | 99 | NS_PRECONDITION(aNode, "Null starting node?"); |
michael@0 | 100 | |
michael@0 | 101 | nsINode* next = aNode->GetFirstChild(); |
michael@0 | 102 | if (next) |
michael@0 | 103 | return next; |
michael@0 | 104 | |
michael@0 | 105 | // Don't look at siblings or otherwise outside of aRoot |
michael@0 | 106 | if (aNode == aRoot) |
michael@0 | 107 | return nullptr; |
michael@0 | 108 | |
michael@0 | 109 | next = aNode->GetNextSibling(); |
michael@0 | 110 | if (next) |
michael@0 | 111 | return next; |
michael@0 | 112 | |
michael@0 | 113 | // Go up |
michael@0 | 114 | for (;;) { |
michael@0 | 115 | if (aOnLeaveNode) { |
michael@0 | 116 | aOnLeaveNode(aNode, aClosure); |
michael@0 | 117 | } |
michael@0 | 118 | |
michael@0 | 119 | next = aNode->GetParent(); |
michael@0 | 120 | if (next == aRoot || ! next) |
michael@0 | 121 | return nullptr; |
michael@0 | 122 | aNode = next; |
michael@0 | 123 | |
michael@0 | 124 | next = aNode->GetNextSibling(); |
michael@0 | 125 | if (next) |
michael@0 | 126 | return next; |
michael@0 | 127 | } |
michael@0 | 128 | } |
michael@0 | 129 | |
michael@0 | 130 | // aNode is not a text node. Find the first text node starting at aNode/aOffset |
michael@0 | 131 | // in a preorder DOM traversal. |
michael@0 | 132 | static nsINode* |
michael@0 | 133 | FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot) |
michael@0 | 134 | { |
michael@0 | 135 | NS_PRECONDITION(aNode, "Null starting node?"); |
michael@0 | 136 | NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node"); |
michael@0 | 137 | |
michael@0 | 138 | nsINode* checkNode; |
michael@0 | 139 | // Need to start at the aOffset'th child |
michael@0 | 140 | nsIContent* child = aNode->GetChildAt(aOffset); |
michael@0 | 141 | |
michael@0 | 142 | if (child) { |
michael@0 | 143 | checkNode = child; |
michael@0 | 144 | } else { |
michael@0 | 145 | // aOffset was beyond the end of the child list. |
michael@0 | 146 | // goto next node after the last descendant of aNode in |
michael@0 | 147 | // a preorder DOM traversal. |
michael@0 | 148 | checkNode = aNode->GetNextNonChildNode(aRoot); |
michael@0 | 149 | } |
michael@0 | 150 | |
michael@0 | 151 | while (checkNode && !IsTextNode(checkNode)) { |
michael@0 | 152 | checkNode = checkNode->GetNextNode(aRoot); |
michael@0 | 153 | } |
michael@0 | 154 | return checkNode; |
michael@0 | 155 | } |
michael@0 | 156 | |
michael@0 | 157 | // mozInlineSpellWordUtil::SetEnd |
michael@0 | 158 | // |
michael@0 | 159 | // We have two ranges "hard" and "soft". The hard boundary is simply |
michael@0 | 160 | // the scope of the root node. The soft boundary is that which is set |
michael@0 | 161 | // by the caller of this class by calling this function. If this function is |
michael@0 | 162 | // not called, the soft boundary is the same as the hard boundary. |
michael@0 | 163 | // |
michael@0 | 164 | // When we reach the soft boundary (mSoftEnd), we keep |
michael@0 | 165 | // going until we reach the end of a word. This allows the caller to set the |
michael@0 | 166 | // end of the range to anything, and we will always check whole multiples of |
michael@0 | 167 | // words. When we reach the hard boundary we stop no matter what. |
michael@0 | 168 | // |
michael@0 | 169 | // There is no beginning soft boundary. This is because we only go to the |
michael@0 | 170 | // previous node once, when finding the previous word boundary in |
michael@0 | 171 | // SetPosition(). You might think of the soft boundary as being this initial |
michael@0 | 172 | // position. |
michael@0 | 173 | |
michael@0 | 174 | nsresult |
michael@0 | 175 | mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset) |
michael@0 | 176 | { |
michael@0 | 177 | NS_PRECONDITION(aEndNode, "Null end node?"); |
michael@0 | 178 | |
michael@0 | 179 | NS_ASSERTION(mRootNode, "Not initialized"); |
michael@0 | 180 | |
michael@0 | 181 | InvalidateWords(); |
michael@0 | 182 | |
michael@0 | 183 | if (!IsTextNode(aEndNode)) { |
michael@0 | 184 | // End at the start of the first text node after aEndNode/aEndOffset. |
michael@0 | 185 | aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode); |
michael@0 | 186 | aEndOffset = 0; |
michael@0 | 187 | } |
michael@0 | 188 | mSoftEnd = NodeOffset(aEndNode, aEndOffset); |
michael@0 | 189 | return NS_OK; |
michael@0 | 190 | } |
michael@0 | 191 | |
michael@0 | 192 | nsresult |
michael@0 | 193 | mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset) |
michael@0 | 194 | { |
michael@0 | 195 | InvalidateWords(); |
michael@0 | 196 | |
michael@0 | 197 | if (!IsTextNode(aNode)) { |
michael@0 | 198 | // Start at the start of the first text node after aNode/aOffset. |
michael@0 | 199 | aNode = FindNextTextNode(aNode, aOffset, mRootNode); |
michael@0 | 200 | aOffset = 0; |
michael@0 | 201 | } |
michael@0 | 202 | mSoftBegin = NodeOffset(aNode, aOffset); |
michael@0 | 203 | |
michael@0 | 204 | EnsureWords(); |
michael@0 | 205 | |
michael@0 | 206 | int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin); |
michael@0 | 207 | if (textOffset < 0) |
michael@0 | 208 | return NS_OK; |
michael@0 | 209 | mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true); |
michael@0 | 210 | return NS_OK; |
michael@0 | 211 | } |
michael@0 | 212 | |
michael@0 | 213 | void |
michael@0 | 214 | mozInlineSpellWordUtil::EnsureWords() |
michael@0 | 215 | { |
michael@0 | 216 | if (mSoftTextValid) |
michael@0 | 217 | return; |
michael@0 | 218 | BuildSoftText(); |
michael@0 | 219 | BuildRealWords(); |
michael@0 | 220 | mSoftTextValid = true; |
michael@0 | 221 | } |
michael@0 | 222 | |
michael@0 | 223 | nsresult |
michael@0 | 224 | mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange) |
michael@0 | 225 | { |
michael@0 | 226 | NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN); |
michael@0 | 227 | NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END); |
michael@0 | 228 | return MakeRange(begin, end, aRange); |
michael@0 | 229 | } |
michael@0 | 230 | |
michael@0 | 231 | // mozInlineSpellWordUtil::GetRangeForWord |
michael@0 | 232 | |
michael@0 | 233 | nsresult |
michael@0 | 234 | mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode, |
michael@0 | 235 | int32_t aWordOffset, |
michael@0 | 236 | nsRange** aRange) |
michael@0 | 237 | { |
michael@0 | 238 | // Set our soft end and start |
michael@0 | 239 | nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode); |
michael@0 | 240 | NodeOffset pt = NodeOffset(wordNode, aWordOffset); |
michael@0 | 241 | |
michael@0 | 242 | InvalidateWords(); |
michael@0 | 243 | mSoftBegin = mSoftEnd = pt; |
michael@0 | 244 | EnsureWords(); |
michael@0 | 245 | |
michael@0 | 246 | int32_t offset = MapDOMPositionToSoftTextOffset(pt); |
michael@0 | 247 | if (offset < 0) |
michael@0 | 248 | return MakeRange(pt, pt, aRange); |
michael@0 | 249 | int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false); |
michael@0 | 250 | if (wordIndex < 0) |
michael@0 | 251 | return MakeRange(pt, pt, aRange); |
michael@0 | 252 | return MakeRangeForWord(mRealWords[wordIndex], aRange); |
michael@0 | 253 | } |
michael@0 | 254 | |
michael@0 | 255 | // This is to fix characters that the spellchecker may not like |
michael@0 | 256 | static void |
michael@0 | 257 | NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput) |
michael@0 | 258 | { |
michael@0 | 259 | aOutput.Truncate(); |
michael@0 | 260 | for (int32_t i = 0; i < aLen; i++) { |
michael@0 | 261 | char16_t ch = aInput.CharAt(i + aPos); |
michael@0 | 262 | |
michael@0 | 263 | // remove ignorable characters from the word |
michael@0 | 264 | if (IsIgnorableCharacter(ch)) |
michael@0 | 265 | continue; |
michael@0 | 266 | |
michael@0 | 267 | // the spellchecker doesn't handle curly apostrophes in all languages |
michael@0 | 268 | if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK |
michael@0 | 269 | ch = '\''; |
michael@0 | 270 | } |
michael@0 | 271 | |
michael@0 | 272 | aOutput.Append(ch); |
michael@0 | 273 | } |
michael@0 | 274 | } |
michael@0 | 275 | |
michael@0 | 276 | // mozInlineSpellWordUtil::GetNextWord |
michael@0 | 277 | // |
michael@0 | 278 | // FIXME-optimization: we shouldn't have to generate a range every single |
michael@0 | 279 | // time. It would be better if the inline spellchecker didn't require a |
michael@0 | 280 | // range unless the word was misspelled. This may or may not be possible. |
michael@0 | 281 | |
michael@0 | 282 | nsresult |
michael@0 | 283 | mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange, |
michael@0 | 284 | bool* aSkipChecking) |
michael@0 | 285 | { |
michael@0 | 286 | #ifdef DEBUG_SPELLCHECK |
michael@0 | 287 | printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex); |
michael@0 | 288 | #endif |
michael@0 | 289 | |
michael@0 | 290 | if (mNextWordIndex < 0 || |
michael@0 | 291 | mNextWordIndex >= int32_t(mRealWords.Length())) { |
michael@0 | 292 | mNextWordIndex = -1; |
michael@0 | 293 | *aRange = nullptr; |
michael@0 | 294 | *aSkipChecking = true; |
michael@0 | 295 | return NS_OK; |
michael@0 | 296 | } |
michael@0 | 297 | |
michael@0 | 298 | const RealWord& word = mRealWords[mNextWordIndex]; |
michael@0 | 299 | nsresult rv = MakeRangeForWord(word, aRange); |
michael@0 | 300 | NS_ENSURE_SUCCESS(rv, rv); |
michael@0 | 301 | ++mNextWordIndex; |
michael@0 | 302 | *aSkipChecking = !word.mCheckableWord; |
michael@0 | 303 | ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText); |
michael@0 | 304 | |
michael@0 | 305 | #ifdef DEBUG_SPELLCHECK |
michael@0 | 306 | printf("GetNextWord returning: %s (skip=%d)\n", |
michael@0 | 307 | NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking); |
michael@0 | 308 | #endif |
michael@0 | 309 | |
michael@0 | 310 | return NS_OK; |
michael@0 | 311 | } |
michael@0 | 312 | |
michael@0 | 313 | // mozInlineSpellWordUtil::MakeRange |
michael@0 | 314 | // |
michael@0 | 315 | // Convenience function for creating a range over the current document. |
michael@0 | 316 | |
michael@0 | 317 | nsresult |
michael@0 | 318 | mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd, |
michael@0 | 319 | nsRange** aRange) |
michael@0 | 320 | { |
michael@0 | 321 | NS_ENSURE_ARG_POINTER(aBegin.mNode); |
michael@0 | 322 | if (!mDOMDocument) |
michael@0 | 323 | return NS_ERROR_NOT_INITIALIZED; |
michael@0 | 324 | |
michael@0 | 325 | nsRefPtr<nsRange> range = new nsRange(aBegin.mNode); |
michael@0 | 326 | nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset, |
michael@0 | 327 | aEnd.mNode, aEnd.mOffset); |
michael@0 | 328 | NS_ENSURE_SUCCESS(rv, rv); |
michael@0 | 329 | range.forget(aRange); |
michael@0 | 330 | |
michael@0 | 331 | return NS_OK; |
michael@0 | 332 | } |
michael@0 | 333 | |
michael@0 | 334 | /*********** DOM text extraction ************/ |
michael@0 | 335 | |
// IsDOMWordSeparator
//
//    Tells whether ch separates DOM words. The set is deliberately limited to
//    whitespace: punctuation that could show up inside a URL or an email
//    address must not be listed, because those always have to fit into a
//    single DOM word.

static bool
IsDOMWordSeparator(char16_t ch)
{
  switch (ch) {
    // simple spaces
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    // complex (non-ASCII) spaces
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2002:  // EN SPACE
    case 0x2003:  // EM SPACE
    case 0x2009:  // THIN SPACE
    case 0x3000:  // IDEOGRAPHIC SPACE
      return true;

    // anything else is not a space
    default:
      return false;
  }
}
michael@0 | 364 | |
michael@0 | 365 | static inline bool |
michael@0 | 366 | IsBRElement(nsINode* aNode) |
michael@0 | 367 | { |
michael@0 | 368 | return aNode->IsElement() && |
michael@0 | 369 | aNode->AsElement()->IsHTML(nsGkAtoms::br); |
michael@0 | 370 | } |
michael@0 | 371 | |
michael@0 | 372 | /** |
michael@0 | 373 | * Given a TextNode, checks to see if there's a DOM word separator before |
michael@0 | 374 | * aBeforeOffset within it. This function does not modify aSeparatorOffset when |
michael@0 | 375 | * it returns false. |
michael@0 | 376 | * |
michael@0 | 377 | * @param aNode the TextNode to check. |
michael@0 | 378 | * @param aBeforeOffset the offset in the TextNode before which we will search |
michael@0 | 379 | * for the DOM separator. You can pass INT32_MAX to search the entire |
michael@0 | 380 | * length of the string. |
michael@0 | 381 | * @param aSeparatorOffset will be set to the offset of the first separator it |
michael@0 | 382 | * encounters. Will not be written to if no separator is found. |
michael@0 | 383 | * @returns True if it found a separator. |
michael@0 | 384 | */ |
michael@0 | 385 | static bool |
michael@0 | 386 | TextNodeContainsDOMWordSeparator(nsINode* aNode, |
michael@0 | 387 | int32_t aBeforeOffset, |
michael@0 | 388 | int32_t* aSeparatorOffset) |
michael@0 | 389 | { |
michael@0 | 390 | // aNode is actually an nsIContent, since it's eTEXT |
michael@0 | 391 | nsIContent* content = static_cast<nsIContent*>(aNode); |
michael@0 | 392 | const nsTextFragment* textFragment = content->GetText(); |
michael@0 | 393 | NS_ASSERTION(textFragment, "Where is our text?"); |
michael@0 | 394 | for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) { |
michael@0 | 395 | if (IsDOMWordSeparator(textFragment->CharAt(i))) { |
michael@0 | 396 | // Be greedy, find as many separators as we can |
michael@0 | 397 | for (int32_t j = i - 1; j >= 0; --j) { |
michael@0 | 398 | if (IsDOMWordSeparator(textFragment->CharAt(j))) { |
michael@0 | 399 | i = j; |
michael@0 | 400 | } else { |
michael@0 | 401 | break; |
michael@0 | 402 | } |
michael@0 | 403 | } |
michael@0 | 404 | *aSeparatorOffset = i; |
michael@0 | 405 | return true; |
michael@0 | 406 | } |
michael@0 | 407 | } |
michael@0 | 408 | return false; |
michael@0 | 409 | } |
michael@0 | 410 | |
michael@0 | 411 | /** |
michael@0 | 412 | * Check if there's a DOM word separator before aBeforeOffset in this node. |
michael@0 | 413 | * Always returns true if it's a BR element. |
michael@0 | 414 | * aSeparatorOffset is set to the index of the first character in the last |
michael@0 | 415 | * separator if any is found (0 for BR elements). |
michael@0 | 416 | * |
michael@0 | 417 | * This function does not modify aSeparatorOffset when it returns false. |
michael@0 | 418 | */ |
michael@0 | 419 | static bool |
michael@0 | 420 | ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset, |
michael@0 | 421 | int32_t* aSeparatorOffset) |
michael@0 | 422 | { |
michael@0 | 423 | if (IsBRElement(aNode)) { |
michael@0 | 424 | *aSeparatorOffset = 0; |
michael@0 | 425 | return true; |
michael@0 | 426 | } |
michael@0 | 427 | |
michael@0 | 428 | if (!IsTextNode(aNode)) |
michael@0 | 429 | return false; |
michael@0 | 430 | |
michael@0 | 431 | return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset, |
michael@0 | 432 | aSeparatorOffset); |
michael@0 | 433 | } |
michael@0 | 434 | |
michael@0 | 435 | static bool |
michael@0 | 436 | IsBreakElement(nsINode* aNode) |
michael@0 | 437 | { |
michael@0 | 438 | if (!aNode->IsElement()) { |
michael@0 | 439 | return false; |
michael@0 | 440 | } |
michael@0 | 441 | |
michael@0 | 442 | dom::Element *element = aNode->AsElement(); |
michael@0 | 443 | |
michael@0 | 444 | if (element->IsHTML(nsGkAtoms::br)) |
michael@0 | 445 | return true; |
michael@0 | 446 | |
michael@0 | 447 | // If we don't have a frame, we don't consider ourselves a break |
michael@0 | 448 | // element. In particular, words can span us. |
michael@0 | 449 | if (!element->GetPrimaryFrame()) |
michael@0 | 450 | return false; |
michael@0 | 451 | |
michael@0 | 452 | // Anything that's not an inline element is a break element. |
michael@0 | 453 | // XXXbz should replaced inlines be break elements, though? |
michael@0 | 454 | return element->GetPrimaryFrame()->StyleDisplay()->mDisplay != |
michael@0 | 455 | NS_STYLE_DISPLAY_INLINE; |
michael@0 | 456 | } |
michael@0 | 457 | |
michael@0 | 458 | struct CheckLeavingBreakElementClosure { |
michael@0 | 459 | bool mLeftBreakElement; |
michael@0 | 460 | }; |
michael@0 | 461 | |
michael@0 | 462 | static void |
michael@0 | 463 | CheckLeavingBreakElement(nsINode* aNode, void* aClosure) |
michael@0 | 464 | { |
michael@0 | 465 | CheckLeavingBreakElementClosure* cl = |
michael@0 | 466 | static_cast<CheckLeavingBreakElementClosure*>(aClosure); |
michael@0 | 467 | if (!cl->mLeftBreakElement && IsBreakElement(aNode)) { |
michael@0 | 468 | cl->mLeftBreakElement = true; |
michael@0 | 469 | } |
michael@0 | 470 | } |
michael@0 | 471 | |
michael@0 | 472 | void |
michael@0 | 473 | mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord) |
michael@0 | 474 | { |
michael@0 | 475 | nsAutoString result; |
michael@0 | 476 | ::NormalizeWord(aWord, 0, aWord.Length(), result); |
michael@0 | 477 | aWord = result; |
michael@0 | 478 | } |
michael@0 | 479 | |
michael@0 | 480 | void |
michael@0 | 481 | mozInlineSpellWordUtil::BuildSoftText() |
michael@0 | 482 | { |
michael@0 | 483 | // First we have to work backwards from mSoftStart to find a text node |
michael@0 | 484 | // containing a DOM word separator, a non-inline-element |
michael@0 | 485 | // boundary, or the hard start node. That's where we'll start building the |
michael@0 | 486 | // soft string from. |
michael@0 | 487 | nsINode* node = mSoftBegin.mNode; |
michael@0 | 488 | int32_t firstOffsetInNode = 0; |
michael@0 | 489 | int32_t checkBeforeOffset = mSoftBegin.mOffset; |
michael@0 | 490 | while (node) { |
michael@0 | 491 | if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) { |
michael@0 | 492 | if (node == mSoftBegin.mNode) { |
michael@0 | 493 | // If we find a word separator on the first node, look at the preceding |
michael@0 | 494 | // word on the text node as well. |
michael@0 | 495 | int32_t newOffset = 0; |
michael@0 | 496 | if (firstOffsetInNode > 0) { |
michael@0 | 497 | // Try to find the previous word boundary in the current node. If |
michael@0 | 498 | // we can't find one, start checking previous sibling nodes (if any |
michael@0 | 499 | // adjacent ones exist) to see if we can find any text nodes with |
michael@0 | 500 | // DOM word separators. We bail out as soon as we see a node that is |
michael@0 | 501 | // not a text node, or we run out of previous sibling nodes. In the |
michael@0 | 502 | // event that we simply cannot find any preceding word separator, the |
michael@0 | 503 | // offset is set to 0, and the soft text beginning node is set to the |
michael@0 | 504 | // "most previous" text node before the original starting node, or |
michael@0 | 505 | // kept at the original starting node if no previous text nodes exist. |
michael@0 | 506 | if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1, |
michael@0 | 507 | &newOffset)) { |
michael@0 | 508 | nsINode* prevNode = node->GetPreviousSibling(); |
michael@0 | 509 | while (prevNode && IsTextNode(prevNode)) { |
michael@0 | 510 | mSoftBegin.mNode = prevNode; |
michael@0 | 511 | if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX, |
michael@0 | 512 | &newOffset)) { |
michael@0 | 513 | break; |
michael@0 | 514 | } |
michael@0 | 515 | prevNode = prevNode->GetPreviousSibling(); |
michael@0 | 516 | } |
michael@0 | 517 | } |
michael@0 | 518 | } |
michael@0 | 519 | firstOffsetInNode = newOffset; |
michael@0 | 520 | mSoftBegin.mOffset = newOffset; |
michael@0 | 521 | } |
michael@0 | 522 | break; |
michael@0 | 523 | } |
michael@0 | 524 | checkBeforeOffset = INT32_MAX; |
michael@0 | 525 | if (IsBreakElement(node)) { |
michael@0 | 526 | // Since GetPreviousContent follows tree *preorder*, we're about to traverse |
michael@0 | 527 | // up out of 'node'. Since node induces breaks (e.g., it's a block), |
michael@0 | 528 | // don't bother trying to look outside it, just stop now. |
michael@0 | 529 | break; |
michael@0 | 530 | } |
michael@0 | 531 | // GetPreviousContent below expects mRootNode to be an ancestor of node. |
michael@0 | 532 | if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) { |
michael@0 | 533 | break; |
michael@0 | 534 | } |
michael@0 | 535 | node = node->GetPreviousContent(mRootNode); |
michael@0 | 536 | } |
michael@0 | 537 | |
michael@0 | 538 | // Now build up the string moving forward through the DOM until we reach |
michael@0 | 539 | // the soft end and *then* see a DOM word separator, a non-inline-element |
michael@0 | 540 | // boundary, or the hard end node. |
michael@0 | 541 | mSoftText.Truncate(); |
michael@0 | 542 | mSoftTextDOMMapping.Clear(); |
michael@0 | 543 | bool seenSoftEnd = false; |
michael@0 | 544 | // Leave this outside the loop so large heap string allocations can be reused |
michael@0 | 545 | // across iterations |
michael@0 | 546 | while (node) { |
michael@0 | 547 | if (node == mSoftEnd.mNode) { |
michael@0 | 548 | seenSoftEnd = true; |
michael@0 | 549 | } |
michael@0 | 550 | |
michael@0 | 551 | bool exit = false; |
michael@0 | 552 | if (IsTextNode(node)) { |
michael@0 | 553 | nsIContent* content = static_cast<nsIContent*>(node); |
michael@0 | 554 | NS_ASSERTION(content, "Where is our content?"); |
michael@0 | 555 | const nsTextFragment* textFragment = content->GetText(); |
michael@0 | 556 | NS_ASSERTION(textFragment, "Where is our text?"); |
michael@0 | 557 | int32_t lastOffsetInNode = textFragment->GetLength(); |
michael@0 | 558 | |
michael@0 | 559 | if (seenSoftEnd) { |
michael@0 | 560 | // check whether we can stop after this |
michael@0 | 561 | for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0; |
michael@0 | 562 | i < int32_t(textFragment->GetLength()); ++i) { |
michael@0 | 563 | if (IsDOMWordSeparator(textFragment->CharAt(i))) { |
michael@0 | 564 | exit = true; |
michael@0 | 565 | // stop at the first separator after the soft end point |
michael@0 | 566 | lastOffsetInNode = i; |
michael@0 | 567 | break; |
michael@0 | 568 | } |
michael@0 | 569 | } |
michael@0 | 570 | } |
michael@0 | 571 | |
michael@0 | 572 | if (firstOffsetInNode < lastOffsetInNode) { |
michael@0 | 573 | int32_t len = lastOffsetInNode - firstOffsetInNode; |
michael@0 | 574 | mSoftTextDOMMapping.AppendElement( |
michael@0 | 575 | DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len)); |
michael@0 | 576 | |
michael@0 | 577 | bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len, |
michael@0 | 578 | mozilla::fallible_t()); |
michael@0 | 579 | if (!ok) { |
michael@0 | 580 | // probably out of memory, remove from mSoftTextDOMMapping |
michael@0 | 581 | mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1); |
michael@0 | 582 | exit = true; |
michael@0 | 583 | } |
michael@0 | 584 | } |
michael@0 | 585 | |
michael@0 | 586 | firstOffsetInNode = 0; |
michael@0 | 587 | } |
michael@0 | 588 | |
michael@0 | 589 | if (exit) |
michael@0 | 590 | break; |
michael@0 | 591 | |
michael@0 | 592 | CheckLeavingBreakElementClosure closure = { false }; |
michael@0 | 593 | node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure); |
michael@0 | 594 | if (closure.mLeftBreakElement || (node && IsBreakElement(node))) { |
michael@0 | 595 | // We left, or are entering, a break element (e.g., block). Maybe we can |
michael@0 | 596 | // stop now. |
michael@0 | 597 | if (seenSoftEnd) |
michael@0 | 598 | break; |
michael@0 | 599 | // Record the break |
michael@0 | 600 | mSoftText.Append(' '); |
michael@0 | 601 | } |
michael@0 | 602 | } |
michael@0 | 603 | |
michael@0 | 604 | #ifdef DEBUG_SPELLCHECK |
michael@0 | 605 | printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get()); |
michael@0 | 606 | #endif |
michael@0 | 607 | } |
michael@0 | 608 | |
michael@0 | 609 | void |
michael@0 | 610 | mozInlineSpellWordUtil::BuildRealWords() |
michael@0 | 611 | { |
michael@0 | 612 | // This is pretty simple. We just have to walk mSoftText, tokenizing it |
michael@0 | 613 | // into "real words". |
michael@0 | 614 | // We do an outer traversal of words delimited by IsDOMWordSeparator, calling |
michael@0 | 615 | // SplitDOMWord on each of those DOM words |
michael@0 | 616 | int32_t wordStart = -1; |
michael@0 | 617 | mRealWords.Clear(); |
michael@0 | 618 | for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) { |
michael@0 | 619 | if (IsDOMWordSeparator(mSoftText.CharAt(i))) { |
michael@0 | 620 | if (wordStart >= 0) { |
michael@0 | 621 | SplitDOMWord(wordStart, i); |
michael@0 | 622 | wordStart = -1; |
michael@0 | 623 | } |
michael@0 | 624 | } else { |
michael@0 | 625 | if (wordStart < 0) { |
michael@0 | 626 | wordStart = i; |
michael@0 | 627 | } |
michael@0 | 628 | } |
michael@0 | 629 | } |
michael@0 | 630 | if (wordStart >= 0) { |
michael@0 | 631 | SplitDOMWord(wordStart, mSoftText.Length()); |
michael@0 | 632 | } |
michael@0 | 633 | } |
michael@0 | 634 | |
michael@0 | 635 | /*********** DOM/realwords<->mSoftText mapping functions ************/ |
michael@0 | 636 | |
michael@0 | 637 | int32_t |
michael@0 | 638 | mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset) |
michael@0 | 639 | { |
michael@0 | 640 | if (!mSoftTextValid) { |
michael@0 | 641 | NS_ERROR("Soft text must be valid if we're to map into it"); |
michael@0 | 642 | return -1; |
michael@0 | 643 | } |
michael@0 | 644 | |
michael@0 | 645 | for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) { |
michael@0 | 646 | const DOMTextMapping& map = mSoftTextDOMMapping[i]; |
michael@0 | 647 | if (map.mNodeOffset.mNode == aNodeOffset.mNode) { |
michael@0 | 648 | // Allow offsets at either end of the string, in particular, allow the |
michael@0 | 649 | // offset that's at the end of the contributed string |
michael@0 | 650 | int32_t offsetInContributedString = |
michael@0 | 651 | aNodeOffset.mOffset - map.mNodeOffset.mOffset; |
michael@0 | 652 | if (offsetInContributedString >= 0 && |
michael@0 | 653 | offsetInContributedString <= map.mLength) |
michael@0 | 654 | return map.mSoftTextOffset + offsetInContributedString; |
michael@0 | 655 | return -1; |
michael@0 | 656 | } |
michael@0 | 657 | } |
michael@0 | 658 | return -1; |
michael@0 | 659 | } |
michael@0 | 660 | |
michael@0 | 661 | mozInlineSpellWordUtil::NodeOffset |
michael@0 | 662 | mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset, |
michael@0 | 663 | DOMMapHint aHint) |
michael@0 | 664 | { |
michael@0 | 665 | NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it"); |
michael@0 | 666 | if (!mSoftTextValid) |
michael@0 | 667 | return NodeOffset(nullptr, -1); |
michael@0 | 668 | |
michael@0 | 669 | // The invariant is that the range start..end includes the last mapping, |
michael@0 | 670 | // if any, such that mSoftTextOffset <= aSoftTextOffset |
michael@0 | 671 | int32_t start = 0; |
michael@0 | 672 | int32_t end = mSoftTextDOMMapping.Length(); |
michael@0 | 673 | while (end - start >= 2) { |
michael@0 | 674 | int32_t mid = (start + end)/2; |
michael@0 | 675 | const DOMTextMapping& map = mSoftTextDOMMapping[mid]; |
michael@0 | 676 | if (map.mSoftTextOffset > aSoftTextOffset) { |
michael@0 | 677 | end = mid; |
michael@0 | 678 | } else { |
michael@0 | 679 | start = mid; |
michael@0 | 680 | } |
michael@0 | 681 | } |
michael@0 | 682 | |
michael@0 | 683 | if (start >= end) |
michael@0 | 684 | return NodeOffset(nullptr, -1); |
michael@0 | 685 | |
michael@0 | 686 | // 'start' is now the last mapping, if any, such that |
michael@0 | 687 | // mSoftTextOffset <= aSoftTextOffset. |
michael@0 | 688 | // If we're doing HINT_END, then we may want to return the end of the |
michael@0 | 689 | // the previous mapping instead of the start of this mapping |
michael@0 | 690 | if (aHint == HINT_END && start > 0) { |
michael@0 | 691 | const DOMTextMapping& map = mSoftTextDOMMapping[start - 1]; |
michael@0 | 692 | if (map.mSoftTextOffset + map.mLength == aSoftTextOffset) |
michael@0 | 693 | return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength); |
michael@0 | 694 | } |
michael@0 | 695 | |
michael@0 | 696 | // We allow ourselves to return the end of this mapping even if we're |
michael@0 | 697 | // doing HINT_START. This will only happen if there is no mapping which this |
michael@0 | 698 | // point is the start of. I'm not 100% sure this is OK... |
michael@0 | 699 | const DOMTextMapping& map = mSoftTextDOMMapping[start]; |
michael@0 | 700 | int32_t offset = aSoftTextOffset - map.mSoftTextOffset; |
michael@0 | 701 | if (offset >= 0 && offset <= map.mLength) |
michael@0 | 702 | return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset); |
michael@0 | 703 | |
michael@0 | 704 | return NodeOffset(nullptr, -1); |
michael@0 | 705 | } |
michael@0 | 706 | |
michael@0 | 707 | int32_t |
michael@0 | 708 | mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset, |
michael@0 | 709 | DOMMapHint aHint, bool aSearchForward) |
michael@0 | 710 | { |
michael@0 | 711 | NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it"); |
michael@0 | 712 | if (!mSoftTextValid) |
michael@0 | 713 | return -1; |
michael@0 | 714 | |
michael@0 | 715 | // The invariant is that the range start..end includes the last word, |
michael@0 | 716 | // if any, such that mSoftTextOffset <= aSoftTextOffset |
michael@0 | 717 | int32_t start = 0; |
michael@0 | 718 | int32_t end = mRealWords.Length(); |
michael@0 | 719 | while (end - start >= 2) { |
michael@0 | 720 | int32_t mid = (start + end)/2; |
michael@0 | 721 | const RealWord& word = mRealWords[mid]; |
michael@0 | 722 | if (word.mSoftTextOffset > aSoftTextOffset) { |
michael@0 | 723 | end = mid; |
michael@0 | 724 | } else { |
michael@0 | 725 | start = mid; |
michael@0 | 726 | } |
michael@0 | 727 | } |
michael@0 | 728 | |
michael@0 | 729 | if (start >= end) |
michael@0 | 730 | return -1; |
michael@0 | 731 | |
michael@0 | 732 | // 'start' is now the last word, if any, such that |
michael@0 | 733 | // mSoftTextOffset <= aSoftTextOffset. |
michael@0 | 734 | // If we're doing HINT_END, then we may want to return the end of the |
michael@0 | 735 | // the previous word instead of the start of this word |
michael@0 | 736 | if (aHint == HINT_END && start > 0) { |
michael@0 | 737 | const RealWord& word = mRealWords[start - 1]; |
michael@0 | 738 | if (word.mSoftTextOffset + word.mLength == aSoftTextOffset) |
michael@0 | 739 | return start - 1; |
michael@0 | 740 | } |
michael@0 | 741 | |
michael@0 | 742 | // We allow ourselves to return the end of this word even if we're |
michael@0 | 743 | // doing HINT_START. This will only happen if there is no word which this |
michael@0 | 744 | // point is the start of. I'm not 100% sure this is OK... |
michael@0 | 745 | const RealWord& word = mRealWords[start]; |
michael@0 | 746 | int32_t offset = aSoftTextOffset - word.mSoftTextOffset; |
michael@0 | 747 | if (offset >= 0 && offset <= word.mLength) |
michael@0 | 748 | return start; |
michael@0 | 749 | |
michael@0 | 750 | if (aSearchForward) { |
michael@0 | 751 | if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) { |
michael@0 | 752 | // All words have mSoftTextOffset > aSoftTextOffset |
michael@0 | 753 | return 0; |
michael@0 | 754 | } |
michael@0 | 755 | // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset. |
michael@0 | 756 | // Word start+1, if it exists, will be the first with |
michael@0 | 757 | // mSoftTextOffset > aSoftTextOffset. |
michael@0 | 758 | if (start + 1 < int32_t(mRealWords.Length())) |
michael@0 | 759 | return start + 1; |
michael@0 | 760 | } |
michael@0 | 761 | |
michael@0 | 762 | return -1; |
michael@0 | 763 | } |
michael@0 | 764 | |
michael@0 | 765 | /*********** Word Splitting ************/ |
michael@0 | 766 | |
// Classification assigned to a position within the DOM word while it is
// being split into real words.
enum CharClass {
  // The character belongs to a word.
  CHAR_CLASS_WORD,
  // The character separates words.
  CHAR_CLASS_SEPARATOR,
  // The scan position has moved past the last character of the DOM word.
  CHAR_CLASS_END_OF_INPUT
};
michael@0 | 772 | |
michael@0 | 773 | // Encapsulates DOM-word to real-word splitting |
michael@0 | 774 | struct MOZ_STACK_CLASS WordSplitState |
michael@0 | 775 | { |
michael@0 | 776 | mozInlineSpellWordUtil* mWordUtil; |
michael@0 | 777 | const nsDependentSubstring mDOMWordText; |
michael@0 | 778 | int32_t mDOMWordOffset; |
michael@0 | 779 | CharClass mCurCharClass; |
michael@0 | 780 | |
michael@0 | 781 | WordSplitState(mozInlineSpellWordUtil* aWordUtil, |
michael@0 | 782 | const nsString& aString, int32_t aStart, int32_t aLen) |
michael@0 | 783 | : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen), |
michael@0 | 784 | mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {} |
michael@0 | 785 | |
michael@0 | 786 | CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const; |
michael@0 | 787 | void Advance(); |
michael@0 | 788 | void AdvanceThroughSeparators(); |
michael@0 | 789 | void AdvanceThroughWord(); |
michael@0 | 790 | |
michael@0 | 791 | // Finds special words like email addresses and URLs that may start at the |
michael@0 | 792 | // current position, and returns their length, or 0 if not found. This allows |
michael@0 | 793 | // arbitrary word breaking rules to be used for these special entities, as |
michael@0 | 794 | // long as they can not contain whitespace. |
michael@0 | 795 | bool IsSpecialWord(); |
michael@0 | 796 | |
michael@0 | 797 | // Similar to IsSpecialWord except that this takes a split word as |
michael@0 | 798 | // input. This checks for things that do not require special word-breaking |
michael@0 | 799 | // rules. |
michael@0 | 800 | bool ShouldSkipWord(int32_t aStart, int32_t aLength); |
michael@0 | 801 | }; |
michael@0 | 802 | |
michael@0 | 803 | // WordSplitState::ClassifyCharacter |
michael@0 | 804 | |
michael@0 | 805 | CharClass |
michael@0 | 806 | WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const |
michael@0 | 807 | { |
michael@0 | 808 | NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()), |
michael@0 | 809 | "Index out of range"); |
michael@0 | 810 | if (aIndex == int32_t(mDOMWordText.Length())) |
michael@0 | 811 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 812 | |
michael@0 | 813 | // this will classify the character, we want to treat "ignorable" characters |
michael@0 | 814 | // such as soft hyphens, and also ZWJ and ZWNJ as word characters. |
michael@0 | 815 | nsIUGenCategory::nsUGenCategory |
michael@0 | 816 | charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]); |
michael@0 | 817 | if (charCategory == nsIUGenCategory::kLetter || |
michael@0 | 818 | IsIgnorableCharacter(mDOMWordText[aIndex]) || |
michael@0 | 819 | mDOMWordText[aIndex] == 0x200C /* ZWNJ */ || |
michael@0 | 820 | mDOMWordText[aIndex] == 0x200D /* ZWJ */) |
michael@0 | 821 | return CHAR_CLASS_WORD; |
michael@0 | 822 | |
michael@0 | 823 | // If conditional punctuation is surrounded immediately on both sides by word |
michael@0 | 824 | // characters it also counts as a word character. |
michael@0 | 825 | if (IsConditionalPunctuation(mDOMWordText[aIndex])) { |
michael@0 | 826 | if (!aRecurse) { |
michael@0 | 827 | // not allowed to look around, this punctuation counts like a separator |
michael@0 | 828 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 829 | } |
michael@0 | 830 | |
michael@0 | 831 | // check the left-hand character |
michael@0 | 832 | if (aIndex == 0) |
michael@0 | 833 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 834 | if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) |
michael@0 | 835 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 836 | // If the previous charatcer is a word-char, make sure that it's not a |
michael@0 | 837 | // special dot character. |
michael@0 | 838 | if (mDOMWordText[aIndex - 1] == '.') |
michael@0 | 839 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 840 | |
michael@0 | 841 | // now we know left char is a word-char, check the right-hand character |
michael@0 | 842 | if (aIndex == int32_t(mDOMWordText.Length()) - 1) |
michael@0 | 843 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 844 | if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD) |
michael@0 | 845 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 846 | // If the next charatcer is a word-char, make sure that it's not a |
michael@0 | 847 | // special dot character. |
michael@0 | 848 | if (mDOMWordText[aIndex + 1] == '.') |
michael@0 | 849 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 850 | |
michael@0 | 851 | // char on either side is a word, this counts as a word |
michael@0 | 852 | return CHAR_CLASS_WORD; |
michael@0 | 853 | } |
michael@0 | 854 | |
michael@0 | 855 | // The dot character, if appearing at the end of a word, should |
michael@0 | 856 | // be considered part of that word. Example: "etc.", or |
michael@0 | 857 | // abbreviations |
michael@0 | 858 | if (aIndex > 0 && |
michael@0 | 859 | mDOMWordText[aIndex] == '.' && |
michael@0 | 860 | mDOMWordText[aIndex - 1] != '.' && |
michael@0 | 861 | ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) { |
michael@0 | 862 | return CHAR_CLASS_WORD; |
michael@0 | 863 | } |
michael@0 | 864 | |
michael@0 | 865 | // all other punctuation |
michael@0 | 866 | if (charCategory == nsIUGenCategory::kSeparator || |
michael@0 | 867 | charCategory == nsIUGenCategory::kOther || |
michael@0 | 868 | charCategory == nsIUGenCategory::kPunctuation || |
michael@0 | 869 | charCategory == nsIUGenCategory::kSymbol) { |
michael@0 | 870 | // Don't break on hyphens, as hunspell handles them on its own. |
michael@0 | 871 | if (aIndex > 0 && |
michael@0 | 872 | mDOMWordText[aIndex] == '-' && |
michael@0 | 873 | mDOMWordText[aIndex - 1] != '-' && |
michael@0 | 874 | ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) { |
michael@0 | 875 | // A hyphen is only meaningful as a separator inside a word |
michael@0 | 876 | // if the previous and next characters are a word character. |
michael@0 | 877 | if (aIndex == int32_t(mDOMWordText.Length()) - 1) |
michael@0 | 878 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 879 | if (mDOMWordText[aIndex + 1] != '.' && |
michael@0 | 880 | ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD) |
michael@0 | 881 | return CHAR_CLASS_WORD; |
michael@0 | 882 | } |
michael@0 | 883 | return CHAR_CLASS_SEPARATOR; |
michael@0 | 884 | } |
michael@0 | 885 | |
michael@0 | 886 | // any other character counts as a word |
michael@0 | 887 | return CHAR_CLASS_WORD; |
michael@0 | 888 | } |
michael@0 | 889 | |
michael@0 | 890 | |
michael@0 | 891 | // WordSplitState::Advance |
michael@0 | 892 | |
michael@0 | 893 | void |
michael@0 | 894 | WordSplitState::Advance() |
michael@0 | 895 | { |
michael@0 | 896 | NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index"); |
michael@0 | 897 | NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(), |
michael@0 | 898 | "Length beyond end"); |
michael@0 | 899 | |
michael@0 | 900 | mDOMWordOffset ++; |
michael@0 | 901 | if (mDOMWordOffset >= (int32_t)mDOMWordText.Length()) |
michael@0 | 902 | mCurCharClass = CHAR_CLASS_END_OF_INPUT; |
michael@0 | 903 | else |
michael@0 | 904 | mCurCharClass = ClassifyCharacter(mDOMWordOffset, true); |
michael@0 | 905 | } |
michael@0 | 906 | |
michael@0 | 907 | |
michael@0 | 908 | // WordSplitState::AdvanceThroughSeparators |
michael@0 | 909 | |
michael@0 | 910 | void |
michael@0 | 911 | WordSplitState::AdvanceThroughSeparators() |
michael@0 | 912 | { |
michael@0 | 913 | while (mCurCharClass == CHAR_CLASS_SEPARATOR) |
michael@0 | 914 | Advance(); |
michael@0 | 915 | } |
michael@0 | 916 | |
michael@0 | 917 | // WordSplitState::AdvanceThroughWord |
michael@0 | 918 | |
michael@0 | 919 | void |
michael@0 | 920 | WordSplitState::AdvanceThroughWord() |
michael@0 | 921 | { |
michael@0 | 922 | while (mCurCharClass == CHAR_CLASS_WORD) |
michael@0 | 923 | Advance(); |
michael@0 | 924 | } |
michael@0 | 925 | |
michael@0 | 926 | |
michael@0 | 927 | // WordSplitState::IsSpecialWord |
michael@0 | 928 | |
michael@0 | 929 | bool |
michael@0 | 930 | WordSplitState::IsSpecialWord() |
michael@0 | 931 | { |
michael@0 | 932 | // Search for email addresses. We simply define these as any sequence of |
michael@0 | 933 | // characters with an '@' character in the middle. The DOM word is already |
michael@0 | 934 | // split on whitepace, so we know that everything to the end is the address |
michael@0 | 935 | int32_t firstColon = -1; |
michael@0 | 936 | for (int32_t i = mDOMWordOffset; |
michael@0 | 937 | i < int32_t(mDOMWordText.Length()); i ++) { |
michael@0 | 938 | if (mDOMWordText[i] == '@') { |
michael@0 | 939 | // only accept this if there are unambiguous word characters (don't bother |
michael@0 | 940 | // recursing to disambiguate apostrophes) on each side. This prevents |
michael@0 | 941 | // classifying, e.g. "@home" as an email address |
michael@0 | 942 | |
michael@0 | 943 | // Use this condition to only accept words with '@' in the middle of |
michael@0 | 944 | // them. It works, but the inlinespellcker doesn't like this. The problem |
michael@0 | 945 | // is that you type "fhsgfh@" that's a misspelled word followed by a |
michael@0 | 946 | // symbol, but when you type another letter "fhsgfh@g" that first word |
michael@0 | 947 | // need to be unmarked misspelled. It doesn't do this. it only checks the |
michael@0 | 948 | // current position for potentially removing a spelling range. |
michael@0 | 949 | if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD && |
michael@0 | 950 | i < (int32_t)mDOMWordText.Length() - 1 && |
michael@0 | 951 | ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) { |
michael@0 | 952 | return true; |
michael@0 | 953 | } |
michael@0 | 954 | } else if (mDOMWordText[i] == ':' && firstColon < 0) { |
michael@0 | 955 | firstColon = i; |
michael@0 | 956 | |
michael@0 | 957 | // If the first colon is followed by a slash, consider it a URL |
michael@0 | 958 | // This will catch things like asdf://foo.com |
michael@0 | 959 | if (firstColon < (int32_t)mDOMWordText.Length() - 1 && |
michael@0 | 960 | mDOMWordText[firstColon + 1] == '/') { |
michael@0 | 961 | return true; |
michael@0 | 962 | } |
michael@0 | 963 | } |
michael@0 | 964 | } |
michael@0 | 965 | |
michael@0 | 966 | // Check the text before the first colon against some known protocols. It |
michael@0 | 967 | // is impossible to check against all protocols, especially since you can |
michael@0 | 968 | // plug in new protocols. We also don't want to waste time here checking |
michael@0 | 969 | // against a lot of obscure protocols. |
michael@0 | 970 | if (firstColon > mDOMWordOffset) { |
michael@0 | 971 | nsString protocol(Substring(mDOMWordText, mDOMWordOffset, |
michael@0 | 972 | firstColon - mDOMWordOffset)); |
michael@0 | 973 | if (protocol.EqualsIgnoreCase("http") || |
michael@0 | 974 | protocol.EqualsIgnoreCase("https") || |
michael@0 | 975 | protocol.EqualsIgnoreCase("news") || |
michael@0 | 976 | protocol.EqualsIgnoreCase("file") || |
michael@0 | 977 | protocol.EqualsIgnoreCase("javascript") || |
michael@0 | 978 | protocol.EqualsIgnoreCase("data") || |
michael@0 | 979 | protocol.EqualsIgnoreCase("ftp")) { |
michael@0 | 980 | return true; |
michael@0 | 981 | } |
michael@0 | 982 | } |
michael@0 | 983 | |
michael@0 | 984 | // not anything special |
michael@0 | 985 | return false; |
michael@0 | 986 | } |
michael@0 | 987 | |
michael@0 | 988 | // WordSplitState::ShouldSkipWord |
michael@0 | 989 | |
michael@0 | 990 | bool |
michael@0 | 991 | WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength) |
michael@0 | 992 | { |
michael@0 | 993 | int32_t last = aStart + aLength; |
michael@0 | 994 | |
michael@0 | 995 | // check to see if the word contains a digit |
michael@0 | 996 | for (int32_t i = aStart; i < last; i ++) { |
michael@0 | 997 | if (unicode::GetGenCategory(mDOMWordText[i]) == nsIUGenCategory::kNumber) { |
michael@0 | 998 | return true; |
michael@0 | 999 | } |
michael@0 | 1000 | } |
michael@0 | 1001 | |
michael@0 | 1002 | // not special |
michael@0 | 1003 | return false; |
michael@0 | 1004 | } |
michael@0 | 1005 | |
michael@0 | 1006 | // mozInlineSpellWordUtil::SplitDOMWord |
michael@0 | 1007 | |
michael@0 | 1008 | void |
michael@0 | 1009 | mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd) |
michael@0 | 1010 | { |
michael@0 | 1011 | WordSplitState state(this, mSoftText, aStart, aEnd - aStart); |
michael@0 | 1012 | state.mCurCharClass = state.ClassifyCharacter(0, true); |
michael@0 | 1013 | |
michael@0 | 1014 | state.AdvanceThroughSeparators(); |
michael@0 | 1015 | if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && |
michael@0 | 1016 | state.IsSpecialWord()) { |
michael@0 | 1017 | int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset; |
michael@0 | 1018 | mRealWords.AppendElement( |
michael@0 | 1019 | RealWord(aStart + state.mDOMWordOffset, specialWordLength, false)); |
michael@0 | 1020 | |
michael@0 | 1021 | return; |
michael@0 | 1022 | } |
michael@0 | 1023 | |
michael@0 | 1024 | while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) { |
michael@0 | 1025 | state.AdvanceThroughSeparators(); |
michael@0 | 1026 | if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) |
michael@0 | 1027 | break; |
michael@0 | 1028 | |
michael@0 | 1029 | // save the beginning of the word |
michael@0 | 1030 | int32_t wordOffset = state.mDOMWordOffset; |
michael@0 | 1031 | |
michael@0 | 1032 | // find the end of the word |
michael@0 | 1033 | state.AdvanceThroughWord(); |
michael@0 | 1034 | int32_t wordLen = state.mDOMWordOffset - wordOffset; |
michael@0 | 1035 | mRealWords.AppendElement( |
michael@0 | 1036 | RealWord(aStart + wordOffset, wordLen, |
michael@0 | 1037 | !state.ShouldSkipWord(wordOffset, wordLen))); |
michael@0 | 1038 | } |
michael@0 | 1039 | } |