1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1039 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "mozInlineSpellWordUtil.h" 1.10 +#include "nsDebug.h" 1.11 +#include "nsIAtom.h" 1.12 +#include "nsComponentManagerUtils.h" 1.13 +#include "nsIDOMCSSStyleDeclaration.h" 1.14 +#include "nsIDOMElement.h" 1.15 +#include "nsIDOMRange.h" 1.16 +#include "nsIEditor.h" 1.17 +#include "nsIDOMNode.h" 1.18 +#include "nsIDOMHTMLBRElement.h" 1.19 +#include "nsUnicharUtilCIID.h" 1.20 +#include "nsUnicodeProperties.h" 1.21 +#include "nsServiceManagerUtils.h" 1.22 +#include "nsIContent.h" 1.23 +#include "nsTextFragment.h" 1.24 +#include "mozilla/dom/Element.h" 1.25 +#include "nsRange.h" 1.26 +#include "nsContentUtils.h" 1.27 +#include "nsIFrame.h" 1.28 +#include <algorithm> 1.29 + 1.30 +using namespace mozilla; 1.31 + 1.32 +// IsIgnorableCharacter 1.33 +// 1.34 +// These characters are ones that we should ignore in input. 1.35 + 1.36 +inline bool IsIgnorableCharacter(char16_t ch) 1.37 +{ 1.38 + return (ch == 0xAD || // SOFT HYPHEN 1.39 + ch == 0x1806); // MONGOLIAN TODO SOFT HYPHEN 1.40 +} 1.41 + 1.42 +// IsConditionalPunctuation 1.43 +// 1.44 +// Some characters (like apostrophes) require characters on each side to be 1.45 +// part of a word, and are otherwise punctuation. 
1.46 + 1.47 +inline bool IsConditionalPunctuation(char16_t ch) 1.48 +{ 1.49 + return (ch == '\'' || 1.50 + ch == 0x2019 || // RIGHT SINGLE QUOTATION MARK 1.51 + ch == 0x00B7); // MIDDLE DOT 1.52 +} 1.53 + 1.54 +// mozInlineSpellWordUtil::Init 1.55 + 1.56 +nsresult 1.57 +mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor) 1.58 +{ 1.59 + nsresult rv; 1.60 + 1.61 + // getting the editor can fail commonly because the editor was detached, so 1.62 + // don't assert 1.63 + nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv); 1.64 + if (NS_FAILED(rv)) 1.65 + return rv; 1.66 + 1.67 + nsCOMPtr<nsIDOMDocument> domDoc; 1.68 + rv = editor->GetDocument(getter_AddRefs(domDoc)); 1.69 + NS_ENSURE_SUCCESS(rv, rv); 1.70 + NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER); 1.71 + 1.72 + mDOMDocument = domDoc; 1.73 + mDocument = do_QueryInterface(domDoc); 1.74 + 1.75 + // Find the root node for the editor. For contenteditable we'll need something 1.76 + // cleverer here. 1.77 + nsCOMPtr<nsIDOMElement> rootElt; 1.78 + rv = editor->GetRootElement(getter_AddRefs(rootElt)); 1.79 + NS_ENSURE_SUCCESS(rv, rv); 1.80 + 1.81 + nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt); 1.82 + mRootNode = rootNode; 1.83 + NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!"); 1.84 + return NS_OK; 1.85 +} 1.86 + 1.87 +static inline bool 1.88 +IsTextNode(nsINode* aNode) 1.89 +{ 1.90 + return aNode->IsNodeOfType(nsINode::eTEXT); 1.91 +} 1.92 + 1.93 +typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure); 1.94 + 1.95 +// Find the next node in the DOM tree in preorder. 1.96 +// Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is 1.97 +// why we can't just use GetNextNode here, sadly. 
1.98 +static nsINode* 1.99 +FindNextNode(nsINode* aNode, nsINode* aRoot, 1.100 + OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) 1.101 +{ 1.102 + NS_PRECONDITION(aNode, "Null starting node?"); 1.103 + 1.104 + nsINode* next = aNode->GetFirstChild(); 1.105 + if (next) 1.106 + return next; 1.107 + 1.108 + // Don't look at siblings or otherwise outside of aRoot 1.109 + if (aNode == aRoot) 1.110 + return nullptr; 1.111 + 1.112 + next = aNode->GetNextSibling(); 1.113 + if (next) 1.114 + return next; 1.115 + 1.116 + // Go up 1.117 + for (;;) { 1.118 + if (aOnLeaveNode) { 1.119 + aOnLeaveNode(aNode, aClosure); 1.120 + } 1.121 + 1.122 + next = aNode->GetParent(); 1.123 + if (next == aRoot || ! next) 1.124 + return nullptr; 1.125 + aNode = next; 1.126 + 1.127 + next = aNode->GetNextSibling(); 1.128 + if (next) 1.129 + return next; 1.130 + } 1.131 +} 1.132 + 1.133 +// aNode is not a text node. Find the first text node starting at aNode/aOffset 1.134 +// in a preorder DOM traversal. 1.135 +static nsINode* 1.136 +FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot) 1.137 +{ 1.138 + NS_PRECONDITION(aNode, "Null starting node?"); 1.139 + NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node"); 1.140 + 1.141 + nsINode* checkNode; 1.142 + // Need to start at the aOffset'th child 1.143 + nsIContent* child = aNode->GetChildAt(aOffset); 1.144 + 1.145 + if (child) { 1.146 + checkNode = child; 1.147 + } else { 1.148 + // aOffset was beyond the end of the child list. 1.149 + // goto next node after the last descendant of aNode in 1.150 + // a preorder DOM traversal. 1.151 + checkNode = aNode->GetNextNonChildNode(aRoot); 1.152 + } 1.153 + 1.154 + while (checkNode && !IsTextNode(checkNode)) { 1.155 + checkNode = checkNode->GetNextNode(aRoot); 1.156 + } 1.157 + return checkNode; 1.158 +} 1.159 + 1.160 +// mozInlineSpellWordUtil::SetEnd 1.161 +// 1.162 +// We have two ranges "hard" and "soft". 
The hard boundary is simply 1.163 +// the scope of the root node. The soft boundary is that which is set 1.164 +// by the caller of this class by calling this function. If this function is 1.165 +// not called, the soft boundary is the same as the hard boundary. 1.166 +// 1.167 +// When we reach the soft boundary (mSoftEnd), we keep 1.168 +// going until we reach the end of a word. This allows the caller to set the 1.169 +// end of the range to anything, and we will always check whole multiples of 1.170 +// words. When we reach the hard boundary we stop no matter what. 1.171 +// 1.172 +// There is no beginning soft boundary. This is because we only go to the 1.173 +// previous node once, when finding the previous word boundary in 1.174 +// SetPosition(). You might think of the soft boundary as being this initial 1.175 +// position. 1.176 + 1.177 +nsresult 1.178 +mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset) 1.179 +{ 1.180 + NS_PRECONDITION(aEndNode, "Null end node?"); 1.181 + 1.182 + NS_ASSERTION(mRootNode, "Not initialized"); 1.183 + 1.184 + InvalidateWords(); 1.185 + 1.186 + if (!IsTextNode(aEndNode)) { 1.187 + // End at the start of the first text node after aEndNode/aEndOffset. 1.188 + aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode); 1.189 + aEndOffset = 0; 1.190 + } 1.191 + mSoftEnd = NodeOffset(aEndNode, aEndOffset); 1.192 + return NS_OK; 1.193 +} 1.194 + 1.195 +nsresult 1.196 +mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset) 1.197 +{ 1.198 + InvalidateWords(); 1.199 + 1.200 + if (!IsTextNode(aNode)) { 1.201 + // Start at the start of the first text node after aNode/aOffset. 
1.202 + aNode = FindNextTextNode(aNode, aOffset, mRootNode); 1.203 + aOffset = 0; 1.204 + } 1.205 + mSoftBegin = NodeOffset(aNode, aOffset); 1.206 + 1.207 + EnsureWords(); 1.208 + 1.209 + int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin); 1.210 + if (textOffset < 0) 1.211 + return NS_OK; 1.212 + mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true); 1.213 + return NS_OK; 1.214 +} 1.215 + 1.216 +void 1.217 +mozInlineSpellWordUtil::EnsureWords() 1.218 +{ 1.219 + if (mSoftTextValid) 1.220 + return; 1.221 + BuildSoftText(); 1.222 + BuildRealWords(); 1.223 + mSoftTextValid = true; 1.224 +} 1.225 + 1.226 +nsresult 1.227 +mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange) 1.228 +{ 1.229 + NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN); 1.230 + NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END); 1.231 + return MakeRange(begin, end, aRange); 1.232 +} 1.233 + 1.234 +// mozInlineSpellWordUtil::GetRangeForWord 1.235 + 1.236 +nsresult 1.237 +mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode, 1.238 + int32_t aWordOffset, 1.239 + nsRange** aRange) 1.240 +{ 1.241 + // Set our soft end and start 1.242 + nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode); 1.243 + NodeOffset pt = NodeOffset(wordNode, aWordOffset); 1.244 + 1.245 + InvalidateWords(); 1.246 + mSoftBegin = mSoftEnd = pt; 1.247 + EnsureWords(); 1.248 + 1.249 + int32_t offset = MapDOMPositionToSoftTextOffset(pt); 1.250 + if (offset < 0) 1.251 + return MakeRange(pt, pt, aRange); 1.252 + int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false); 1.253 + if (wordIndex < 0) 1.254 + return MakeRange(pt, pt, aRange); 1.255 + return MakeRangeForWord(mRealWords[wordIndex], aRange); 1.256 +} 1.257 + 1.258 +// This is to fix characters that the spellchecker may not like 1.259 +static void 1.260 +NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& 
aOutput) 1.261 +{ 1.262 + aOutput.Truncate(); 1.263 + for (int32_t i = 0; i < aLen; i++) { 1.264 + char16_t ch = aInput.CharAt(i + aPos); 1.265 + 1.266 + // remove ignorable characters from the word 1.267 + if (IsIgnorableCharacter(ch)) 1.268 + continue; 1.269 + 1.270 + // the spellchecker doesn't handle curly apostrophes in all languages 1.271 + if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK 1.272 + ch = '\''; 1.273 + } 1.274 + 1.275 + aOutput.Append(ch); 1.276 + } 1.277 +} 1.278 + 1.279 +// mozInlineSpellWordUtil::GetNextWord 1.280 +// 1.281 +// FIXME-optimization: we shouldn't have to generate a range every single 1.282 +// time. It would be better if the inline spellchecker didn't require a 1.283 +// range unless the word was misspelled. This may or may not be possible. 1.284 + 1.285 +nsresult 1.286 +mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange, 1.287 + bool* aSkipChecking) 1.288 +{ 1.289 +#ifdef DEBUG_SPELLCHECK 1.290 + printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex); 1.291 +#endif 1.292 + 1.293 + if (mNextWordIndex < 0 || 1.294 + mNextWordIndex >= int32_t(mRealWords.Length())) { 1.295 + mNextWordIndex = -1; 1.296 + *aRange = nullptr; 1.297 + *aSkipChecking = true; 1.298 + return NS_OK; 1.299 + } 1.300 + 1.301 + const RealWord& word = mRealWords[mNextWordIndex]; 1.302 + nsresult rv = MakeRangeForWord(word, aRange); 1.303 + NS_ENSURE_SUCCESS(rv, rv); 1.304 + ++mNextWordIndex; 1.305 + *aSkipChecking = !word.mCheckableWord; 1.306 + ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText); 1.307 + 1.308 +#ifdef DEBUG_SPELLCHECK 1.309 + printf("GetNextWord returning: %s (skip=%d)\n", 1.310 + NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking); 1.311 +#endif 1.312 + 1.313 + return NS_OK; 1.314 +} 1.315 + 1.316 +// mozInlineSpellWordUtil::MakeRange 1.317 +// 1.318 +// Convenience function for creating a range over the current document. 
1.319 + 1.320 +nsresult 1.321 +mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd, 1.322 + nsRange** aRange) 1.323 +{ 1.324 + NS_ENSURE_ARG_POINTER(aBegin.mNode); 1.325 + if (!mDOMDocument) 1.326 + return NS_ERROR_NOT_INITIALIZED; 1.327 + 1.328 + nsRefPtr<nsRange> range = new nsRange(aBegin.mNode); 1.329 + nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset, 1.330 + aEnd.mNode, aEnd.mOffset); 1.331 + NS_ENSURE_SUCCESS(rv, rv); 1.332 + range.forget(aRange); 1.333 + 1.334 + return NS_OK; 1.335 +} 1.336 + 1.337 +/*********** DOM text extraction ************/ 1.338 + 1.339 +// IsDOMWordSeparator 1.340 +// 1.341 +// Determines if the given character should be considered as a DOM Word 1.342 +// separator. Basically, this is whitespace, although it could also have 1.343 +// certain punctuation that we know ALWAYS breaks words. This is important. 1.344 +// For example, we can't have any punctuation that could appear in a URL 1.345 +// or email address in this, because those need to always fit into a single 1.346 +// DOM word. 1.347 + 1.348 +static bool 1.349 +IsDOMWordSeparator(char16_t ch) 1.350 +{ 1.351 + // simple spaces 1.352 + if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') 1.353 + return true; 1.354 + 1.355 + // complex spaces - check only if char isn't ASCII (uncommon) 1.356 + if (ch >= 0xA0 && 1.357 + (ch == 0x00A0 || // NO-BREAK SPACE 1.358 + ch == 0x2002 || // EN SPACE 1.359 + ch == 0x2003 || // EM SPACE 1.360 + ch == 0x2009 || // THIN SPACE 1.361 + ch == 0x3000)) // IDEOGRAPHIC SPACE 1.362 + return true; 1.363 + 1.364 + // otherwise not a space 1.365 + return false; 1.366 +} 1.367 + 1.368 +static inline bool 1.369 +IsBRElement(nsINode* aNode) 1.370 +{ 1.371 + return aNode->IsElement() && 1.372 + aNode->AsElement()->IsHTML(nsGkAtoms::br); 1.373 +} 1.374 + 1.375 +/** 1.376 + * Given a TextNode, checks to see if there's a DOM word separator before 1.377 + * aBeforeOffset within it. 
This function does not modify aSeparatorOffset when 1.378 + * it returns false. 1.379 + * 1.380 + * @param aNode the TextNode to check. 1.381 + * @param aBeforeOffset the offset in the TextNode before which we will search 1.382 + * for the DOM separator. You can pass INT32_MAX to search the entire 1.383 + * length of the string. 1.384 + * @param aSeparatorOffset will be set to the offset of the first separator it 1.385 + * encounters. Will not be written to if no separator is found. 1.386 + * @returns True if it found a separator. 1.387 + */ 1.388 +static bool 1.389 +TextNodeContainsDOMWordSeparator(nsINode* aNode, 1.390 + int32_t aBeforeOffset, 1.391 + int32_t* aSeparatorOffset) 1.392 +{ 1.393 + // aNode is actually an nsIContent, since it's eTEXT 1.394 + nsIContent* content = static_cast<nsIContent*>(aNode); 1.395 + const nsTextFragment* textFragment = content->GetText(); 1.396 + NS_ASSERTION(textFragment, "Where is our text?"); 1.397 + for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) { 1.398 + if (IsDOMWordSeparator(textFragment->CharAt(i))) { 1.399 + // Be greedy, find as many separators as we can 1.400 + for (int32_t j = i - 1; j >= 0; --j) { 1.401 + if (IsDOMWordSeparator(textFragment->CharAt(j))) { 1.402 + i = j; 1.403 + } else { 1.404 + break; 1.405 + } 1.406 + } 1.407 + *aSeparatorOffset = i; 1.408 + return true; 1.409 + } 1.410 + } 1.411 + return false; 1.412 +} 1.413 + 1.414 +/** 1.415 + * Check if there's a DOM word separator before aBeforeOffset in this node. 1.416 + * Always returns true if it's a BR element. 1.417 + * aSeparatorOffset is set to the index of the first character in the last 1.418 + * separator if any is found (0 for BR elements). 1.419 + * 1.420 + * This function does not modify aSeparatorOffset when it returns false. 
*/
static bool
ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
                         int32_t* aSeparatorOffset)
{
  // A <br> counts as a separator located at offset 0.
  if (IsBRElement(aNode)) {
    *aSeparatorOffset = 0;
    return true;
  }

  // Only text nodes can contain separator characters.
  if (!IsTextNode(aNode))
    return false;

  return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset,
                                          aSeparatorOffset);
}

/**
 * Decides whether aNode is a "break element", i.e. an element that words are
 * not allowed to span.
 *
 * Returns false for non-element nodes and for elements without a primary
 * frame. Returns true for HTML <br> elements and for any element whose
 * primary frame's display style is not NS_STYLE_DISPLAY_INLINE.
 */
static bool
IsBreakElement(nsINode* aNode)
{
  if (!aNode->IsElement()) {
    return false;
  }

  dom::Element *element = aNode->AsElement();

  if (element->IsHTML(nsGkAtoms::br))
    return true;

  // If we don't have a frame, we don't consider ourselves a break
  // element. In particular, words can span us.
  if (!element->GetPrimaryFrame())
    return false;

  // Anything that's not an inline element is a break element.
  // XXXbz should replaced inlines be break elements, though?
  return element->GetPrimaryFrame()->StyleDisplay()->mDisplay !=
    NS_STYLE_DISPLAY_INLINE;
}

// Closure handed (as void*) to CheckLeavingBreakElement. mLeftBreakElement
// is set to true once a break element has been reported as left and is never
// reset by the callback.
struct CheckLeavingBreakElementClosure {
  bool mLeftBreakElement;
};

// Callback with the OnLeaveNodeFunPtr signature: records in the closure that
// the node being left is a break element. aClosure must point to a
// CheckLeavingBreakElementClosure.
static void
CheckLeavingBreakElement(nsINode* aNode, void* aClosure)
{
  CheckLeavingBreakElementClosure* cl =
    static_cast<CheckLeavingBreakElementClosure*>(aClosure);
  // Skip the IsBreakElement check once the flag is already set.
  if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
    cl->mLeftBreakElement = true;
  }
}

// Normalizes aWord in place by running the file-static ::NormalizeWord over
// the whole string and assigning the result back to aWord.
void
mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
{
  nsAutoString result;
  ::NormalizeWord(aWord, 0, aWord.Length(), result);
  aWord = result;
}

// mozInlineSpellWordUtil::BuildSoftText
//
// Rebuilds mSoftText and mSoftTextDOMMapping from the DOM in two passes:
// a backward pass that looks for the place to start from, and a forward
// pass that appends text node contents (recording one DOMTextMapping per
// contributing text node) until a stopping point after the soft end.
// May move mSoftBegin (node and offset) during the backward pass.
void
mozInlineSpellWordUtil::BuildSoftText()
{
  // First we have to work backwards from mSoftBegin to find a text node
  // containing a DOM word separator, a non-inline-element
  // boundary, or the hard start node. That's where we'll start building the
  // soft string from.
  nsINode* node = mSoftBegin.mNode;
  int32_t firstOffsetInNode = 0;
  int32_t checkBeforeOffset = mSoftBegin.mOffset;
  while (node) {
    if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
      if (node == mSoftBegin.mNode) {
        // If we find a word separator on the first node, look at the preceding
        // word on the text node as well.
        int32_t newOffset = 0;
        if (firstOffsetInNode > 0) {
          // Try to find the previous word boundary in the current node. If
          // we can't find one, start checking previous sibling nodes (if any
          // adjacent ones exist) to see if we can find any text nodes with
          // DOM word separators. We bail out as soon as we see a node that is
          // not a text node, or we run out of previous sibling nodes. In the
          // event that we simply cannot find any preceding word separator, the
          // offset is set to 0, and the soft text beginning node is set to the
          // "most previous" text node before the original starting node, or
          // kept at the original starting node if no previous text nodes exist.
          if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
                                        &newOffset)) {
            // NOTE(review): this loop moves mSoftBegin.mNode to a previous
            // sibling but leaves the local 'node' unchanged, while newOffset
            // (an offset found in the sibling) is later stored into
            // firstOffsetInNode, which the forward pass applies to 'node'.
            // Confirm this is intended.
            nsINode* prevNode = node->GetPreviousSibling();
            while (prevNode && IsTextNode(prevNode)) {
              mSoftBegin.mNode = prevNode;
              if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,
                                                   &newOffset)) {
                break;
              }
              prevNode = prevNode->GetPreviousSibling();
            }
          }
        }
        firstOffsetInNode = newOffset;
        mSoftBegin.mOffset = newOffset;
      }
      break;
    }
    // Nodes before the starting node are searched over their whole length.
    checkBeforeOffset = INT32_MAX;
    if (IsBreakElement(node)) {
      // Since GetPreviousContent follows tree *preorder*, we're about to traverse
      // up out of 'node'. Since node induces breaks (e.g., it's a block),
      // don't bother trying to look outside it, just stop now.
      break;
    }
    // GetPreviousContent below expects mRootNode to be an ancestor of node.
    if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {
      break;
    }
    node = node->GetPreviousContent(mRootNode);
  }

  // Now build up the string moving forward through the DOM until we reach
  // the soft end and *then* see a DOM word separator, a non-inline-element
  // boundary, or the hard end node.
  mSoftText.Truncate();
  mSoftTextDOMMapping.Clear();
  bool seenSoftEnd = false;
  // Leave this outside the loop so large heap string allocations can be reused
  // across iterations
  // NOTE(review): no declaration follows the comment above in this function;
  // it looks stale.
  while (node) {
    if (node == mSoftEnd.mNode) {
      seenSoftEnd = true;
    }

    // Set when this node is the last one to contribute text.
    bool exit = false;
    if (IsTextNode(node)) {
      nsIContent* content = static_cast<nsIContent*>(node);
      NS_ASSERTION(content, "Where is our content?");
      const nsTextFragment* textFragment = content->GetText();
      NS_ASSERTION(textFragment, "Where is our text?");
      int32_t lastOffsetInNode = textFragment->GetLength();

      if (seenSoftEnd) {
        // check whether we can stop after this; in the soft-end node itself
        // the scan starts at the soft-end offset, otherwise at 0
        for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
             i < int32_t(textFragment->GetLength()); ++i) {
          if (IsDOMWordSeparator(textFragment->CharAt(i))) {
            exit = true;
            // stop at the first separator after the soft end point
            lastOffsetInNode = i;
            break;
          }
        }
      }

      if (firstOffsetInNode < lastOffsetInNode) {
        int32_t len = lastOffsetInNode - firstOffsetInNode;
        // Record where this node's text lands in mSoftText before appending.
        mSoftTextDOMMapping.AppendElement(
          DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));

        bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,
                                         mozilla::fallible_t());
        if (!ok) {
          // probably out of memory, remove from mSoftTextDOMMapping
          mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1);
          exit = true;
        }
      }

      // Every later text node contributes from its beginning.
      firstOffsetInNode = 0;
    }

    if (exit)
      break;

    CheckLeavingBreakElementClosure closure = { false };
    node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
    if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
      // We left, or are entering, a break element (e.g., block). Maybe we can
      // stop now.
      if (seenSoftEnd)
        break;
      // Record the break
      mSoftText.Append(' ');
    }
  }

#ifdef DEBUG_SPELLCHECK
  printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
#endif
}

// mozInlineSpellWordUtil::BuildRealWords
//
// Clears mRealWords and re-tokenizes mSoftText: every maximal run of
// characters that are not DOM word separators is passed to SplitDOMWord as
// the half-open range [wordStart, end).
void
mozInlineSpellWordUtil::BuildRealWords()
{
  // This is pretty simple. We just have to walk mSoftText, tokenizing it
  // into "real words".
  // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
  // SplitDOMWord on each of those DOM words
  int32_t wordStart = -1;  // -1 while we are not inside a DOM word
  mRealWords.Clear();
  for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) {
    if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
      if (wordStart >= 0) {
        SplitDOMWord(wordStart, i);
        wordStart = -1;
      }
    } else {
      if (wordStart < 0) {
        wordStart = i;
      }
    }
  }
  // Flush a DOM word that runs to the end of the soft text.
  if (wordStart >= 0) {
    SplitDOMWord(wordStart, mSoftText.Length());
  }
}

/*********** DOM/realwords<->mSoftText mapping functions ************/

// Maps a DOM position to an offset into mSoftText.
//
// Scans mSoftTextDOMMapping linearly and looks only at the first entry whose
// node equals aNodeOffset.mNode. Returns -1 when the soft text is not valid,
// when no entry has that node, or when the offset lies outside the part of
// the node that was contributed to the soft text.
int32_t
mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
{
  if (!mSoftTextValid) {
    NS_ERROR("Soft text must be valid if we're to map into it");
    return -1;
  }

  for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) {
    const DOMTextMapping& map = mSoftTextDOMMapping[i];
    if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
      // Allow offsets at either end of the string, in particular, allow the
      // offset that's at the end of the contributed string
      int32_t offsetInContributedString =
        aNodeOffset.mOffset - map.mNodeOffset.mOffset;
      if (offsetInContributedString >= 0 &&
          offsetInContributedString <= map.mLength)
        return map.mSoftTextOffset + offsetInContributedString;
      return -1;
    }
  }
  return -1;
}

// Maps an offset into mSoftText back to a DOM position.
//
// Binary-searches mSoftTextDOMMapping for the last entry that starts at or
// before aSoftTextOffset. With HINT_END, an offset that coincides with the
// end of the preceding entry is reported as the end of that entry.
// Returns NodeOffset(nullptr, -1) when the soft text is not valid, when there
// are no mappings, or when the offset falls outside the entry that was found.
mozInlineSpellWordUtil::NodeOffset
mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,
                                                       DOMMapHint aHint)
{
  NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
  if (!mSoftTextValid)
    return NodeOffset(nullptr, -1);

  // The invariant is that the range start..end includes the last mapping,
  // if any, such that mSoftTextOffset <= aSoftTextOffset
  int32_t start = 0;
  int32_t end = mSoftTextDOMMapping.Length();
  while (end - start >= 2) {
    int32_t mid = (start + end)/2;
    const DOMTextMapping& map = mSoftTextDOMMapping[mid];
    if (map.mSoftTextOffset > aSoftTextOffset) {
      end = mid;
    } else {
      start = mid;
    }
  }

  // Only true when there are no mappings at all.
  if (start >= end)
    return NodeOffset(nullptr, -1);

  // 'start' is now the last mapping, if any, such that
  // mSoftTextOffset <= aSoftTextOffset.
  // If we're doing HINT_END, then we may want to return the end of the
  // the previous mapping instead of the start of this mapping
  if (aHint == HINT_END && start > 0) {
    const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];
    if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
      return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
  }

  // We allow ourselves to return the end of this mapping even if we're
  // doing HINT_BEGIN. This will only happen if there is no mapping which this
  // point is the start of. I'm not 100% sure this is OK...
  const DOMTextMapping& map = mSoftTextDOMMapping[start];
  int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
  if (offset >= 0 && offset <= map.mLength)
    return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);

  return NodeOffset(nullptr, -1);
}

// Finds the index in mRealWords of the word for aSoftTextOffset.
//
// Binary-searches for the last word that starts at or before the offset.
// With HINT_END, an offset equal to the end of the preceding word selects
// that preceding word. If the offset lies within the found word (either end
// included) that word's index is returned. Otherwise, when aSearchForward is
// true, the index of the first word starting after the offset is returned if
// there is one. Returns -1 when the soft text is not valid, when there are
// no words, or when nothing matched.
int32_t
mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset,
                                               DOMMapHint aHint, bool aSearchForward)
{
  NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
  if (!mSoftTextValid)
    return -1;

  // The invariant is that the range start..end includes the last word,
  // if any, such that mSoftTextOffset <= aSoftTextOffset
  int32_t start = 0;
  int32_t end = mRealWords.Length();
  while (end - start >= 2) {
    int32_t mid = (start + end)/2;
    const RealWord& word = mRealWords[mid];
    if (word.mSoftTextOffset > aSoftTextOffset) {
      end = mid;
    } else {
      start = mid;
    }
  }

  // Only true when there are no words at all.
  if (start >= end)
    return -1;

  // 'start' is now the last word, if any, such that
  // mSoftTextOffset <= aSoftTextOffset.
  // If we're doing HINT_END, then we may want to return the end of the
  // the previous word instead of the start of this word
  if (aHint == HINT_END && start > 0) {
    const RealWord& word = mRealWords[start - 1];
    if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
      return start - 1;
  }

  // We allow ourselves to return the end of this word even if we're
  // doing HINT_BEGIN. This will only happen if there is no word which this
  // point is the start of. I'm not 100% sure this is OK...
  const RealWord& word = mRealWords[start];
  int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
  if (offset >= 0 && offset <= word.mLength)
    return start;

  if (aSearchForward) {
    if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
      // All words have mSoftTextOffset > aSoftTextOffset
      return 0;
    }
    // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
    // Word start+1, if it exists, will be the first with
    // mSoftTextOffset > aSoftTextOffset.
    if (start + 1 < int32_t(mRealWords.Length()))
      return start + 1;
  }

  return -1;
}

/*********** Word Splitting ************/

// classifies a given character in the DOM word
enum CharClass {
  CHAR_CLASS_WORD,
  CHAR_CLASS_SEPARATOR,
  CHAR_CLASS_END_OF_INPUT };

// Encapsulates DOM-word to real-word splitting
struct MOZ_STACK_CLASS WordSplitState
{
  mozInlineSpellWordUtil* mWordUtil;
  // The DOM word being split: the (aStart, aLen) substring of the string
  // passed to the constructor.
  const nsDependentSubstring mDOMWordText;
  // Current position within mDOMWordText; starts at 0.
  int32_t mDOMWordOffset;
  // Class of the character at mDOMWordOffset as maintained by Advance();
  // initialized to CHAR_CLASS_END_OF_INPUT by the constructor.
  CharClass mCurCharClass;

  WordSplitState(mozInlineSpellWordUtil* aWordUtil,
                 const nsString& aString, int32_t aStart, int32_t aLen)
    : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
      mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}

  // Classifies the character at aIndex. aRecurse controls whether the
  // neighbours of a conditional-punctuation character may be inspected.
  CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
  // Moves to the next character and refreshes mCurCharClass.
  void Advance();
  // Advances while the current character is a separator.
  void AdvanceThroughSeparators();
  // Advances while the current character is a word character.
  void AdvanceThroughWord();

  // Reports whether a special word like an email address or URL may start at
  // the current position (the return type is bool, not a length). This allows
  // arbitrary word breaking rules to be used for these special entities, as
  // long as they can not contain whitespace.
  bool IsSpecialWord();

  // Similar to IsSpecialWord except that this takes a split word as
  // input. This checks for things that do not require special word-breaking
  // rules.
  bool ShouldSkipWord(int32_t aStart, int32_t aLength);
};

// WordSplitState::ClassifyCharacter
//
// Returns CHAR_CLASS_WORD or CHAR_CLASS_SEPARATOR for the character at
// aIndex in mDOMWordText. An index equal to the text length is classified
// as a separator. When aRecurse is false, conditional punctuation is always
// a separator; when true, it is a word character only if the characters on
// both sides are non-dot word characters. The recursive calls made here
// always pass aRecurse == false.

CharClass
WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
{
  NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
               "Index out of range");
  if (aIndex == int32_t(mDOMWordText.Length()))
    return CHAR_CLASS_SEPARATOR;

  // this will classify the character, we want to treat "ignorable" characters
  // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
  nsIUGenCategory::nsUGenCategory
    charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
  if (charCategory == nsIUGenCategory::kLetter ||
      IsIgnorableCharacter(mDOMWordText[aIndex]) ||
      mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
      mDOMWordText[aIndex] == 0x200D /* ZWJ */)
    return CHAR_CLASS_WORD;

  // If conditional punctuation is surrounded immediately on both sides by word
  // characters it also counts as a word character.
  if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
    if (!aRecurse) {
      // not allowed to look around, this punctuation counts like a separator
      return CHAR_CLASS_SEPARATOR;
    }

    // check the left-hand character
    if (aIndex == 0)
      return CHAR_CLASS_SEPARATOR;
    if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
      return CHAR_CLASS_SEPARATOR;
    // If the previous character is a word-char, make sure that it's not a
    // special dot character.
    if (mDOMWordText[aIndex - 1] == '.')
      return CHAR_CLASS_SEPARATOR;

    // now we know left char is a word-char, check the right-hand character
    if (aIndex == int32_t(mDOMWordText.Length()) - 1)
      return CHAR_CLASS_SEPARATOR;
    if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
      return CHAR_CLASS_SEPARATOR;
    // If the next character is a word-char, make sure that it's not a
    // special dot character.
    if (mDOMWordText[aIndex + 1] == '.')
      return CHAR_CLASS_SEPARATOR;

    // char on either side is a word, this counts as a word
    return CHAR_CLASS_WORD;
  }

  // The dot character, if appearing at the end of a word, should
  // be considered part of that word. Example: "etc.", or
  // abbreviations
  // NOTE(review): the last condition below requires the previous character
  // to NOT be a word character, which reads as the opposite of the comment
  // above - confirm the intended behavior before changing it.
  if (aIndex > 0 &&
      mDOMWordText[aIndex] == '.' &&
      mDOMWordText[aIndex - 1] != '.' &&
      ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
    return CHAR_CLASS_WORD;
  }

  // all other punctuation
  if (charCategory == nsIUGenCategory::kSeparator ||
      charCategory == nsIUGenCategory::kOther ||
      charCategory == nsIUGenCategory::kPunctuation ||
      charCategory == nsIUGenCategory::kSymbol) {
    // Don't break on hyphens, as hunspell handles them on its own.
    if (aIndex > 0 &&
        mDOMWordText[aIndex] == '-' &&
        mDOMWordText[aIndex - 1] != '-' &&
        ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
      // A hyphen is only meaningful as a separator inside a word
      // if the previous and next characters are a word character.
      if (aIndex == int32_t(mDOMWordText.Length()) - 1)
        return CHAR_CLASS_SEPARATOR;
      if (mDOMWordText[aIndex + 1] != '.' &&
          ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
        return CHAR_CLASS_WORD;
    }
    return CHAR_CLASS_SEPARATOR;
  }

  // any other character counts as a word
  return CHAR_CLASS_WORD;
}


// WordSplitState::Advance
//
// Steps mDOMWordOffset forward by one and updates mCurCharClass:
// CHAR_CLASS_END_OF_INPUT once the offset reaches the end of the DOM word,
// otherwise the class of the new current character (look-around allowed).

void
WordSplitState::Advance()
{
  NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
  NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
               "Length beyond end");

  mDOMWordOffset ++;
  if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
    mCurCharClass = CHAR_CLASS_END_OF_INPUT;
  else
    mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
}


// WordSplitState::AdvanceThroughSeparators
//
// Advances until the current character class is no longer
// CHAR_CLASS_SEPARATOR.

void
WordSplitState::AdvanceThroughSeparators()
{
  while (mCurCharClass == CHAR_CLASS_SEPARATOR)
    Advance();
}

// WordSplitState::AdvanceThroughWord
//
// Advances until the current character class is no longer CHAR_CLASS_WORD.

void
WordSplitState::AdvanceThroughWord()
{
  while (mCurCharClass == CHAR_CLASS_WORD)
    Advance();
}


// WordSplitState::IsSpecialWord

bool
WordSplitState::IsSpecialWord()
{
  // Search for email addresses. We simply define these as any sequence of
  // characters with an '@' character in the middle. The DOM word is already
  // split on whitepace, so we know that everything to the end is the address
  int32_t firstColon = -1;
  for (int32_t i = mDOMWordOffset;
       i < int32_t(mDOMWordText.Length()); i ++) {
    if (mDOMWordText[i] == '@') {
      // only accept this if there are unambiguous word characters (don't bother
      // recursing to disambiguate apostrophes) on each side. This prevents
      // classifying, e.g. "@home" as an email address

      // Use this condition to only accept words with '@' in the middle of
      // them. It works, but the inlinespellcker doesn't like this. The problem
      // is that you type "fhsgfh@" that's a misspelled word followed by a
      // symbol, but when you type another letter "fhsgfh@g" that first word
      // need to be unmarked misspelled. It doesn't do this. it only checks the
      // current position for potentially removing a spelling range.
      if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
          i < (int32_t)mDOMWordText.Length() - 1 &&
          ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
        return true;
      }
    } else if (mDOMWordText[i] == ':' && firstColon < 0) {
      firstColon = i;

      // If the first colon is followed by a slash, consider it a URL
      // This will catch things like asdf://foo.com
      if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
          mDOMWordText[firstColon + 1] == '/') {
        return true;
      }
    }
  }

  // Check the text before the first colon against some known protocols. It
  // is impossible to check against all protocols, especially since you can
  // plug in new protocols. We also don't want to waste time here checking
  // against a lot of obscure protocols.
1.973 + if (firstColon > mDOMWordOffset) { 1.974 + nsString protocol(Substring(mDOMWordText, mDOMWordOffset, 1.975 + firstColon - mDOMWordOffset)); 1.976 + if (protocol.EqualsIgnoreCase("http") || 1.977 + protocol.EqualsIgnoreCase("https") || 1.978 + protocol.EqualsIgnoreCase("news") || 1.979 + protocol.EqualsIgnoreCase("file") || 1.980 + protocol.EqualsIgnoreCase("javascript") || 1.981 + protocol.EqualsIgnoreCase("data") || 1.982 + protocol.EqualsIgnoreCase("ftp")) { 1.983 + return true; 1.984 + } 1.985 + } 1.986 + 1.987 + // not anything special 1.988 + return false; 1.989 +} 1.990 + 1.991 +// WordSplitState::ShouldSkipWord 1.992 + 1.993 +bool 1.994 +WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength) 1.995 +{ 1.996 + int32_t last = aStart + aLength; 1.997 + 1.998 + // check to see if the word contains a digit 1.999 + for (int32_t i = aStart; i < last; i ++) { 1.1000 + if (unicode::GetGenCategory(mDOMWordText[i]) == nsIUGenCategory::kNumber) { 1.1001 + return true; 1.1002 + } 1.1003 + } 1.1004 + 1.1005 + // not special 1.1006 + return false; 1.1007 +} 1.1008 + 1.1009 +// mozInlineSpellWordUtil::SplitDOMWord 1.1010 + 1.1011 +void 1.1012 +mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd) 1.1013 +{ 1.1014 + WordSplitState state(this, mSoftText, aStart, aEnd - aStart); 1.1015 + state.mCurCharClass = state.ClassifyCharacter(0, true); 1.1016 + 1.1017 + state.AdvanceThroughSeparators(); 1.1018 + if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && 1.1019 + state.IsSpecialWord()) { 1.1020 + int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset; 1.1021 + mRealWords.AppendElement( 1.1022 + RealWord(aStart + state.mDOMWordOffset, specialWordLength, false)); 1.1023 + 1.1024 + return; 1.1025 + } 1.1026 + 1.1027 + while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) { 1.1028 + state.AdvanceThroughSeparators(); 1.1029 + if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) 1.1030 + break; 1.1031 + 1.1032 + // save the 
beginning of the word 1.1033 + int32_t wordOffset = state.mDOMWordOffset; 1.1034 + 1.1035 + // find the end of the word 1.1036 + state.AdvanceThroughWord(); 1.1037 + int32_t wordLen = state.mDOMWordOffset - wordOffset; 1.1038 + mRealWords.AppendElement( 1.1039 + RealWord(aStart + wordOffset, wordLen, 1.1040 + !state.ShouldSkipWord(wordOffset, wordLen))); 1.1041 + } 1.1042 +}