michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "mozInlineSpellWordUtil.h" michael@0: #include "nsDebug.h" michael@0: #include "nsIAtom.h" michael@0: #include "nsComponentManagerUtils.h" michael@0: #include "nsIDOMCSSStyleDeclaration.h" michael@0: #include "nsIDOMElement.h" michael@0: #include "nsIDOMRange.h" michael@0: #include "nsIEditor.h" michael@0: #include "nsIDOMNode.h" michael@0: #include "nsIDOMHTMLBRElement.h" michael@0: #include "nsUnicharUtilCIID.h" michael@0: #include "nsUnicodeProperties.h" michael@0: #include "nsServiceManagerUtils.h" michael@0: #include "nsIContent.h" michael@0: #include "nsTextFragment.h" michael@0: #include "mozilla/dom/Element.h" michael@0: #include "nsRange.h" michael@0: #include "nsContentUtils.h" michael@0: #include "nsIFrame.h" michael@0: #include michael@0: michael@0: using namespace mozilla; michael@0: michael@0: // IsIgnorableCharacter michael@0: // michael@0: // These characters are ones that we should ignore in input. michael@0: michael@0: inline bool IsIgnorableCharacter(char16_t ch) michael@0: { michael@0: return (ch == 0xAD || // SOFT HYPHEN michael@0: ch == 0x1806); // MONGOLIAN TODO SOFT HYPHEN michael@0: } michael@0: michael@0: // IsConditionalPunctuation michael@0: // michael@0: // Some characters (like apostrophes) require characters on each side to be michael@0: // part of a word, and are otherwise punctuation. michael@0: michael@0: inline bool IsConditionalPunctuation(char16_t ch) michael@0: { michael@0: return (ch == '\'' || michael@0: ch == 0x2019 || // RIGHT SINGLE QUOTATION MARK michael@0: ch == 0x00B7); // MIDDLE DOT michael@0: } michael@0: michael@0: // mozInlineSpellWordUtil::Init michael@0: michael@0: nsresult michael@0: mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor) michael@0: { michael@0: nsresult rv; michael@0: michael@0: // getting the editor can fail commonly because the editor was detached, so michael@0: // don't assert michael@0: nsCOMPtr editor = do_QueryReferent(aWeakEditor, &rv); michael@0: if (NS_FAILED(rv)) michael@0: return rv; michael@0: michael@0: nsCOMPtr domDoc; michael@0: rv = editor->GetDocument(getter_AddRefs(domDoc)); michael@0: NS_ENSURE_SUCCESS(rv, rv); michael@0: NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER); michael@0: michael@0: mDOMDocument = domDoc; michael@0: mDocument = do_QueryInterface(domDoc); michael@0: michael@0: // Find the root node for the editor. For contenteditable we'll need something michael@0: // cleverer here. michael@0: nsCOMPtr rootElt; michael@0: rv = editor->GetRootElement(getter_AddRefs(rootElt)); michael@0: NS_ENSURE_SUCCESS(rv, rv); michael@0: michael@0: nsCOMPtr rootNode = do_QueryInterface(rootElt); michael@0: mRootNode = rootNode; michael@0: NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!"); michael@0: return NS_OK; michael@0: } michael@0: michael@0: static inline bool michael@0: IsTextNode(nsINode* aNode) michael@0: { michael@0: return aNode->IsNodeOfType(nsINode::eTEXT); michael@0: } michael@0: michael@0: typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure); michael@0: michael@0: // Find the next node in the DOM tree in preorder. michael@0: // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is michael@0: // why we can't just use GetNextNode here, sadly. michael@0: static nsINode* michael@0: FindNextNode(nsINode* aNode, nsINode* aRoot, michael@0: OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) michael@0: { michael@0: NS_PRECONDITION(aNode, "Null starting node?"); michael@0: michael@0: nsINode* next = aNode->GetFirstChild(); michael@0: if (next) michael@0: return next; michael@0: michael@0: // Don't look at siblings or otherwise outside of aRoot michael@0: if (aNode == aRoot) michael@0: return nullptr; michael@0: michael@0: next = aNode->GetNextSibling(); michael@0: if (next) michael@0: return next; michael@0: michael@0: // Go up michael@0: for (;;) { michael@0: if (aOnLeaveNode) { michael@0: aOnLeaveNode(aNode, aClosure); michael@0: } michael@0: michael@0: next = aNode->GetParent(); michael@0: if (next == aRoot || ! next) michael@0: return nullptr; michael@0: aNode = next; michael@0: michael@0: next = aNode->GetNextSibling(); michael@0: if (next) michael@0: return next; michael@0: } michael@0: } michael@0: michael@0: // aNode is not a text node. Find the first text node starting at aNode/aOffset michael@0: // in a preorder DOM traversal. michael@0: static nsINode* michael@0: FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot) michael@0: { michael@0: NS_PRECONDITION(aNode, "Null starting node?"); michael@0: NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node"); michael@0: michael@0: nsINode* checkNode; michael@0: // Need to start at the aOffset'th child michael@0: nsIContent* child = aNode->GetChildAt(aOffset); michael@0: michael@0: if (child) { michael@0: checkNode = child; michael@0: } else { michael@0: // aOffset was beyond the end of the child list. michael@0: // goto next node after the last descendant of aNode in michael@0: // a preorder DOM traversal. michael@0: checkNode = aNode->GetNextNonChildNode(aRoot); michael@0: } michael@0: michael@0: while (checkNode && !IsTextNode(checkNode)) { michael@0: checkNode = checkNode->GetNextNode(aRoot); michael@0: } michael@0: return checkNode; michael@0: } michael@0: michael@0: // mozInlineSpellWordUtil::SetEnd michael@0: // michael@0: // We have two ranges "hard" and "soft". The hard boundary is simply michael@0: // the scope of the root node. The soft boundary is that which is set michael@0: // by the caller of this class by calling this function. If this function is michael@0: // not called, the soft boundary is the same as the hard boundary. michael@0: // michael@0: // When we reach the soft boundary (mSoftEnd), we keep michael@0: // going until we reach the end of a word. This allows the caller to set the michael@0: // end of the range to anything, and we will always check whole multiples of michael@0: // words. When we reach the hard boundary we stop no matter what. michael@0: // michael@0: // There is no beginning soft boundary. This is because we only go to the michael@0: // previous node once, when finding the previous word boundary in michael@0: // SetPosition(). You might think of the soft boundary as being this initial michael@0: // position. michael@0: michael@0: nsresult michael@0: mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset) michael@0: { michael@0: NS_PRECONDITION(aEndNode, "Null end node?"); michael@0: michael@0: NS_ASSERTION(mRootNode, "Not initialized"); michael@0: michael@0: InvalidateWords(); michael@0: michael@0: if (!IsTextNode(aEndNode)) { michael@0: // End at the start of the first text node after aEndNode/aEndOffset. michael@0: aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode); michael@0: aEndOffset = 0; michael@0: } michael@0: mSoftEnd = NodeOffset(aEndNode, aEndOffset); michael@0: return NS_OK; michael@0: } michael@0: michael@0: nsresult michael@0: mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset) michael@0: { michael@0: InvalidateWords(); michael@0: michael@0: if (!IsTextNode(aNode)) { michael@0: // Start at the start of the first text node after aNode/aOffset. michael@0: aNode = FindNextTextNode(aNode, aOffset, mRootNode); michael@0: aOffset = 0; michael@0: } michael@0: mSoftBegin = NodeOffset(aNode, aOffset); michael@0: michael@0: EnsureWords(); michael@0: michael@0: int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin); michael@0: if (textOffset < 0) michael@0: return NS_OK; michael@0: mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true); michael@0: return NS_OK; michael@0: } michael@0: michael@0: void michael@0: mozInlineSpellWordUtil::EnsureWords() michael@0: { michael@0: if (mSoftTextValid) michael@0: return; michael@0: BuildSoftText(); michael@0: BuildRealWords(); michael@0: mSoftTextValid = true; michael@0: } michael@0: michael@0: nsresult michael@0: mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange) michael@0: { michael@0: NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN); michael@0: NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END); michael@0: return MakeRange(begin, end, aRange); michael@0: } michael@0: michael@0: // mozInlineSpellWordUtil::GetRangeForWord michael@0: michael@0: nsresult michael@0: mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode, michael@0: int32_t aWordOffset, michael@0: nsRange** aRange) michael@0: { michael@0: // Set our soft end and start michael@0: nsCOMPtr wordNode = do_QueryInterface(aWordNode); michael@0: NodeOffset pt = NodeOffset(wordNode, aWordOffset); michael@0: michael@0: InvalidateWords(); michael@0: mSoftBegin = mSoftEnd = pt; michael@0: EnsureWords(); michael@0: michael@0: int32_t offset = MapDOMPositionToSoftTextOffset(pt); michael@0: if (offset < 0) michael@0: return MakeRange(pt, pt, aRange); michael@0: int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false); michael@0: if (wordIndex < 0) michael@0: return MakeRange(pt, pt, aRange); michael@0: return MakeRangeForWord(mRealWords[wordIndex], aRange); michael@0: } michael@0: michael@0: // This is to fix characters that the spellchecker may not like michael@0: static void michael@0: NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput) michael@0: { michael@0: aOutput.Truncate(); michael@0: for (int32_t i = 0; i < aLen; i++) { michael@0: char16_t ch = aInput.CharAt(i + aPos); michael@0: michael@0: // remove ignorable characters from the word michael@0: if (IsIgnorableCharacter(ch)) michael@0: continue; michael@0: michael@0: // the spellchecker doesn't handle curly apostrophes in all languages michael@0: if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK michael@0: ch = '\''; michael@0: } michael@0: michael@0: aOutput.Append(ch); michael@0: } michael@0: } michael@0: michael@0: // mozInlineSpellWordUtil::GetNextWord michael@0: // michael@0: // FIXME-optimization: we shouldn't have to generate a range every single michael@0: // time. It would be better if the inline spellchecker didn't require a michael@0: // range unless the word was misspelled. This may or may not be possible. michael@0: michael@0: nsresult michael@0: mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange, michael@0: bool* aSkipChecking) michael@0: { michael@0: #ifdef DEBUG_SPELLCHECK michael@0: printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex); michael@0: #endif michael@0: michael@0: if (mNextWordIndex < 0 || michael@0: mNextWordIndex >= int32_t(mRealWords.Length())) { michael@0: mNextWordIndex = -1; michael@0: *aRange = nullptr; michael@0: *aSkipChecking = true; michael@0: return NS_OK; michael@0: } michael@0: michael@0: const RealWord& word = mRealWords[mNextWordIndex]; michael@0: nsresult rv = MakeRangeForWord(word, aRange); michael@0: NS_ENSURE_SUCCESS(rv, rv); michael@0: ++mNextWordIndex; michael@0: *aSkipChecking = !word.mCheckableWord; michael@0: ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText); michael@0: michael@0: #ifdef DEBUG_SPELLCHECK michael@0: printf("GetNextWord returning: %s (skip=%d)\n", michael@0: NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking); michael@0: #endif michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: // mozInlineSpellWordUtil::MakeRange michael@0: // michael@0: // Convenience function for creating a range over the current document. michael@0: michael@0: nsresult michael@0: mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd, michael@0: nsRange** aRange) michael@0: { michael@0: NS_ENSURE_ARG_POINTER(aBegin.mNode); michael@0: if (!mDOMDocument) michael@0: return NS_ERROR_NOT_INITIALIZED; michael@0: michael@0: nsRefPtr range = new nsRange(aBegin.mNode); michael@0: nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset, michael@0: aEnd.mNode, aEnd.mOffset); michael@0: NS_ENSURE_SUCCESS(rv, rv); michael@0: range.forget(aRange); michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: /*********** DOM text extraction ************/ michael@0: michael@0: // IsDOMWordSeparator michael@0: // michael@0: // Determines if the given character should be considered as a DOM Word michael@0: // separator. Basically, this is whitespace, although it could also have michael@0: // certain punctuation that we know ALWAYS breaks words. This is important. michael@0: // For example, we can't have any punctuation that could appear in a URL michael@0: // or email address in this, because those need to always fit into a single michael@0: // DOM word. michael@0: michael@0: static bool michael@0: IsDOMWordSeparator(char16_t ch) michael@0: { michael@0: // simple spaces michael@0: if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') michael@0: return true; michael@0: michael@0: // complex spaces - check only if char isn't ASCII (uncommon) michael@0: if (ch >= 0xA0 && michael@0: (ch == 0x00A0 || // NO-BREAK SPACE michael@0: ch == 0x2002 || // EN SPACE michael@0: ch == 0x2003 || // EM SPACE michael@0: ch == 0x2009 || // THIN SPACE michael@0: ch == 0x3000)) // IDEOGRAPHIC SPACE michael@0: return true; michael@0: michael@0: // otherwise not a space michael@0: return false; michael@0: } michael@0: michael@0: static inline bool michael@0: IsBRElement(nsINode* aNode) michael@0: { michael@0: return aNode->IsElement() && michael@0: aNode->AsElement()->IsHTML(nsGkAtoms::br); michael@0: } michael@0: michael@0: /** michael@0: * Given a TextNode, checks to see if there's a DOM word separator before michael@0: * aBeforeOffset within it. This function does not modify aSeparatorOffset when michael@0: * it returns false. michael@0: * michael@0: * @param aNode the TextNode to check. michael@0: * @param aBeforeOffset the offset in the TextNode before which we will search michael@0: * for the DOM separator. You can pass INT32_MAX to search the entire michael@0: * length of the string. michael@0: * @param aSeparatorOffset will be set to the offset of the first separator it michael@0: * encounters. Will not be written to if no separator is found. michael@0: * @returns True if it found a separator. michael@0: */ michael@0: static bool michael@0: TextNodeContainsDOMWordSeparator(nsINode* aNode, michael@0: int32_t aBeforeOffset, michael@0: int32_t* aSeparatorOffset) michael@0: { michael@0: // aNode is actually an nsIContent, since it's eTEXT michael@0: nsIContent* content = static_cast(aNode); michael@0: const nsTextFragment* textFragment = content->GetText(); michael@0: NS_ASSERTION(textFragment, "Where is our text?"); michael@0: for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) { michael@0: if (IsDOMWordSeparator(textFragment->CharAt(i))) { michael@0: // Be greedy, find as many separators as we can michael@0: for (int32_t j = i - 1; j >= 0; --j) { michael@0: if (IsDOMWordSeparator(textFragment->CharAt(j))) { michael@0: i = j; michael@0: } else { michael@0: break; michael@0: } michael@0: } michael@0: *aSeparatorOffset = i; michael@0: return true; michael@0: } michael@0: } michael@0: return false; michael@0: } michael@0: michael@0: /** michael@0: * Check if there's a DOM word separator before aBeforeOffset in this node. michael@0: * Always returns true if it's a BR element. michael@0: * aSeparatorOffset is set to the index of the first character in the last michael@0: * separator if any is found (0 for BR elements). michael@0: * michael@0: * This function does not modify aSeparatorOffset when it returns false. michael@0: */ michael@0: static bool michael@0: ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset, michael@0: int32_t* aSeparatorOffset) michael@0: { michael@0: if (IsBRElement(aNode)) { michael@0: *aSeparatorOffset = 0; michael@0: return true; michael@0: } michael@0: michael@0: if (!IsTextNode(aNode)) michael@0: return false; michael@0: michael@0: return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset, michael@0: aSeparatorOffset); michael@0: } michael@0: michael@0: static bool michael@0: IsBreakElement(nsINode* aNode) michael@0: { michael@0: if (!aNode->IsElement()) { michael@0: return false; michael@0: } michael@0: michael@0: dom::Element *element = aNode->AsElement(); michael@0: michael@0: if (element->IsHTML(nsGkAtoms::br)) michael@0: return true; michael@0: michael@0: // If we don't have a frame, we don't consider ourselves a break michael@0: // element. In particular, words can span us. michael@0: if (!element->GetPrimaryFrame()) michael@0: return false; michael@0: michael@0: // Anything that's not an inline element is a break element. michael@0: // XXXbz should replaced inlines be break elements, though? michael@0: return element->GetPrimaryFrame()->StyleDisplay()->mDisplay != michael@0: NS_STYLE_DISPLAY_INLINE; michael@0: } michael@0: michael@0: struct CheckLeavingBreakElementClosure { michael@0: bool mLeftBreakElement; michael@0: }; michael@0: michael@0: static void michael@0: CheckLeavingBreakElement(nsINode* aNode, void* aClosure) michael@0: { michael@0: CheckLeavingBreakElementClosure* cl = michael@0: static_cast(aClosure); michael@0: if (!cl->mLeftBreakElement && IsBreakElement(aNode)) { michael@0: cl->mLeftBreakElement = true; michael@0: } michael@0: } michael@0: michael@0: void michael@0: mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord) michael@0: { michael@0: nsAutoString result; michael@0: ::NormalizeWord(aWord, 0, aWord.Length(), result); michael@0: aWord = result; michael@0: } michael@0: michael@0: void michael@0: mozInlineSpellWordUtil::BuildSoftText() michael@0: { michael@0: // First we have to work backwards from mSoftStart to find a text node michael@0: // containing a DOM word separator, a non-inline-element michael@0: // boundary, or the hard start node. That's where we'll start building the michael@0: // soft string from. michael@0: nsINode* node = mSoftBegin.mNode; michael@0: int32_t firstOffsetInNode = 0; michael@0: int32_t checkBeforeOffset = mSoftBegin.mOffset; michael@0: while (node) { michael@0: if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) { michael@0: if (node == mSoftBegin.mNode) { michael@0: // If we find a word separator on the first node, look at the preceding michael@0: // word on the text node as well. michael@0: int32_t newOffset = 0; michael@0: if (firstOffsetInNode > 0) { michael@0: // Try to find the previous word boundary in the current node. If michael@0: // we can't find one, start checking previous sibling nodes (if any michael@0: // adjacent ones exist) to see if we can find any text nodes with michael@0: // DOM word separators. We bail out as soon as we see a node that is michael@0: // not a text node, or we run out of previous sibling nodes. In the michael@0: // event that we simply cannot find any preceding word separator, the michael@0: // offset is set to 0, and the soft text beginning node is set to the michael@0: // "most previous" text node before the original starting node, or michael@0: // kept at the original starting node if no previous text nodes exist. michael@0: if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1, michael@0: &newOffset)) { michael@0: nsINode* prevNode = node->GetPreviousSibling(); michael@0: while (prevNode && IsTextNode(prevNode)) { michael@0: mSoftBegin.mNode = prevNode; michael@0: if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX, michael@0: &newOffset)) { michael@0: break; michael@0: } michael@0: prevNode = prevNode->GetPreviousSibling(); michael@0: } michael@0: } michael@0: } michael@0: firstOffsetInNode = newOffset; michael@0: mSoftBegin.mOffset = newOffset; michael@0: } michael@0: break; michael@0: } michael@0: checkBeforeOffset = INT32_MAX; michael@0: if (IsBreakElement(node)) { michael@0: // Since GetPreviousContent follows tree *preorder*, we're about to traverse michael@0: // up out of 'node'. Since node induces breaks (e.g., it's a block), michael@0: // don't bother trying to look outside it, just stop now. michael@0: break; michael@0: } michael@0: // GetPreviousContent below expects mRootNode to be an ancestor of node. michael@0: if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) { michael@0: break; michael@0: } michael@0: node = node->GetPreviousContent(mRootNode); michael@0: } michael@0: michael@0: // Now build up the string moving forward through the DOM until we reach michael@0: // the soft end and *then* see a DOM word separator, a non-inline-element michael@0: // boundary, or the hard end node. michael@0: mSoftText.Truncate(); michael@0: mSoftTextDOMMapping.Clear(); michael@0: bool seenSoftEnd = false; michael@0: // Leave this outside the loop so large heap string allocations can be reused michael@0: // across iterations michael@0: while (node) { michael@0: if (node == mSoftEnd.mNode) { michael@0: seenSoftEnd = true; michael@0: } michael@0: michael@0: bool exit = false; michael@0: if (IsTextNode(node)) { michael@0: nsIContent* content = static_cast(node); michael@0: NS_ASSERTION(content, "Where is our content?"); michael@0: const nsTextFragment* textFragment = content->GetText(); michael@0: NS_ASSERTION(textFragment, "Where is our text?"); michael@0: int32_t lastOffsetInNode = textFragment->GetLength(); michael@0: michael@0: if (seenSoftEnd) { michael@0: // check whether we can stop after this michael@0: for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0; michael@0: i < int32_t(textFragment->GetLength()); ++i) { michael@0: if (IsDOMWordSeparator(textFragment->CharAt(i))) { michael@0: exit = true; michael@0: // stop at the first separator after the soft end point michael@0: lastOffsetInNode = i; michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (firstOffsetInNode < lastOffsetInNode) { michael@0: int32_t len = lastOffsetInNode - firstOffsetInNode; michael@0: mSoftTextDOMMapping.AppendElement( michael@0: DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len)); michael@0: michael@0: bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len, michael@0: mozilla::fallible_t()); michael@0: if (!ok) { michael@0: // probably out of memory, remove from mSoftTextDOMMapping michael@0: mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1); michael@0: exit = true; michael@0: } michael@0: } michael@0: michael@0: firstOffsetInNode = 0; michael@0: } michael@0: michael@0: if (exit) michael@0: break; michael@0: michael@0: CheckLeavingBreakElementClosure closure = { false }; michael@0: node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure); michael@0: if (closure.mLeftBreakElement || (node && IsBreakElement(node))) { michael@0: // We left, or are entering, a break element (e.g., block). Maybe we can michael@0: // stop now. michael@0: if (seenSoftEnd) michael@0: break; michael@0: // Record the break michael@0: mSoftText.Append(' '); michael@0: } michael@0: } michael@0: michael@0: #ifdef DEBUG_SPELLCHECK michael@0: printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get()); michael@0: #endif michael@0: } michael@0: michael@0: void michael@0: mozInlineSpellWordUtil::BuildRealWords() michael@0: { michael@0: // This is pretty simple. We just have to walk mSoftText, tokenizing it michael@0: // into "real words". michael@0: // We do an outer traversal of words delimited by IsDOMWordSeparator, calling michael@0: // SplitDOMWord on each of those DOM words michael@0: int32_t wordStart = -1; michael@0: mRealWords.Clear(); michael@0: for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) { michael@0: if (IsDOMWordSeparator(mSoftText.CharAt(i))) { michael@0: if (wordStart >= 0) { michael@0: SplitDOMWord(wordStart, i); michael@0: wordStart = -1; michael@0: } michael@0: } else { michael@0: if (wordStart < 0) { michael@0: wordStart = i; michael@0: } michael@0: } michael@0: } michael@0: if (wordStart >= 0) { michael@0: SplitDOMWord(wordStart, mSoftText.Length()); michael@0: } michael@0: } michael@0: michael@0: /*********** DOM/realwords<->mSoftText mapping functions ************/ michael@0: michael@0: int32_t michael@0: mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset) michael@0: { michael@0: if (!mSoftTextValid) { michael@0: NS_ERROR("Soft text must be valid if we're to map into it"); michael@0: return -1; michael@0: } michael@0: michael@0: for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) { michael@0: const DOMTextMapping& map = mSoftTextDOMMapping[i]; michael@0: if (map.mNodeOffset.mNode == aNodeOffset.mNode) { michael@0: // Allow offsets at either end of the string, in particular, allow the michael@0: // offset that's at the end of the contributed string michael@0: int32_t offsetInContributedString = michael@0: aNodeOffset.mOffset - map.mNodeOffset.mOffset; michael@0: if (offsetInContributedString >= 0 && michael@0: offsetInContributedString <= map.mLength) michael@0: return map.mSoftTextOffset + offsetInContributedString; michael@0: return -1; michael@0: } michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: mozInlineSpellWordUtil::NodeOffset michael@0: mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset, michael@0: DOMMapHint aHint) michael@0: { michael@0: NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it"); michael@0: if (!mSoftTextValid) michael@0: return NodeOffset(nullptr, -1); michael@0: michael@0: // The invariant is that the range start..end includes the last mapping, michael@0: // if any, such that mSoftTextOffset <= aSoftTextOffset michael@0: int32_t start = 0; michael@0: int32_t end = mSoftTextDOMMapping.Length(); michael@0: while (end - start >= 2) { michael@0: int32_t mid = (start + end)/2; michael@0: const DOMTextMapping& map = mSoftTextDOMMapping[mid]; michael@0: if (map.mSoftTextOffset > aSoftTextOffset) { michael@0: end = mid; michael@0: } else { michael@0: start = mid; michael@0: } michael@0: } michael@0: michael@0: if (start >= end) michael@0: return NodeOffset(nullptr, -1); michael@0: michael@0: // 'start' is now the last mapping, if any, such that michael@0: // mSoftTextOffset <= aSoftTextOffset. michael@0: // If we're doing HINT_END, then we may want to return the end of the michael@0: // the previous mapping instead of the start of this mapping michael@0: if (aHint == HINT_END && start > 0) { michael@0: const DOMTextMapping& map = mSoftTextDOMMapping[start - 1]; michael@0: if (map.mSoftTextOffset + map.mLength == aSoftTextOffset) michael@0: return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength); michael@0: } michael@0: michael@0: // We allow ourselves to return the end of this mapping even if we're michael@0: // doing HINT_START. This will only happen if there is no mapping which this michael@0: // point is the start of. I'm not 100% sure this is OK... michael@0: const DOMTextMapping& map = mSoftTextDOMMapping[start]; michael@0: int32_t offset = aSoftTextOffset - map.mSoftTextOffset; michael@0: if (offset >= 0 && offset <= map.mLength) michael@0: return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset); michael@0: michael@0: return NodeOffset(nullptr, -1); michael@0: } michael@0: michael@0: int32_t michael@0: mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset, michael@0: DOMMapHint aHint, bool aSearchForward) michael@0: { michael@0: NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it"); michael@0: if (!mSoftTextValid) michael@0: return -1; michael@0: michael@0: // The invariant is that the range start..end includes the last word, michael@0: // if any, such that mSoftTextOffset <= aSoftTextOffset michael@0: int32_t start = 0; michael@0: int32_t end = mRealWords.Length(); michael@0: while (end - start >= 2) { michael@0: int32_t mid = (start + end)/2; michael@0: const RealWord& word = mRealWords[mid]; michael@0: if (word.mSoftTextOffset > aSoftTextOffset) { michael@0: end = mid; michael@0: } else { michael@0: start = mid; michael@0: } michael@0: } michael@0: michael@0: if (start >= end) michael@0: return -1; michael@0: michael@0: // 'start' is now the last word, if any, such that michael@0: // mSoftTextOffset <= aSoftTextOffset. michael@0: // If we're doing HINT_END, then we may want to return the end of the michael@0: // the previous word instead of the start of this word michael@0: if (aHint == HINT_END && start > 0) { michael@0: const RealWord& word = mRealWords[start - 1]; michael@0: if (word.mSoftTextOffset + word.mLength == aSoftTextOffset) michael@0: return start - 1; michael@0: } michael@0: michael@0: // We allow ourselves to return the end of this word even if we're michael@0: // doing HINT_START. This will only happen if there is no word which this michael@0: // point is the start of. I'm not 100% sure this is OK... michael@0: const RealWord& word = mRealWords[start]; michael@0: int32_t offset = aSoftTextOffset - word.mSoftTextOffset; michael@0: if (offset >= 0 && offset <= word.mLength) michael@0: return start; michael@0: michael@0: if (aSearchForward) { michael@0: if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) { michael@0: // All words have mSoftTextOffset > aSoftTextOffset michael@0: return 0; michael@0: } michael@0: // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset. michael@0: // Word start+1, if it exists, will be the first with michael@0: // mSoftTextOffset > aSoftTextOffset. michael@0: if (start + 1 < int32_t(mRealWords.Length())) michael@0: return start + 1; michael@0: } michael@0: michael@0: return -1; michael@0: } michael@0: michael@0: /*********** Word Splitting ************/ michael@0: michael@0: // classifies a given character in the DOM word michael@0: enum CharClass { michael@0: CHAR_CLASS_WORD, michael@0: CHAR_CLASS_SEPARATOR, michael@0: CHAR_CLASS_END_OF_INPUT }; michael@0: michael@0: // Encapsulates DOM-word to real-word splitting michael@0: struct MOZ_STACK_CLASS WordSplitState michael@0: { michael@0: mozInlineSpellWordUtil* mWordUtil; michael@0: const nsDependentSubstring mDOMWordText; michael@0: int32_t mDOMWordOffset; michael@0: CharClass mCurCharClass; michael@0: michael@0: WordSplitState(mozInlineSpellWordUtil* aWordUtil, michael@0: const nsString& aString, int32_t aStart, int32_t aLen) michael@0: : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen), michael@0: mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {} michael@0: michael@0: CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const; michael@0: void Advance(); michael@0: void AdvanceThroughSeparators(); michael@0: void AdvanceThroughWord(); michael@0: michael@0: // Finds special words like email addresses and URLs that may start at the michael@0: // current position, and returns their length, or 0 if not found. This allows michael@0: // arbitrary word breaking rules to be used for these special entities, as michael@0: // long as they can not contain whitespace. michael@0: bool IsSpecialWord(); michael@0: michael@0: // Similar to IsSpecialWord except that this takes a split word as michael@0: // input. This checks for things that do not require special word-breaking michael@0: // rules. michael@0: bool ShouldSkipWord(int32_t aStart, int32_t aLength); michael@0: }; michael@0: michael@0: // WordSplitState::ClassifyCharacter michael@0: michael@0: CharClass michael@0: WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const michael@0: { michael@0: NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()), michael@0: "Index out of range"); michael@0: if (aIndex == int32_t(mDOMWordText.Length())) michael@0: return CHAR_CLASS_SEPARATOR; michael@0: michael@0: // this will classify the character, we want to treat "ignorable" characters michael@0: // such as soft hyphens, and also ZWJ and ZWNJ as word characters. michael@0: nsIUGenCategory::nsUGenCategory michael@0: charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]); michael@0: if (charCategory == nsIUGenCategory::kLetter || michael@0: IsIgnorableCharacter(mDOMWordText[aIndex]) || michael@0: mDOMWordText[aIndex] == 0x200C /* ZWNJ */ || michael@0: mDOMWordText[aIndex] == 0x200D /* ZWJ */) michael@0: return CHAR_CLASS_WORD; michael@0: michael@0: // If conditional punctuation is surrounded immediately on both sides by word michael@0: // characters it also counts as a word character. michael@0: if (IsConditionalPunctuation(mDOMWordText[aIndex])) { michael@0: if (!aRecurse) { michael@0: // not allowed to look around, this punctuation counts like a separator michael@0: return CHAR_CLASS_SEPARATOR; michael@0: } michael@0: michael@0: // check the left-hand character michael@0: if (aIndex == 0) michael@0: return CHAR_CLASS_SEPARATOR; michael@0: if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) michael@0: return CHAR_CLASS_SEPARATOR; michael@0: // If the previous charatcer is a word-char, make sure that it's not a michael@0: // special dot character. michael@0: if (mDOMWordText[aIndex - 1] == '.') michael@0: return CHAR_CLASS_SEPARATOR; michael@0: michael@0: // now we know left char is a word-char, check the right-hand character michael@0: if (aIndex == int32_t(mDOMWordText.Length()) - 1) michael@0: return CHAR_CLASS_SEPARATOR; michael@0: if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD) michael@0: return CHAR_CLASS_SEPARATOR; michael@0: // If the next charatcer is a word-char, make sure that it's not a michael@0: // special dot character. michael@0: if (mDOMWordText[aIndex + 1] == '.') michael@0: return CHAR_CLASS_SEPARATOR; michael@0: michael@0: // char on either side is a word, this counts as a word michael@0: return CHAR_CLASS_WORD; michael@0: } michael@0: michael@0: // The dot character, if appearing at the end of a word, should michael@0: // be considered part of that word. Example: "etc.", or michael@0: // abbreviations michael@0: if (aIndex > 0 && michael@0: mDOMWordText[aIndex] == '.' && michael@0: mDOMWordText[aIndex - 1] != '.' && michael@0: ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) { michael@0: return CHAR_CLASS_WORD; michael@0: } michael@0: michael@0: // all other punctuation michael@0: if (charCategory == nsIUGenCategory::kSeparator || michael@0: charCategory == nsIUGenCategory::kOther || michael@0: charCategory == nsIUGenCategory::kPunctuation || michael@0: charCategory == nsIUGenCategory::kSymbol) { michael@0: // Don't break on hyphens, as hunspell handles them on its own. michael@0: if (aIndex > 0 && michael@0: mDOMWordText[aIndex] == '-' && michael@0: mDOMWordText[aIndex - 1] != '-' && michael@0: ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) { michael@0: // A hyphen is only meaningful as a separator inside a word michael@0: // if the previous and next characters are a word character. michael@0: if (aIndex == int32_t(mDOMWordText.Length()) - 1) michael@0: return CHAR_CLASS_SEPARATOR; michael@0: if (mDOMWordText[aIndex + 1] != '.' && michael@0: ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD) michael@0: return CHAR_CLASS_WORD; michael@0: } michael@0: return CHAR_CLASS_SEPARATOR; michael@0: } michael@0: michael@0: // any other character counts as a word michael@0: return CHAR_CLASS_WORD; michael@0: } michael@0: michael@0: michael@0: // WordSplitState::Advance michael@0: michael@0: void michael@0: WordSplitState::Advance() michael@0: { michael@0: NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index"); michael@0: NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(), michael@0: "Length beyond end"); michael@0: michael@0: mDOMWordOffset ++; michael@0: if (mDOMWordOffset >= (int32_t)mDOMWordText.Length()) michael@0: mCurCharClass = CHAR_CLASS_END_OF_INPUT; michael@0: else michael@0: mCurCharClass = ClassifyCharacter(mDOMWordOffset, true); michael@0: } michael@0: michael@0: michael@0: // WordSplitState::AdvanceThroughSeparators michael@0: michael@0: void michael@0: WordSplitState::AdvanceThroughSeparators() michael@0: { michael@0: while (mCurCharClass == CHAR_CLASS_SEPARATOR) michael@0: Advance(); michael@0: } michael@0: michael@0: // WordSplitState::AdvanceThroughWord michael@0: michael@0: void michael@0: WordSplitState::AdvanceThroughWord() michael@0: { michael@0: while (mCurCharClass == CHAR_CLASS_WORD) michael@0: Advance(); michael@0: } michael@0: michael@0: michael@0: // WordSplitState::IsSpecialWord michael@0: michael@0: bool michael@0: WordSplitState::IsSpecialWord() michael@0: { michael@0: // Search for email addresses. We simply define these as any sequence of michael@0: // characters with an '@' character in the middle. The DOM word is already michael@0: // split on whitepace, so we know that everything to the end is the address michael@0: int32_t firstColon = -1; michael@0: for (int32_t i = mDOMWordOffset; michael@0: i < int32_t(mDOMWordText.Length()); i ++) { michael@0: if (mDOMWordText[i] == '@') { michael@0: // only accept this if there are unambiguous word characters (don't bother michael@0: // recursing to disambiguate apostrophes) on each side. This prevents michael@0: // classifying, e.g. "@home" as an email address michael@0: michael@0: // Use this condition to only accept words with '@' in the middle of michael@0: // them. It works, but the inlinespellcker doesn't like this. The problem michael@0: // is that you type "fhsgfh@" that's a misspelled word followed by a michael@0: // symbol, but when you type another letter "fhsgfh@g" that first word michael@0: // need to be unmarked misspelled. It doesn't do this. it only checks the michael@0: // current position for potentially removing a spelling range. michael@0: if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD && michael@0: i < (int32_t)mDOMWordText.Length() - 1 && michael@0: ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) { michael@0: return true; michael@0: } michael@0: } else if (mDOMWordText[i] == ':' && firstColon < 0) { michael@0: firstColon = i; michael@0: michael@0: // If the first colon is followed by a slash, consider it a URL michael@0: // This will catch things like asdf://foo.com michael@0: if (firstColon < (int32_t)mDOMWordText.Length() - 1 && michael@0: mDOMWordText[firstColon + 1] == '/') { michael@0: return true; michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Check the text before the first colon against some known protocols. It michael@0: // is impossible to check against all protocols, especially since you can michael@0: // plug in new protocols. We also don't want to waste time here checking michael@0: // against a lot of obscure protocols. michael@0: if (firstColon > mDOMWordOffset) { michael@0: nsString protocol(Substring(mDOMWordText, mDOMWordOffset, michael@0: firstColon - mDOMWordOffset)); michael@0: if (protocol.EqualsIgnoreCase("http") || michael@0: protocol.EqualsIgnoreCase("https") || michael@0: protocol.EqualsIgnoreCase("news") || michael@0: protocol.EqualsIgnoreCase("file") || michael@0: protocol.EqualsIgnoreCase("javascript") || michael@0: protocol.EqualsIgnoreCase("data") || michael@0: protocol.EqualsIgnoreCase("ftp")) { michael@0: return true; michael@0: } michael@0: } michael@0: michael@0: // not anything special michael@0: return false; michael@0: } michael@0: michael@0: // WordSplitState::ShouldSkipWord michael@0: michael@0: bool michael@0: WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength) michael@0: { michael@0: int32_t last = aStart + aLength; michael@0: michael@0: // check to see if the word contains a digit michael@0: for (int32_t i = aStart; i < last; i ++) { michael@0: if (unicode::GetGenCategory(mDOMWordText[i]) == nsIUGenCategory::kNumber) { michael@0: return true; michael@0: } michael@0: } michael@0: michael@0: // not special michael@0: return false; michael@0: } michael@0: michael@0: // mozInlineSpellWordUtil::SplitDOMWord michael@0: michael@0: void michael@0: mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd) michael@0: { michael@0: WordSplitState state(this, mSoftText, aStart, aEnd - aStart); michael@0: state.mCurCharClass = state.ClassifyCharacter(0, true); michael@0: michael@0: state.AdvanceThroughSeparators(); michael@0: if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && michael@0: state.IsSpecialWord()) { michael@0: int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset; michael@0: mRealWords.AppendElement( michael@0: RealWord(aStart + state.mDOMWordOffset, specialWordLength, false)); michael@0: michael@0: return; michael@0: } michael@0: michael@0: while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) { michael@0: state.AdvanceThroughSeparators(); michael@0: if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) michael@0: break; michael@0: michael@0: // save the beginning of the word michael@0: int32_t wordOffset = state.mDOMWordOffset; michael@0: michael@0: // find the end of the word michael@0: state.AdvanceThroughWord(); michael@0: int32_t wordLen = state.mDOMWordOffset - wordOffset; michael@0: mRealWords.AppendElement( michael@0: RealWord(aStart + wordOffset, wordLen, michael@0: !state.ShouldSkipWord(wordOffset, wordLen))); michael@0: } michael@0: }