The Tor Browser: extensions/spellcheck/src/mozInlineSpellWordUtil.cpp@6474c204b198 (annotated)

extensions/spellcheck/src/mozInlineSpellWordUtil.cpp@6474c204b198 (annotated)

extensions/spellcheck/src/mozInlineSpellWordUtil.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author: Michael Schloh von Bennewitz <michael@schloh.com>
date: Wed, 31 Dec 2014 06:09:35 +0100
changeset 0: 6474c204b198
permissions: -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 #include "mozInlineSpellWordUtil.h"
 #include "nsDebug.h"
 #include "nsIAtom.h"
 #include "nsComponentManagerUtils.h"
 #include "nsIDOMCSSStyleDeclaration.h"
 #include "nsIDOMElement.h"
 #include "nsIDOMRange.h"
 #include "nsIEditor.h"
 #include "nsIDOMNode.h"
 #include "nsIDOMHTMLBRElement.h"
 #include "nsUnicharUtilCIID.h"
 #include "nsUnicodeProperties.h"
 #include "nsServiceManagerUtils.h"
 #include "nsIContent.h"
 #include "nsTextFragment.h"
 #include "mozilla/dom/Element.h"
 #include "nsRange.h"
 #include "nsContentUtils.h"
 #include "nsIFrame.h"
 #include <algorithm>
 using namespace mozilla;
 // IsIgnorableCharacter
 //
 //    Reports whether |ch| is one of the soft-hyphen code points that
 //    should be ignored in input.
 inline bool IsIgnorableCharacter(char16_t ch)
 {
   switch (ch) {
     case 0xAD:    // SOFT HYPHEN
     case 0x1806:  // MONGOLIAN TODO SOFT HYPHEN
       return true;
     default:
       return false;
   }
 }
 // IsConditionalPunctuation
 //
 //    Reports whether |ch| is punctuation (such as an apostrophe) that only
 //    counts as part of a word when it has word characters on both sides,
 //    and is ordinary punctuation otherwise.
 inline bool IsConditionalPunctuation(char16_t ch)
 {
   switch (ch) {
     case '\'':
     case 0x2019:  // RIGHT SINGLE QUOTATION MARK
     case 0x00B7:  // MIDDLE DOT
       return true;
     default:
       return false;
   }
 }
 // mozInlineSpellWordUtil::Init
 //
 //    Resolves the weak editor reference and caches the editor's document
 //    (mDOMDocument / mDocument) and root element (mRootNode). Returns the
 //    failing nsresult if the editor, its document or its root element
 //    cannot be obtained.
 nsresult
 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
 {
   // Resolving the editor commonly fails because the editor was detached,
   // so this failure is reported to the caller without asserting.
   nsresult rv;
   nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
   if (NS_FAILED(rv)) {
     return rv;
   }

   nsCOMPtr<nsIDOMDocument> editorDoc;
   rv = editor->GetDocument(getter_AddRefs(editorDoc));
   NS_ENSURE_SUCCESS(rv, rv);
   NS_ENSURE_TRUE(editorDoc, NS_ERROR_NULL_POINTER);
   mDOMDocument = editorDoc;
   mDocument = do_QueryInterface(editorDoc);

   // Find the root node for the editor. For contenteditable we'll need
   // something cleverer here.
   nsCOMPtr<nsIDOMElement> rootElement;
   rv = editor->GetRootElement(getter_AddRefs(rootElement));
   NS_ENSURE_SUCCESS(rv, rv);
   nsCOMPtr<nsINode> rootAsNode = do_QueryInterface(rootElement);
   mRootNode = rootAsNode;
   NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
   return NS_OK;
 }
 // Returns true when aNode reports itself as a node of type nsINode::eTEXT.
 // aNode must not be null.
 static inline bool
 IsTextNode(nsINode* aNode)
 {
   const bool isText = aNode->IsNodeOfType(nsINode::eTEXT);
   return isText;
 }
 // Callback type used by FindNextNode: invoked with the node being left and
 // the caller-supplied closure pointer.
 typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
 // Returns the node that follows aNode in a preorder walk of the DOM tree,
 // never stepping outside aRoot, or nullptr when the walk is finished.
 //
 // aOnLeaveNode (if non-null) is called for every node the walk climbs out
 // of on its way up to an ancestor's next sibling; that callback is the
 // reason GetNextNode can't simply be used here.
 static nsINode*
 FindNextNode(nsINode* aNode, nsINode* aRoot,
              OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure)
 {
   NS_PRECONDITION(aNode, "Null starting node?");

   // Descend first.
   if (nsINode* firstChild = aNode->GetFirstChild()) {
     return firstChild;
   }

   // Never look at siblings of, or anything else outside, aRoot.
   if (aNode == aRoot) {
     return nullptr;
   }

   if (nsINode* sibling = aNode->GetNextSibling()) {
     return sibling;
   }

   // No child and no sibling: climb until some ancestor has a next sibling.
   nsINode* leaving = aNode;
   while (true) {
     if (aOnLeaveNode) {
       aOnLeaveNode(leaving, aClosure);
     }

     nsINode* parent = leaving->GetParent();
     if (!parent || parent == aRoot) {
       return nullptr;
     }

     if (nsINode* parentSibling = parent->GetNextSibling()) {
       return parentSibling;
     }
     leaving = parent;
   }
 }
 // Given a non-text node aNode and a child index aOffset, returns the first
 // text node at or after that position in a preorder DOM traversal bounded
 // by aRoot, or nullptr if there is none.
 static nsINode*
 FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot)
 {
   NS_PRECONDITION(aNode, "Null starting node?");
   NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");

   // Begin at the aOffset'th child. If aOffset is beyond the end of the
   // child list, begin at the node that follows aNode's last descendant.
   nsINode* candidate = aNode->GetChildAt(aOffset);
   if (!candidate) {
     candidate = aNode->GetNextNonChildNode(aRoot);
   }

   for (; candidate && !IsTextNode(candidate);
        candidate = candidate->GetNextNode(aRoot)) {
     // keep walking until a text node (or the end) is reached
   }
   return candidate;
 }
 // mozInlineSpellWordUtil::SetEnd
 //
 //    There are two kinds of boundary, "hard" and "soft". The hard boundary
 //    is simply the scope of the root node. The soft boundary is whatever the
 //    caller sets through this function; if it is never called, the soft
 //    boundary is the same as the hard boundary.
 //
 //    On reaching the soft boundary (mSoftEnd) we keep going until the end of
 //    a word, so the caller may put the end anywhere and whole words are
 //    still checked. On reaching the hard boundary we stop no matter what.
 //
 //    There is no beginning soft boundary, because we only go to the previous
 //    node once, when finding the previous word boundary in SetPosition().
 //    That initial position can be thought of as the soft beginning.
 //
 //    If aEndNode is not a text node, the end is moved to offset 0 of the
 //    first text node found after aEndNode/aEndOffset.
 nsresult
 mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset)
 {
   NS_PRECONDITION(aEndNode, "Null end node?");
   NS_ASSERTION(mRootNode, "Not initialized");

   InvalidateWords();

   if (IsTextNode(aEndNode)) {
     mSoftEnd = NodeOffset(aEndNode, aEndOffset);
   } else {
     nsINode* nextText = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
     mSoftEnd = NodeOffset(nextText, 0);
   }
   return NS_OK;
 }
 // Sets the soft beginning (mSoftBegin) to aNode/aOffset, rebuilds the word
 // list, and points mNextWordIndex at the word found for that position
 // (searching forward). If aNode is not a text node, the position is moved
 // to offset 0 of the first text node after aNode/aOffset. mNextWordIndex is
 // not assigned here when the position does not map into the soft text.
 nsresult
 mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset)
 {
   InvalidateWords();

   nsINode* beginNode = aNode;
   int32_t beginOffset = aOffset;
   if (!IsTextNode(beginNode)) {
     beginNode = FindNextTextNode(aNode, aOffset, mRootNode);
     beginOffset = 0;
   }
   mSoftBegin = NodeOffset(beginNode, beginOffset);

   EnsureWords();

   const int32_t softOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
   if (softOffset >= 0) {
     mNextWordIndex = FindRealWordContaining(softOffset, HINT_END, true);
   }
   return NS_OK;
 }
 // Builds the soft text and the real-word list if they are not currently
 // valid, then marks them valid. Does nothing when mSoftTextValid is set.
 void
 mozInlineSpellWordUtil::EnsureWords()
 {
   if (!mSoftTextValid) {
     BuildSoftText();
     BuildRealWords();
     mSoftTextValid = true;
   }
 }
 // Creates a range covering aWord by mapping its soft-text start (with
 // HINT_BEGIN) and end (with HINT_END) back to DOM positions.
 nsresult
 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange)
 {
   NodeOffset wordStart =
     MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
   NodeOffset wordEnd =
     MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
   return MakeRange(wordStart, wordEnd, aRange);
 }
 // mozInlineSpellWordUtil::GetRangeForWord
 //
 //    Returns in aRange the range of the word found at aWordNode/aWordOffset.
 //    Both soft boundaries are reset to that point as a side effect. When
 //    the point does not map into the soft text, or no word is found for it,
 //    a collapsed range at the point is produced instead.
 nsresult
 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
                                         int32_t aWordOffset,
                                         nsRange** aRange)
 {
   // Use the given point as both soft start and soft end.
   nsCOMPtr<nsINode> node = do_QueryInterface(aWordNode);
   NodeOffset point = NodeOffset(node, aWordOffset);

   InvalidateWords();
   mSoftBegin = mSoftEnd = point;
   EnsureWords();

   int32_t softOffset = MapDOMPositionToSoftTextOffset(point);
   if (softOffset >= 0) {
     int32_t index = FindRealWordContaining(softOffset, HINT_BEGIN, false);
     if (index >= 0) {
       return MakeRangeForWord(mRealWords[index], aRange);
     }
   }

   // Nothing found: fall back to a collapsed range at the point.
   return MakeRange(point, point, aRange);
 }
 // Copies aLen characters of aInput, starting at aPos, into aOutput (which
 // is emptied first), adjusting characters the spellchecker may not like:
 // ignorable characters are dropped and U+2019 is replaced by a straight
 // apostrophe.
 static void
 NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput)
 {
   aOutput.Truncate();
   for (int32_t i = 0; i < aLen; ++i) {
     const char16_t raw = aInput.CharAt(aPos + i);
     if (!IsIgnorableCharacter(raw)) {
       // the spellchecker doesn't handle curly apostrophes in all languages,
       // so RIGHT SINGLE QUOTATION MARK becomes a plain apostrophe
       aOutput.Append(raw == 0x2019 ? char16_t('\'') : raw);
     }
   }
 }
 // mozInlineSpellWordUtil::GetNextWord
 //
 //    Returns the next word: its normalized text in aText, its DOM range in
 //    aRange, and in aSkipChecking whether the word is not checkable. When
 //    there are no more words, *aRange is set to null and *aSkipChecking to
 //    true.
 //
 //    FIXME-optimization: we shouldn't have to generate a range every single
 //    time. It would be better if the inline spellchecker didn't require a
 //    range unless the word was misspelled. This may or may not be possible.
 nsresult
 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange,
                                     bool* aSkipChecking)
 {
 #ifdef DEBUG_SPELLCHECK
   printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
 #endif

   const bool noMoreWords =
     mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length());
   if (noMoreWords) {
     mNextWordIndex = -1;
     *aRange = nullptr;
     *aSkipChecking = true;
     return NS_OK;
   }

   const RealWord& current = mRealWords[mNextWordIndex];
   nsresult rv = MakeRangeForWord(current, aRange);
   NS_ENSURE_SUCCESS(rv, rv);

   // Only advance once the range has been created successfully.
   ++mNextWordIndex;
   *aSkipChecking = !current.mCheckableWord;
   ::NormalizeWord(mSoftText, current.mSoftTextOffset, current.mLength, aText);

 #ifdef DEBUG_SPELLCHECK
   printf("GetNextWord returning: %s (skip=%d)\n",
          NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
 #endif

   return NS_OK;
 }
 // mozInlineSpellWordUtil::MakeRange
 //
 //    Convenience function that creates a new range from aBegin to aEnd and
 //    hands ownership to the caller through aRange. Fails if aBegin has no
 //    node, if no document has been set, or if the range rejects the
 //    endpoints.
 nsresult
 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
                                   nsRange** aRange)
 {
   NS_ENSURE_ARG_POINTER(aBegin.mNode);
   if (!mDOMDocument) {
     return NS_ERROR_NOT_INITIALIZED;
   }

   nsRefPtr<nsRange> created = new nsRange(aBegin.mNode);
   nsresult rv = created->Set(aBegin.mNode, aBegin.mOffset,
                              aEnd.mNode, aEnd.mOffset);
   NS_ENSURE_SUCCESS(rv, rv);

   created.forget(aRange);
   return NS_OK;
 }
 /*********** DOM text extraction ************/
 // IsDOMWordSeparator
 //
 //    Determines if the given character should be considered a DOM word
 //    separator. Basically this is whitespace, although it could also hold
 //    punctuation that we know ALWAYS breaks words. This is important: no
 //    punctuation that could appear in a URL or email address may be listed
 //    here, because those always need to fit into a single DOM word.
 static bool
 IsDOMWordSeparator(char16_t ch)
 {
   switch (ch) {
     // simple spaces
     case ' ':
     case '\t':
     case '\n':
     case '\r':
     // complex (non-ASCII) spaces
     case 0x00A0:  // NO-BREAK SPACE
     case 0x2002:  // EN SPACE
     case 0x2003:  // EM SPACE
     case 0x2009:  // THIN SPACE
     case 0x3000:  // IDEOGRAPHIC SPACE
       return true;
     default:
       // otherwise not a space
       return false;
   }
 }
 // Returns true when aNode is an element and that element is an HTML <br>.
 static inline bool
 IsBRElement(nsINode* aNode)
 {
   if (!aNode->IsElement()) {
     return false;
   }
   return aNode->AsElement()->IsHTML(nsGkAtoms::br);
 }
 /**
  * Given a TextNode, looks backwards from aBeforeOffset for a DOM word
  * separator inside it.
  *
  * @param aNode the TextNode to check.
  * @param aBeforeOffset only characters at offsets below this value are
  *        examined. Pass INT32_MAX to search the entire length of the
  *        string.
  * @param aSeparatorOffset receives the offset of the first character of the
  *        last run of consecutive separators before aBeforeOffset. It is not
  *        written to when no separator is found.
  * @returns True if a separator was found.
  */
 static bool
 TextNodeContainsDOMWordSeparator(nsINode* aNode,
                                  int32_t aBeforeOffset,
                                  int32_t* aSeparatorOffset)
 {
   // aNode is actually an nsIContent, since it's eTEXT
   nsIContent* textContent = static_cast<nsIContent*>(aNode);
   const nsTextFragment* fragment = textContent->GetText();
   NS_ASSERTION(fragment, "Where is our text?");

   // Scan backwards for the nearest separator.
   int32_t pos = std::min(aBeforeOffset, int32_t(fragment->GetLength())) - 1;
   while (pos >= 0 && !IsDOMWordSeparator(fragment->CharAt(pos))) {
     --pos;
   }
   if (pos < 0) {
     return false;
   }

   // Be greedy: extend to the start of the run of separators.
   while (pos > 0 && IsDOMWordSeparator(fragment->CharAt(pos - 1))) {
     --pos;
   }
   *aSeparatorOffset = pos;
   return true;
 }
 /**
  * Checks whether this node contains a DOM word separator before
  * aBeforeOffset.
  *
  * A BR element always counts as a separator, reported at offset 0. For a
  * text node, aSeparatorOffset is set to the index of the first character of
  * the last separator run before aBeforeOffset. Any other node has none.
  *
  * aSeparatorOffset is left unmodified when the function returns false.
  */
 static bool
 ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
                          int32_t* aSeparatorOffset)
 {
   if (IsBRElement(aNode)) {
     *aSeparatorOffset = 0;
     return true;
   }

   return IsTextNode(aNode) &&
          TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset,
                                           aSeparatorOffset);
 }
 // Returns true when aNode is an element that breaks words: an HTML <br>,
 // or any element that has a primary frame whose display is not inline.
 static bool
 IsBreakElement(nsINode* aNode)
 {
   if (!aNode->IsElement()) {
     return false;
   }

   dom::Element* elem = aNode->AsElement();
   if (elem->IsHTML(nsGkAtoms::br)) {
     return true;
   }

   // An element without a frame is not considered a break element. In
   // particular, words can span it.
   nsIFrame* frame = elem->GetPrimaryFrame();
   if (!frame) {
     return false;
   }

   // Anything that's not an inline element is a break element.
   // XXXbz should replaced inlines be break elements, though?
   return frame->StyleDisplay()->mDisplay != NS_STYLE_DISPLAY_INLINE;
 }
 // State shared with CheckLeavingBreakElement: records whether any node the
 // traversal has left so far was a break element.
 struct CheckLeavingBreakElementClosure {
   bool          mLeftBreakElement;
 };
 // OnLeaveNodeFunPtr callback. aClosure must point at a
 // CheckLeavingBreakElementClosure; its flag is set (and then stays set)
 // once a break element has been left.
 static void
 CheckLeavingBreakElement(nsINode* aNode, void* aClosure)
 {
   CheckLeavingBreakElementClosure* state =
     static_cast<CheckLeavingBreakElementClosure*>(aClosure);
   if (state->mLeftBreakElement) {
     // already recorded, no need to test this node
     return;
   }
   if (IsBreakElement(aNode)) {
     state->mLeftBreakElement = true;
   }
 }
 // Normalizes aWord in place by running the file-local ::NormalizeWord over
 // the whole string and storing the result back into aWord.
 void
 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
 {
   nsAutoString normalized;
   ::NormalizeWord(aWord, 0, aWord.Length(), normalized);
   aWord.Assign(normalized);
 }
 // Rebuilds mSoftText and mSoftTextDOMMapping from the DOM.
 //
 // Phase 1 walks backwards from mSoftBegin to decide where the soft text
 // should start (and may move mSoftBegin). Phase 2 walks forwards from that
 // node, appending the contents of each text node to mSoftText and recording
 // a DOMTextMapping for each contribution, until the soft end has been seen
 // and a stopping point (separator, break element, or end of traversal) is
 // reached.
 void
 mozInlineSpellWordUtil::BuildSoftText()
 {
   // First we have to work backwards from mSoftBegin to find a text node
   // containing a DOM word separator, a non-inline-element
   // boundary, or the hard start node. That's where we'll start building the
   // soft string from.
   nsINode* node = mSoftBegin.mNode;
   // Offset within 'node' at which the forward pass starts copying text.
   int32_t firstOffsetInNode = 0;
   // Only the starting node is searched up to mSoftBegin.mOffset; every
   // earlier node is searched in full (see INT32_MAX below).
   int32_t checkBeforeOffset = mSoftBegin.mOffset;
   while (node) {
     if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
       if (node == mSoftBegin.mNode) {
         // If we find a word separator on the first node, look at the preceding
         // word on the text node as well.
         int32_t newOffset = 0;
         if (firstOffsetInNode > 0) {
           // Try to find the previous word boundary in the current node. If
           // we can't find one, start checking previous sibling nodes (if any
           // adjacent ones exist) to see if we can find any text nodes with
           // DOM word separators. We bail out as soon as we see a node that is
           // not a text node, or we run out of previous sibling nodes. In the
           // event that we simply cannot find any preceding word separator, the
           // offset is set to 0, and the soft text beginning node is set to the
           // "most previous" text node before the original starting node, or
           // kept at the original starting node if no previous text nodes exist.
           //
           // NOTE(review): this sibling walk moves mSoftBegin.mNode, but the
           // local 'node' used by the forward pass below is not changed, while
           // newOffset is applied to both. Confirm this is intended.
           if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
                                         &newOffset)) {
             nsINode* prevNode = node->GetPreviousSibling();
             while (prevNode && IsTextNode(prevNode)) {
               mSoftBegin.mNode = prevNode;
               if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,
                                                    &newOffset)) {
                 break;
               }
               prevNode = prevNode->GetPreviousSibling();
             }
           }
         }
         firstOffsetInNode = newOffset;
         mSoftBegin.mOffset = newOffset;
       }
       break;
     }
     checkBeforeOffset = INT32_MAX;
     if (IsBreakElement(node)) {
       // Since GetPreviousContent follows tree *preorder*, we're about to traverse
       // up out of 'node'. Since node induces breaks (e.g., it's a block),
       // don't bother trying to look outside it, just stop now.
       break;
     }
     // GetPreviousContent below expects mRootNode to be an ancestor of node.
     if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {
       break;
     }
     node = node->GetPreviousContent(mRootNode);
   }
   // Now build up the string moving forward through the DOM until we reach
   // the soft end and *then* see a DOM word separator, a non-inline-element
   // boundary, or the hard end node.
   mSoftText.Truncate();
   mSoftTextDOMMapping.Clear();
   // Becomes true once the walk has reached mSoftEnd.mNode; only after that
   // may the loop stop at a separator or break element.
   bool seenSoftEnd = false;
   // NOTE(review): the original comment here ("leave this outside the loop so
   // large heap string allocations can be reused across iterations") does not
   // refer to any variable declared in this function; text is appended
   // directly to mSoftText below.
   while (node) {
     if (node == mSoftEnd.mNode) {
       seenSoftEnd = true;
     }
     bool exit = false;
     if (IsTextNode(node)) {
       nsIContent* content = static_cast<nsIContent*>(node);
       NS_ASSERTION(content, "Where is our content?");
       const nsTextFragment* textFragment = content->GetText();
       NS_ASSERTION(textFragment, "Where is our text?");
       // By default the whole remainder of this text node is copied.
       int32_t lastOffsetInNode = textFragment->GetLength();
       if (seenSoftEnd) {
         // check whether we can stop after this
         for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
              i < int32_t(textFragment->GetLength()); ++i) {
           if (IsDOMWordSeparator(textFragment->CharAt(i))) {
             exit = true;
             // stop at the first separator after the soft end point
             lastOffsetInNode = i;
             break;
           }
         }
       }
       if (firstOffsetInNode < lastOffsetInNode) {
         int32_t len = lastOffsetInNode - firstOffsetInNode;
         // Record where this node's text lands in mSoftText before appending,
         // so mSoftText.Length() is the contribution's start offset.
         mSoftTextDOMMapping.AppendElement(
           DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
         bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,
                                          mozilla::fallible_t());
         if (!ok) {
             // probably out of memory, remove from mSoftTextDOMMapping
             mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1);
             exit = true;
         }
       }
       // Only the first text node starts at a non-zero offset.
       firstOffsetInNode = 0;
     }
     if (exit)
       break;
     CheckLeavingBreakElementClosure closure = { false };
     node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
     if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
       // We left, or are entering, a break element (e.g., block). Maybe we can
       // stop now.
       if (seenSoftEnd)
         break;
       // Record the break
       // (the space has no DOMTextMapping entry of its own)
       mSoftText.Append(' ');
     }
   }
 #ifdef DEBUG_SPELLCHECK
   printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
 #endif
 }
 void
 mozInlineSpellWordUtil::BuildRealWords()
 {
   // This is pretty simple. We just have to walk mSoftText, tokenizing it
   // into "real words".
   // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
   // SplitDOMWord on each of those DOM words
   int32_t wordStart = -1;
   mRealWords.Clear();
   for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) {
     if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
       if (wordStart >= 0) {
         SplitDOMWord(wordStart, i);
         wordStart = -1;
       }
     } else {
       if (wordStart < 0) {
         wordStart = i;
       }
     }
   }
   if (wordStart >= 0) {
     SplitDOMWord(wordStart, mSoftText.Length());
   }
 }
 /*********** DOM/realwords<->mSoftText mapping functions ************/
 // Maps a DOM position to its offset within mSoftText. Returns -1 when the
 // soft text is not valid, when no mapping exists for the node, or when the
 // offset falls outside the text that the node contributed.
 int32_t
 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
 {
   if (!mSoftTextValid) {
     NS_ERROR("Soft text must be valid if we're to map into it");
     return -1;
   }

   const int32_t mappingCount = int32_t(mSoftTextDOMMapping.Length());
   for (int32_t idx = 0; idx < mappingCount; ++idx) {
     const DOMTextMapping& entry = mSoftTextDOMMapping[idx];
     if (entry.mNodeOffset.mNode != aNodeOffset.mNode) {
       continue;
     }

     // Offsets at either end of the contributed string are accepted; in
     // particular the offset just past its last character is allowed.
     const int32_t relative = aNodeOffset.mOffset - entry.mNodeOffset.mOffset;
     const bool inRange = relative >= 0 && relative <= entry.mLength;
     return inRange ? entry.mSoftTextOffset + relative : -1;
   }
   return -1;
 }
 // Maps an offset within mSoftText back to a DOM position. aHint decides
 // which node wins when the offset sits exactly between two mappings:
 // HINT_END prefers the end of the earlier mapping. Returns
 // NodeOffset(nullptr, -1) when the soft text is not valid or no mapping
 // covers the offset.
 mozInlineSpellWordUtil::NodeOffset
 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,
                                                        DOMMapHint aHint)
 {
   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
   if (!mSoftTextValid) {
     return NodeOffset(nullptr, -1);
   }

   // Binary search. Invariant: [lo, hi) includes the last mapping, if any,
   // whose mSoftTextOffset <= aSoftTextOffset.
   int32_t lo = 0;
   int32_t hi = mSoftTextDOMMapping.Length();
   while (hi - lo > 1) {
     const int32_t mid = (lo + hi)/2;
     if (mSoftTextDOMMapping[mid].mSoftTextOffset > aSoftTextOffset) {
       hi = mid;
     } else {
       lo = mid;
     }
   }
   if (lo >= hi) {
     // the search range is empty, nothing to map to
     return NodeOffset(nullptr, -1);
   }

   // 'lo' is now the last mapping, if any, with
   // mSoftTextOffset <= aSoftTextOffset. With HINT_END we may want to return
   // the end of the previous mapping rather than the start of this one.
   if (lo > 0 && aHint == HINT_END) {
     const DOMTextMapping& previous = mSoftTextDOMMapping[lo - 1];
     if (previous.mSoftTextOffset + previous.mLength == aSoftTextOffset) {
       return NodeOffset(previous.mNodeOffset.mNode,
                         previous.mNodeOffset.mOffset + previous.mLength);
     }
   }

   // The end of this mapping may be returned even for HINT_BEGIN. That only
   // happens when no mapping starts at this point. (The original author was
   // not 100% sure this is OK.)
   const DOMTextMapping& current = mSoftTextDOMMapping[lo];
   const int32_t relative = aSoftTextOffset - current.mSoftTextOffset;
   if (relative < 0 || relative > current.mLength) {
     return NodeOffset(nullptr, -1);
   }
   return NodeOffset(current.mNodeOffset.mNode,
                     current.mNodeOffset.mOffset + relative);
 }
 // Returns the index in mRealWords of the word containing aSoftTextOffset,
 // or -1 if there is none. With HINT_END, an offset that is exactly the end
 // of the previous word selects that previous word. When no word contains
 // the offset and aSearchForward is true, the index of the first word that
 // starts after the offset is returned instead (if one exists).
 int32_t
 mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset,
     DOMMapHint aHint, bool aSearchForward)
 {
   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
   if (!mSoftTextValid) {
     return -1;
   }

   // Binary search. Invariant: [lo, hi) includes the last word, if any,
   // whose mSoftTextOffset <= aSoftTextOffset.
   int32_t lo = 0;
   int32_t hi = mRealWords.Length();
   while (hi - lo > 1) {
     const int32_t mid = (lo + hi)/2;
     if (mRealWords[mid].mSoftTextOffset > aSoftTextOffset) {
       hi = mid;
     } else {
       lo = mid;
     }
   }
   if (lo >= hi) {
     // the search range is empty, there are no words
     return -1;
   }

   // 'lo' is now the last word, if any, with
   // mSoftTextOffset <= aSoftTextOffset. With HINT_END we may want the
   // previous word when the offset is exactly at its end.
   if (lo > 0 && aHint == HINT_END) {
     const RealWord& previous = mRealWords[lo - 1];
     if (previous.mSoftTextOffset + previous.mLength == aSoftTextOffset) {
       return lo - 1;
     }
   }

   // The end of this word is accepted even for HINT_BEGIN. That only happens
   // when no word starts at this point. (The original author was not 100%
   // sure this is OK.)
   const RealWord& current = mRealWords[lo];
   const int32_t relative = aSoftTextOffset - current.mSoftTextOffset;
   if (relative >= 0 && relative <= current.mLength) {
     return lo;
   }

   if (!aSearchForward) {
     return -1;
   }

   if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
     // All words have mSoftTextOffset > aSoftTextOffset
     return 0;
   }

   // 'lo' is the last word with mSoftTextOffset <= aSoftTextOffset, so word
   // lo+1, if it exists, is the first with mSoftTextOffset > aSoftTextOffset.
   const int32_t following = lo + 1;
   return following < int32_t(mRealWords.Length()) ? following : -1;
 }
 /*********** Word Splitting ************/
 // Classification of a character position within a DOM word.
 enum CharClass {
   // the character is part of a word
   CHAR_CLASS_WORD,
   // the character separates words
   CHAR_CLASS_SEPARATOR,
   // the position is past the last character of the DOM word
   CHAR_CLASS_END_OF_INPUT
 };
 // Encapsulates DOM-word to real-word splitting.
 //
 // Holds a view of one DOM word, a cursor into it (mDOMWordOffset), and the
 // classification of the character under the cursor (mCurCharClass).
 struct MOZ_STACK_CLASS WordSplitState
 {
   // The word utility passed to the constructor.
   mozInlineSpellWordUtil*    mWordUtil;
   // The DOM word: aLen characters of the constructor's aString, starting at
   // aStart. All indices used by this struct are relative to this substring.
   const nsDependentSubstring mDOMWordText;
   // Current position within mDOMWordText; starts at 0.
   int32_t                    mDOMWordOffset;
   // Class of the character at mDOMWordOffset. The constructor initializes it
   // to CHAR_CLASS_END_OF_INPUT, so it has to be assigned by the user of the
   // struct before the Advance* methods are meaningful.
   CharClass                  mCurCharClass;
   WordSplitState(mozInlineSpellWordUtil* aWordUtil,
                  const nsString& aString, int32_t aStart, int32_t aLen)
     : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
       mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
   // Classifies the character at aIndex. When aRecurse is false, conditional
   // punctuation is reported as a separator without inspecting neighbours.
   CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
   // Moves the cursor forward one character and updates mCurCharClass.
   void Advance();
   // Advances while the current character is a separator.
   void AdvanceThroughSeparators();
   // Advances while the current character is a word character.
   void AdvanceThroughWord();
   // Returns true if the text from the current position to the end of the
   // DOM word looks like a special word such as an email address or URL.
   // This allows arbitrary word breaking rules to be used for these special
   // entities, as long as they can not contain whitespace.
   bool IsSpecialWord();
   // Similar to IsSpecialWord except that this takes a split word as
   // input. This checks for things that do not require special word-breaking
   // rules.
   bool ShouldSkipWord(int32_t aStart, int32_t aLength);
 };
 // WordSplitState::ClassifyCharacter
 //
 //    Classifies the character at aIndex of mDOMWordText as a word character
 //    or a separator. aIndex equal to the text length is allowed and yields
 //    CHAR_CLASS_SEPARATOR. When aRecurse is false, conditional punctuation
 //    is classified as a separator without looking at its neighbours; the
 //    neighbour look-ups performed below always pass aRecurse = false.
 CharClass
 WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
 {
   NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
                "Index out of range");
   if (aIndex == int32_t(mDOMWordText.Length()))
     return CHAR_CLASS_SEPARATOR;
   // this will classify the character, we want to treat "ignorable" characters
   // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
   nsIUGenCategory::nsUGenCategory
     charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
   if (charCategory == nsIUGenCategory::kLetter ||
       IsIgnorableCharacter(mDOMWordText[aIndex]) ||
       mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
       mDOMWordText[aIndex] == 0x200D /* ZWJ */)
     return CHAR_CLASS_WORD;
   // If conditional punctuation is surrounded immediately on both sides by word
   // characters it also counts as a word character.
   if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
     if (!aRecurse) {
       // not allowed to look around, this punctuation counts like a separator
       return CHAR_CLASS_SEPARATOR;
     }
     // check the left-hand character
     if (aIndex == 0)
       return CHAR_CLASS_SEPARATOR;
     if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
       return CHAR_CLASS_SEPARATOR;
     // If the previous character is a word-char, make sure that it's not a
     // special dot character.
     if (mDOMWordText[aIndex - 1] == '.')
       return CHAR_CLASS_SEPARATOR;
     // now we know left char is a word-char, check the right-hand character
     if (aIndex == int32_t(mDOMWordText.Length()) - 1)
       return CHAR_CLASS_SEPARATOR;
     if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
       return CHAR_CLASS_SEPARATOR;
     // If the next character is a word-char, make sure that it's not a
     // special dot character.
     if (mDOMWordText[aIndex + 1] == '.')
       return CHAR_CLASS_SEPARATOR;
     // char on either side is a word, this counts as a word
     return CHAR_CLASS_WORD;
   }
   // The dot character, if appearing at the end of a word, should
   // be considered part of that word.  Example: "etc.", or
   // abbreviations
   //
   // NOTE(review): the condition below tests "!= CHAR_CLASS_WORD", so as
   // written a '.' is returned as a word character only when the preceding
   // character is not '.' and is NOT classified as a word character. That
   // does not obviously match the "etc." example above; confirm the intended
   // behavior before changing it.
   if (aIndex > 0 &&
       mDOMWordText[aIndex] == '.' &&
       mDOMWordText[aIndex - 1] != '.' &&
       ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
     return CHAR_CLASS_WORD;
   }
   // all other punctuation
   if (charCategory == nsIUGenCategory::kSeparator ||
       charCategory == nsIUGenCategory::kOther ||
       charCategory == nsIUGenCategory::kPunctuation ||
       charCategory == nsIUGenCategory::kSymbol) {
     // Don't break on hyphens, as hunspell handles them on its own.
     if (aIndex > 0 &&
         mDOMWordText[aIndex] == '-' &&
         mDOMWordText[aIndex - 1] != '-' &&
         ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
       // A hyphen is only kept inside a word if both the previous and the
       // next characters are word characters; a trailing hyphen is a
       // separator.
       if (aIndex == int32_t(mDOMWordText.Length()) - 1)
         return CHAR_CLASS_SEPARATOR;
       if (mDOMWordText[aIndex + 1] != '.' &&
           ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
         return CHAR_CLASS_WORD;
     }
     return CHAR_CLASS_SEPARATOR;
   }
   // any other character counts as a word
   return CHAR_CLASS_WORD;
 }
 // WordSplitState::Advance
 //
 //    Moves the cursor one character forward and refreshes mCurCharClass,
 //    which becomes CHAR_CLASS_END_OF_INPUT once the cursor reaches the end
 //    of the DOM word.
 void
 WordSplitState::Advance()
 {
   NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
   NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
                "Length beyond end");

   ++mDOMWordOffset;
   const bool atEnd = mDOMWordOffset >= (int32_t)mDOMWordText.Length();
   mCurCharClass = atEnd ? CHAR_CLASS_END_OF_INPUT
                         : ClassifyCharacter(mDOMWordOffset, true);
 }
 // WordSplitState::AdvanceThroughSeparators
 //
 //    Skips forward until the current character is no longer a separator.
 void
 WordSplitState::AdvanceThroughSeparators()
 {
   for (;;) {
     if (mCurCharClass != CHAR_CLASS_SEPARATOR) {
       return;
     }
     Advance();
   }
 }
 // WordSplitState::AdvanceThroughWord
 //
 //    Skips forward until the current character is no longer a word
 //    character.
 void
 WordSplitState::AdvanceThroughWord()
 {
   for (;;) {
     if (mCurCharClass != CHAR_CLASS_WORD) {
       return;
     }
     Advance();
   }
 }
 // WordSplitState::IsSpecialWord
 bool
 WordSplitState::IsSpecialWord()
 {
   // Search for email addresses. We simply define these as any sequence of
   // characters with an '@' character in the middle. The DOM word is already
   // split on whitepace, so we know that everything to the end is the address
   int32_t firstColon = -1;
   for (int32_t i = mDOMWordOffset;
        i < int32_t(mDOMWordText.Length()); i ++) {
     if (mDOMWordText[i] == '@') {
       // only accept this if there are unambiguous word characters (don't bother
       // recursing to disambiguate apostrophes) on each side. This prevents
       // classifying, e.g. "@home" as an email address
       // Use this condition to only accept words with '@' in the middle of
       // them. It works, but the inlinespellcker doesn't like this. The problem
       // is that you type "fhsgfh@" that's a misspelled word followed by a
       // symbol, but when you type another letter "fhsgfh@g" that first word
       // need to be unmarked misspelled. It doesn't do this. it only checks the
       // current position for potentially removing a spelling range.
       if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
           i < (int32_t)mDOMWordText.Length() - 1 &&
           ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
         return true;
       }
     } else if (mDOMWordText[i] == ':' && firstColon < 0) {
       firstColon = i;
       // If the first colon is followed by a slash, consider it a URL
       // This will catch things like asdf://foo.com
       if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
           mDOMWordText[firstColon + 1] == '/') {
         return true;
       }
     }
   }
   // Check the text before the first colon against some known protocols. It
   // is impossible to check against all protocols, especially since you can
   // plug in new protocols. We also don't want to waste time here checking
   // against a lot of obscure protocols.
   if (firstColon > mDOMWordOffset) {
     nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
                       firstColon - mDOMWordOffset));
     if (protocol.EqualsIgnoreCase("http") ||
         protocol.EqualsIgnoreCase("https") ||
         protocol.EqualsIgnoreCase("news") ||
         protocol.EqualsIgnoreCase("file") ||
         protocol.EqualsIgnoreCase("javascript") ||
         protocol.EqualsIgnoreCase("data") ||
         protocol.EqualsIgnoreCase("ftp")) {
       return true;
     }
   }
   // not anything special
   return false;
 }
 // WordSplitState::ShouldSkipWord
 //
 //    Returns true if any of the aLength characters of mDOMWordText starting
 //    at aStart has the general category "number".
 bool
 WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)
 {
   const int32_t end = aStart + aLength;
   int32_t idx = aStart;
   while (idx < end) {
     // a word containing a digit is skipped
     if (unicode::GetGenCategory(mDOMWordText[idx]) == nsIUGenCategory::kNumber) {
       return true;
     }
     ++idx;
   }

   // not special
   return false;
 }
 // mozInlineSpellWordUtil::SplitDOMWord
 //
 //    Splits the DOM word occupying [aStart, aEnd) of mSoftText into real
 //    words and appends them to mRealWords. A DOM word recognized as special
 //    (see WordSplitState::IsSpecialWord) is appended as a single word that
 //    is not checkable.
 void
 mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd)
 {
   WordSplitState splitter(this, mSoftText, aStart, aEnd - aStart);
   splitter.mCurCharClass = splitter.ClassifyCharacter(0, true);

   splitter.AdvanceThroughSeparators();
   const bool hasContent = splitter.mCurCharClass != CHAR_CLASS_END_OF_INPUT;
   if (hasContent && splitter.IsSpecialWord()) {
     // Everything after the leading separators becomes one unchecked word.
     const int32_t remaining =
       splitter.mDOMWordText.Length() - splitter.mDOMWordOffset;
     mRealWords.AppendElement(
         RealWord(aStart + splitter.mDOMWordOffset, remaining, false));
     return;
   }

   for (;;) {
     splitter.AdvanceThroughSeparators();
     if (splitter.mCurCharClass == CHAR_CLASS_END_OF_INPUT) {
       break;
     }

     // remember where the word begins, then run to its end
     const int32_t begin = splitter.mDOMWordOffset;
     splitter.AdvanceThroughWord();
     const int32_t length = splitter.mDOMWordOffset - begin;

     const bool checkable = !splitter.ShouldSkipWord(begin, length);
     mRealWords.AppendElement(RealWord(aStart + begin, length, checkable));
   }
 }

The Tor Browser / annotate

extensions/spellcheck/src/mozInlineSpellWordUtil.cpp@6474c204b198 (annotated)

extensions/spellcheck/src/mozInlineSpellWordUtil.cpp