extensions/spellcheck/src/mozInlineSpellWordUtil.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "mozInlineSpellWordUtil.h"
     7 #include "nsDebug.h"
     8 #include "nsIAtom.h"
     9 #include "nsComponentManagerUtils.h"
    10 #include "nsIDOMCSSStyleDeclaration.h"
    11 #include "nsIDOMElement.h"
    12 #include "nsIDOMRange.h"
    13 #include "nsIEditor.h"
    14 #include "nsIDOMNode.h"
    15 #include "nsIDOMHTMLBRElement.h"
    16 #include "nsUnicharUtilCIID.h"
    17 #include "nsUnicodeProperties.h"
    18 #include "nsServiceManagerUtils.h"
    19 #include "nsIContent.h"
    20 #include "nsTextFragment.h"
    21 #include "mozilla/dom/Element.h"
    22 #include "nsRange.h"
    23 #include "nsContentUtils.h"
    24 #include "nsIFrame.h"
    25 #include <algorithm>
    27 using namespace mozilla;
    29 // IsIgnorableCharacter
    30 //
    31 //    These characters are ones that we should ignore in input.
    33 inline bool IsIgnorableCharacter(char16_t ch)
    34 {
    35   return (ch == 0xAD ||   // SOFT HYPHEN
    36           ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
    37 }
    39 // IsConditionalPunctuation
    40 //
    41 //    Some characters (like apostrophes) require characters on each side to be
    42 //    part of a word, and are otherwise punctuation.
    44 inline bool IsConditionalPunctuation(char16_t ch)
    45 {
    46   return (ch == '\'' ||
    47           ch == 0x2019 || // RIGHT SINGLE QUOTATION MARK
    48           ch == 0x00B7); // MIDDLE DOT
    49 }
    51 // mozInlineSpellWordUtil::Init
    53 nsresult
    54 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
    55 {
    56   nsresult rv;
    58   // getting the editor can fail commonly because the editor was detached, so
    59   // don't assert
    60   nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
    61   if (NS_FAILED(rv))
    62     return rv;
    64   nsCOMPtr<nsIDOMDocument> domDoc;
    65   rv = editor->GetDocument(getter_AddRefs(domDoc));
    66   NS_ENSURE_SUCCESS(rv, rv);
    67   NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER);
    69   mDOMDocument = domDoc;
    70   mDocument = do_QueryInterface(domDoc);
    72   // Find the root node for the editor. For contenteditable we'll need something
    73   // cleverer here.
    74   nsCOMPtr<nsIDOMElement> rootElt;
    75   rv = editor->GetRootElement(getter_AddRefs(rootElt));
    76   NS_ENSURE_SUCCESS(rv, rv);
    78   nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt);
    79   mRootNode = rootNode;
    80   NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
    81   return NS_OK;
    82 }
    84 static inline bool
    85 IsTextNode(nsINode* aNode)
    86 {
    87   return aNode->IsNodeOfType(nsINode::eTEXT);
    88 }
    90 typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
    92 // Find the next node in the DOM tree in preorder.
    93 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
    94 // why we can't just use GetNextNode here, sadly.
    95 static nsINode*
    96 FindNextNode(nsINode* aNode, nsINode* aRoot,
    97              OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure)
    98 {
    99   NS_PRECONDITION(aNode, "Null starting node?");
   101   nsINode* next = aNode->GetFirstChild();
   102   if (next)
   103     return next;
   105   // Don't look at siblings or otherwise outside of aRoot
   106   if (aNode == aRoot)
   107     return nullptr;
   109   next = aNode->GetNextSibling();
   110   if (next)
   111     return next;
   113   // Go up
   114   for (;;) {
   115     if (aOnLeaveNode) {
   116       aOnLeaveNode(aNode, aClosure);
   117     }
   119     next = aNode->GetParent();
   120     if (next == aRoot || ! next)
   121       return nullptr;
   122     aNode = next;
   124     next = aNode->GetNextSibling();
   125     if (next)
   126       return next;
   127   }
   128 }
   130 // aNode is not a text node. Find the first text node starting at aNode/aOffset
   131 // in a preorder DOM traversal.
   132 static nsINode*
   133 FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot)
   134 {
   135   NS_PRECONDITION(aNode, "Null starting node?");
   136   NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");
   138   nsINode* checkNode;
   139   // Need to start at the aOffset'th child
   140   nsIContent* child = aNode->GetChildAt(aOffset);
   142   if (child) {
   143     checkNode = child;
   144   } else {
   145     // aOffset was beyond the end of the child list. 
   146     // goto next node after the last descendant of aNode in
   147     // a preorder DOM traversal.
   148     checkNode = aNode->GetNextNonChildNode(aRoot);
   149   }
   151   while (checkNode && !IsTextNode(checkNode)) {
   152     checkNode = checkNode->GetNextNode(aRoot);
   153   }
   154   return checkNode;
   155 }
   157 // mozInlineSpellWordUtil::SetEnd
   158 //
   159 //    We have two ranges "hard" and "soft". The hard boundary is simply
   160 //    the scope of the root node. The soft boundary is that which is set
   161 //    by the caller of this class by calling this function. If this function is
   162 //    not called, the soft boundary is the same as the hard boundary.
   163 //
   164 //    When we reach the soft boundary (mSoftEnd), we keep
   165 //    going until we reach the end of a word. This allows the caller to set the
   166 //    end of the range to anything, and we will always check whole multiples of
   167 //    words. When we reach the hard boundary we stop no matter what.
   168 //
   169 //    There is no beginning soft boundary. This is because we only go to the
   170 //    previous node once, when finding the previous word boundary in
   171 //    SetPosition(). You might think of the soft boundary as being this initial
   172 //    position.
   174 nsresult
   175 mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset)
   176 {
   177   NS_PRECONDITION(aEndNode, "Null end node?");
   179   NS_ASSERTION(mRootNode, "Not initialized");
   181   InvalidateWords();
   183   if (!IsTextNode(aEndNode)) {
   184     // End at the start of the first text node after aEndNode/aEndOffset.
   185     aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
   186     aEndOffset = 0;
   187   }
   188   mSoftEnd = NodeOffset(aEndNode, aEndOffset);
   189   return NS_OK;
   190 }
   192 nsresult
   193 mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset)
   194 {
   195   InvalidateWords();
   197   if (!IsTextNode(aNode)) {
   198     // Start at the start of the first text node after aNode/aOffset.
   199     aNode = FindNextTextNode(aNode, aOffset, mRootNode);
   200     aOffset = 0;
   201   }
   202   mSoftBegin = NodeOffset(aNode, aOffset);
   204   EnsureWords();
   206   int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
   207   if (textOffset < 0)
   208     return NS_OK;
   209   mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
   210   return NS_OK;
   211 }
   213 void
   214 mozInlineSpellWordUtil::EnsureWords()
   215 {
   216   if (mSoftTextValid)
   217     return;
   218   BuildSoftText();
   219   BuildRealWords();
   220   mSoftTextValid = true;
   221 }
   223 nsresult
   224 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange)
   225 {
   226   NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
   227   NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
   228   return MakeRange(begin, end, aRange);
   229 }
   231 // mozInlineSpellWordUtil::GetRangeForWord
   233 nsresult
   234 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
   235                                         int32_t aWordOffset,
   236                                         nsRange** aRange)
   237 {
   238   // Set our soft end and start
   239   nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode);
   240   NodeOffset pt = NodeOffset(wordNode, aWordOffset);
   242   InvalidateWords();
   243   mSoftBegin = mSoftEnd = pt;
   244   EnsureWords();
   246   int32_t offset = MapDOMPositionToSoftTextOffset(pt);
   247   if (offset < 0)
   248     return MakeRange(pt, pt, aRange);
   249   int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
   250   if (wordIndex < 0)
   251     return MakeRange(pt, pt, aRange);
   252   return MakeRangeForWord(mRealWords[wordIndex], aRange);
   253 }
   255 // This is to fix characters that the spellchecker may not like
   256 static void
   257 NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput)
   258 {
   259   aOutput.Truncate();
   260   for (int32_t i = 0; i < aLen; i++) {
   261     char16_t ch = aInput.CharAt(i + aPos);
   263     // remove ignorable characters from the word
   264     if (IsIgnorableCharacter(ch))
   265       continue;
   267     // the spellchecker doesn't handle curly apostrophes in all languages
   268     if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
   269       ch = '\'';
   270     }
   272     aOutput.Append(ch);
   273   }
   274 }
   276 // mozInlineSpellWordUtil::GetNextWord
   277 //
   278 //    FIXME-optimization: we shouldn't have to generate a range every single
   279 //    time. It would be better if the inline spellchecker didn't require a
   280 //    range unless the word was misspelled. This may or may not be possible.
   282 nsresult
   283 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange,
   284                                     bool* aSkipChecking)
   285 {
   286 #ifdef DEBUG_SPELLCHECK
   287   printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
   288 #endif
   290   if (mNextWordIndex < 0 ||
   291       mNextWordIndex >= int32_t(mRealWords.Length())) {
   292     mNextWordIndex = -1;
   293     *aRange = nullptr;
   294     *aSkipChecking = true;
   295     return NS_OK;
   296   }
   298   const RealWord& word = mRealWords[mNextWordIndex];
   299   nsresult rv = MakeRangeForWord(word, aRange);
   300   NS_ENSURE_SUCCESS(rv, rv);
   301   ++mNextWordIndex;
   302   *aSkipChecking = !word.mCheckableWord;
   303   ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
   305 #ifdef DEBUG_SPELLCHECK
   306   printf("GetNextWord returning: %s (skip=%d)\n",
   307          NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
   308 #endif
   310   return NS_OK;
   311 }
   313 // mozInlineSpellWordUtil::MakeRange
   314 //
   315 //    Convenience function for creating a range over the current document.
   317 nsresult
   318 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
   319                                   nsRange** aRange)
   320 {
   321   NS_ENSURE_ARG_POINTER(aBegin.mNode);
   322   if (!mDOMDocument)
   323     return NS_ERROR_NOT_INITIALIZED;
   325   nsRefPtr<nsRange> range = new nsRange(aBegin.mNode);
   326   nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset,
   327                            aEnd.mNode, aEnd.mOffset);
   328   NS_ENSURE_SUCCESS(rv, rv);
   329   range.forget(aRange);
   331   return NS_OK;
   332 }
   334 /*********** DOM text extraction ************/
   336 // IsDOMWordSeparator
   337 //
   338 //    Determines if the given character should be considered as a DOM Word
   339 //    separator. Basically, this is whitespace, although it could also have
   340 //    certain punctuation that we know ALWAYS breaks words. This is important.
   341 //    For example, we can't have any punctuation that could appear in a URL
   342 //    or email address in this, because those need to always fit into a single
   343 //    DOM word.
   345 static bool
   346 IsDOMWordSeparator(char16_t ch)
   347 {
   348   // simple spaces
   349   if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')
   350     return true;
   352   // complex spaces - check only if char isn't ASCII (uncommon)
   353   if (ch >= 0xA0 &&
   354       (ch == 0x00A0 ||  // NO-BREAK SPACE
   355        ch == 0x2002 ||  // EN SPACE
   356        ch == 0x2003 ||  // EM SPACE
   357        ch == 0x2009 ||  // THIN SPACE
   358        ch == 0x3000))   // IDEOGRAPHIC SPACE
   359     return true;
   361   // otherwise not a space
   362   return false;
   363 }
   365 static inline bool
   366 IsBRElement(nsINode* aNode)
   367 {
   368   return aNode->IsElement() &&
   369          aNode->AsElement()->IsHTML(nsGkAtoms::br);
   370 }
   372 /**
   373  * Given a TextNode, checks to see if there's a DOM word separator before
   374  * aBeforeOffset within it. This function does not modify aSeparatorOffset when
   375  * it returns false.
   376  *
   377  * @param aNode the TextNode to check.
   378  * @param aBeforeOffset the offset in the TextNode before which we will search
   379  *        for the DOM separator. You can pass INT32_MAX to search the entire
   380  *        length of the string.
   381  * @param aSeparatorOffset will be set to the offset of the first separator it
   382  *        encounters. Will not be written to if no separator is found.
   383  * @returns True if it found a separator.
   384  */
   385 static bool
   386 TextNodeContainsDOMWordSeparator(nsINode* aNode,
   387                                  int32_t aBeforeOffset,
   388                                  int32_t* aSeparatorOffset)
   389 {
   390   // aNode is actually an nsIContent, since it's eTEXT
   391   nsIContent* content = static_cast<nsIContent*>(aNode);
   392   const nsTextFragment* textFragment = content->GetText();
   393   NS_ASSERTION(textFragment, "Where is our text?");
   394   for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) {
   395     if (IsDOMWordSeparator(textFragment->CharAt(i))) {
   396       // Be greedy, find as many separators as we can
   397       for (int32_t j = i - 1; j >= 0; --j) {
   398         if (IsDOMWordSeparator(textFragment->CharAt(j))) {
   399           i = j;
   400         } else {
   401           break;
   402         }
   403       }
   404       *aSeparatorOffset = i;
   405       return true;
   406     }
   407   }
   408   return false;
   409 }
   411 /**
   412  * Check if there's a DOM word separator before aBeforeOffset in this node.
   413  * Always returns true if it's a BR element.
   414  * aSeparatorOffset is set to the index of the first character in the last
   415  * separator if any is found (0 for BR elements).
   416  *
   417  * This function does not modify aSeparatorOffset when it returns false.
   418  */
   419 static bool
   420 ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
   421                          int32_t* aSeparatorOffset)
   422 {
   423   if (IsBRElement(aNode)) {
   424     *aSeparatorOffset = 0;
   425     return true;
   426   }
   428   if (!IsTextNode(aNode))
   429     return false;
   431   return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset,
   432                                           aSeparatorOffset);
   433 }
   435 static bool
   436 IsBreakElement(nsINode* aNode)
   437 {
   438   if (!aNode->IsElement()) {
   439     return false;
   440   }
   442   dom::Element *element = aNode->AsElement();
   444   if (element->IsHTML(nsGkAtoms::br))
   445     return true;
   447   // If we don't have a frame, we don't consider ourselves a break
   448   // element.  In particular, words can span us.
   449   if (!element->GetPrimaryFrame())
   450     return false;
   452   // Anything that's not an inline element is a break element.
   453   // XXXbz should replaced inlines be break elements, though?
   454   return element->GetPrimaryFrame()->StyleDisplay()->mDisplay !=
   455     NS_STYLE_DISPLAY_INLINE;
   456 }
   458 struct CheckLeavingBreakElementClosure {
   459   bool          mLeftBreakElement;
   460 };
   462 static void
   463 CheckLeavingBreakElement(nsINode* aNode, void* aClosure)
   464 {
   465   CheckLeavingBreakElementClosure* cl =
   466     static_cast<CheckLeavingBreakElementClosure*>(aClosure);
   467   if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
   468     cl->mLeftBreakElement = true;
   469   }
   470 }
   472 void
   473 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
   474 {
   475   nsAutoString result;
   476   ::NormalizeWord(aWord, 0, aWord.Length(), result);
   477   aWord = result;
   478 }
   480 void
   481 mozInlineSpellWordUtil::BuildSoftText()
   482 {
   483   // First we have to work backwards from mSoftStart to find a text node
   484   // containing a DOM word separator, a non-inline-element
   485   // boundary, or the hard start node. That's where we'll start building the
   486   // soft string from.
   487   nsINode* node = mSoftBegin.mNode;
   488   int32_t firstOffsetInNode = 0;
   489   int32_t checkBeforeOffset = mSoftBegin.mOffset;
   490   while (node) {
   491     if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
   492       if (node == mSoftBegin.mNode) {
   493         // If we find a word separator on the first node, look at the preceding
   494         // word on the text node as well.
   495         int32_t newOffset = 0;
   496         if (firstOffsetInNode > 0) {
   497           // Try to find the previous word boundary in the current node. If
   498           // we can't find one, start checking previous sibling nodes (if any
   499           // adjacent ones exist) to see if we can find any text nodes with
   500           // DOM word separators. We bail out as soon as we see a node that is
   501           // not a text node, or we run out of previous sibling nodes. In the
   502           // event that we simply cannot find any preceding word separator, the
   503           // offset is set to 0, and the soft text beginning node is set to the
   504           // "most previous" text node before the original starting node, or
   505           // kept at the original starting node if no previous text nodes exist.
   506           if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
   507                                         &newOffset)) {
   508             nsINode* prevNode = node->GetPreviousSibling();
   509             while (prevNode && IsTextNode(prevNode)) {
   510               mSoftBegin.mNode = prevNode;
   511               if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,
   512                                                    &newOffset)) {
   513                 break;
   514               }
   515               prevNode = prevNode->GetPreviousSibling();
   516             }
   517           }
   518         }
   519         firstOffsetInNode = newOffset;
   520         mSoftBegin.mOffset = newOffset;
   521       }
   522       break;
   523     }
   524     checkBeforeOffset = INT32_MAX;
   525     if (IsBreakElement(node)) {
   526       // Since GetPreviousContent follows tree *preorder*, we're about to traverse
   527       // up out of 'node'. Since node induces breaks (e.g., it's a block),
   528       // don't bother trying to look outside it, just stop now.
   529       break;
   530     }
   531     // GetPreviousContent below expects mRootNode to be an ancestor of node.
   532     if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {
   533       break;
   534     }
   535     node = node->GetPreviousContent(mRootNode);
   536   }
   538   // Now build up the string moving forward through the DOM until we reach
   539   // the soft end and *then* see a DOM word separator, a non-inline-element
   540   // boundary, or the hard end node.
   541   mSoftText.Truncate();
   542   mSoftTextDOMMapping.Clear();
   543   bool seenSoftEnd = false;
   544   // Leave this outside the loop so large heap string allocations can be reused
   545   // across iterations
   546   while (node) {
   547     if (node == mSoftEnd.mNode) {
   548       seenSoftEnd = true;
   549     }
   551     bool exit = false;
   552     if (IsTextNode(node)) {
   553       nsIContent* content = static_cast<nsIContent*>(node);
   554       NS_ASSERTION(content, "Where is our content?");
   555       const nsTextFragment* textFragment = content->GetText();
   556       NS_ASSERTION(textFragment, "Where is our text?");
   557       int32_t lastOffsetInNode = textFragment->GetLength();
   559       if (seenSoftEnd) {
   560         // check whether we can stop after this
   561         for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
   562              i < int32_t(textFragment->GetLength()); ++i) {
   563           if (IsDOMWordSeparator(textFragment->CharAt(i))) {
   564             exit = true;
   565             // stop at the first separator after the soft end point
   566             lastOffsetInNode = i;
   567             break;
   568           }
   569         }
   570       }
   572       if (firstOffsetInNode < lastOffsetInNode) {
   573         int32_t len = lastOffsetInNode - firstOffsetInNode;
   574         mSoftTextDOMMapping.AppendElement(
   575           DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
   577         bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,
   578                                          mozilla::fallible_t());
   579         if (!ok) {
   580             // probably out of memory, remove from mSoftTextDOMMapping
   581             mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1);
   582             exit = true;
   583         }
   584       }
   586       firstOffsetInNode = 0;
   587     }
   589     if (exit)
   590       break;
   592     CheckLeavingBreakElementClosure closure = { false };
   593     node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
   594     if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
   595       // We left, or are entering, a break element (e.g., block). Maybe we can
   596       // stop now.
   597       if (seenSoftEnd)
   598         break;
   599       // Record the break
   600       mSoftText.Append(' ');
   601     }
   602   }
   604 #ifdef DEBUG_SPELLCHECK
   605   printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
   606 #endif
   607 }
   609 void
   610 mozInlineSpellWordUtil::BuildRealWords()
   611 {
   612   // This is pretty simple. We just have to walk mSoftText, tokenizing it
   613   // into "real words".
   614   // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
   615   // SplitDOMWord on each of those DOM words
   616   int32_t wordStart = -1;
   617   mRealWords.Clear();
   618   for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) {
   619     if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
   620       if (wordStart >= 0) {
   621         SplitDOMWord(wordStart, i);
   622         wordStart = -1;
   623       }
   624     } else {
   625       if (wordStart < 0) {
   626         wordStart = i;
   627       }
   628     }
   629   }
   630   if (wordStart >= 0) {
   631     SplitDOMWord(wordStart, mSoftText.Length());
   632   }
   633 }
   635 /*********** DOM/realwords<->mSoftText mapping functions ************/
   637 int32_t
   638 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
   639 {
   640   if (!mSoftTextValid) {
   641     NS_ERROR("Soft text must be valid if we're to map into it");
   642     return -1;
   643   }
   645   for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) {
   646     const DOMTextMapping& map = mSoftTextDOMMapping[i];
   647     if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
   648       // Allow offsets at either end of the string, in particular, allow the
   649       // offset that's at the end of the contributed string
   650       int32_t offsetInContributedString =
   651         aNodeOffset.mOffset - map.mNodeOffset.mOffset;
   652       if (offsetInContributedString >= 0 &&
   653           offsetInContributedString <= map.mLength)
   654         return map.mSoftTextOffset + offsetInContributedString;
   655       return -1;
   656     }
   657   }
   658   return -1;
   659 }
   661 mozInlineSpellWordUtil::NodeOffset
   662 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,
   663                                                        DOMMapHint aHint)
   664 {
   665   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
   666   if (!mSoftTextValid)
   667     return NodeOffset(nullptr, -1);
   669   // The invariant is that the range start..end includes the last mapping,
   670   // if any, such that mSoftTextOffset <= aSoftTextOffset
   671   int32_t start = 0;
   672   int32_t end = mSoftTextDOMMapping.Length();
   673   while (end - start >= 2) {
   674     int32_t mid = (start + end)/2;
   675     const DOMTextMapping& map = mSoftTextDOMMapping[mid];
   676     if (map.mSoftTextOffset > aSoftTextOffset) {
   677       end = mid;
   678     } else {
   679       start = mid;
   680     }
   681   }
   683   if (start >= end)
   684     return NodeOffset(nullptr, -1);
   686   // 'start' is now the last mapping, if any, such that
   687   // mSoftTextOffset <= aSoftTextOffset.
   688   // If we're doing HINT_END, then we may want to return the end of the
   689   // the previous mapping instead of the start of this mapping
   690   if (aHint == HINT_END && start > 0) {
   691     const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];
   692     if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
   693       return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
   694   }
   696   // We allow ourselves to return the end of this mapping even if we're
   697   // doing HINT_START. This will only happen if there is no mapping which this
   698   // point is the start of. I'm not 100% sure this is OK...
   699   const DOMTextMapping& map = mSoftTextDOMMapping[start];
   700   int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
   701   if (offset >= 0 && offset <= map.mLength)
   702     return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
   704   return NodeOffset(nullptr, -1);
   705 }
   707 int32_t
   708 mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset,
   709     DOMMapHint aHint, bool aSearchForward)
   710 {
   711   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
   712   if (!mSoftTextValid)
   713     return -1;
   715   // The invariant is that the range start..end includes the last word,
   716   // if any, such that mSoftTextOffset <= aSoftTextOffset
   717   int32_t start = 0;
   718   int32_t end = mRealWords.Length();
   719   while (end - start >= 2) {
   720     int32_t mid = (start + end)/2;
   721     const RealWord& word = mRealWords[mid];
   722     if (word.mSoftTextOffset > aSoftTextOffset) {
   723       end = mid;
   724     } else {
   725       start = mid;
   726     }
   727   }
   729   if (start >= end)
   730     return -1;
   732   // 'start' is now the last word, if any, such that
   733   // mSoftTextOffset <= aSoftTextOffset.
   734   // If we're doing HINT_END, then we may want to return the end of the
   735   // the previous word instead of the start of this word
   736   if (aHint == HINT_END && start > 0) {
   737     const RealWord& word = mRealWords[start - 1];
   738     if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
   739       return start - 1;
   740   }
   742   // We allow ourselves to return the end of this word even if we're
   743   // doing HINT_START. This will only happen if there is no word which this
   744   // point is the start of. I'm not 100% sure this is OK...
   745   const RealWord& word = mRealWords[start];
   746   int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
   747   if (offset >= 0 && offset <= word.mLength)
   748     return start;
   750   if (aSearchForward) {
   751     if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
   752       // All words have mSoftTextOffset > aSoftTextOffset
   753       return 0;
   754     }
   755     // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
   756     // Word start+1, if it exists, will be the first with
   757     // mSoftTextOffset > aSoftTextOffset.
   758     if (start + 1 < int32_t(mRealWords.Length()))
   759       return start + 1;
   760   }
   762   return -1;
   763 }
   765 /*********** Word Splitting ************/
   767 // classifies a given character in the DOM word
   768 enum CharClass {
   769   CHAR_CLASS_WORD,
   770   CHAR_CLASS_SEPARATOR,
   771   CHAR_CLASS_END_OF_INPUT };
   773 // Encapsulates DOM-word to real-word splitting
   774 struct MOZ_STACK_CLASS WordSplitState
   775 {
   776   mozInlineSpellWordUtil*    mWordUtil;
   777   const nsDependentSubstring mDOMWordText;
   778   int32_t                    mDOMWordOffset;
   779   CharClass                  mCurCharClass;
   781   WordSplitState(mozInlineSpellWordUtil* aWordUtil,
   782                  const nsString& aString, int32_t aStart, int32_t aLen)
   783     : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
   784       mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
   786   CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
   787   void Advance();
   788   void AdvanceThroughSeparators();
   789   void AdvanceThroughWord();
   791   // Finds special words like email addresses and URLs that may start at the
   792   // current position, and returns their length, or 0 if not found. This allows
   793   // arbitrary word breaking rules to be used for these special entities, as
   794   // long as they can not contain whitespace.
   795   bool IsSpecialWord();
   797   // Similar to IsSpecialWord except that this takes a split word as
   798   // input. This checks for things that do not require special word-breaking
   799   // rules.
   800   bool ShouldSkipWord(int32_t aStart, int32_t aLength);
   801 };
   803 // WordSplitState::ClassifyCharacter
   805 CharClass
   806 WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
   807 {
   808   NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
   809                "Index out of range");
   810   if (aIndex == int32_t(mDOMWordText.Length()))
   811     return CHAR_CLASS_SEPARATOR;
   813   // this will classify the character, we want to treat "ignorable" characters
   814   // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
   815   nsIUGenCategory::nsUGenCategory
   816     charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
   817   if (charCategory == nsIUGenCategory::kLetter ||
   818       IsIgnorableCharacter(mDOMWordText[aIndex]) ||
   819       mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
   820       mDOMWordText[aIndex] == 0x200D /* ZWJ */)
   821     return CHAR_CLASS_WORD;
   823   // If conditional punctuation is surrounded immediately on both sides by word
   824   // characters it also counts as a word character.
   825   if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
   826     if (!aRecurse) {
   827       // not allowed to look around, this punctuation counts like a separator
   828       return CHAR_CLASS_SEPARATOR;
   829     }
   831     // check the left-hand character
   832     if (aIndex == 0)
   833       return CHAR_CLASS_SEPARATOR;
   834     if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
   835       return CHAR_CLASS_SEPARATOR;
   836     // If the previous charatcer is a word-char, make sure that it's not a
   837     // special dot character.
   838     if (mDOMWordText[aIndex - 1] == '.')
   839       return CHAR_CLASS_SEPARATOR;
   841     // now we know left char is a word-char, check the right-hand character
   842     if (aIndex == int32_t(mDOMWordText.Length()) - 1)
   843       return CHAR_CLASS_SEPARATOR;
   844     if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
   845       return CHAR_CLASS_SEPARATOR;
   846     // If the next charatcer is a word-char, make sure that it's not a
   847     // special dot character.
   848     if (mDOMWordText[aIndex + 1] == '.')
   849       return CHAR_CLASS_SEPARATOR;
   851     // char on either side is a word, this counts as a word
   852     return CHAR_CLASS_WORD;
   853   }
   855   // The dot character, if appearing at the end of a word, should
   856   // be considered part of that word.  Example: "etc.", or
   857   // abbreviations
   858   if (aIndex > 0 &&
   859       mDOMWordText[aIndex] == '.' &&
   860       mDOMWordText[aIndex - 1] != '.' &&
   861       ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
   862     return CHAR_CLASS_WORD;
   863   }
   865   // all other punctuation
   866   if (charCategory == nsIUGenCategory::kSeparator ||
   867       charCategory == nsIUGenCategory::kOther ||
   868       charCategory == nsIUGenCategory::kPunctuation ||
   869       charCategory == nsIUGenCategory::kSymbol) {
   870     // Don't break on hyphens, as hunspell handles them on its own.
   871     if (aIndex > 0 &&
   872         mDOMWordText[aIndex] == '-' &&
   873         mDOMWordText[aIndex - 1] != '-' &&
   874         ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
   875       // A hyphen is only meaningful as a separator inside a word
   876       // if the previous and next characters are a word character.
   877       if (aIndex == int32_t(mDOMWordText.Length()) - 1)
   878         return CHAR_CLASS_SEPARATOR;
   879       if (mDOMWordText[aIndex + 1] != '.' &&
   880           ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
   881         return CHAR_CLASS_WORD;
   882     }
   883     return CHAR_CLASS_SEPARATOR;
   884   }
   886   // any other character counts as a word
   887   return CHAR_CLASS_WORD;
   888 }
   891 // WordSplitState::Advance
   893 void
   894 WordSplitState::Advance()
   895 {
   896   NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
   897   NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
   898                "Length beyond end");
   900   mDOMWordOffset ++;
   901   if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
   902     mCurCharClass = CHAR_CLASS_END_OF_INPUT;
   903   else
   904     mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
   905 }
   908 // WordSplitState::AdvanceThroughSeparators
   910 void
   911 WordSplitState::AdvanceThroughSeparators()
   912 {
   913   while (mCurCharClass == CHAR_CLASS_SEPARATOR)
   914     Advance();
   915 }
   917 // WordSplitState::AdvanceThroughWord
   919 void
   920 WordSplitState::AdvanceThroughWord()
   921 {
   922   while (mCurCharClass == CHAR_CLASS_WORD)
   923     Advance();
   924 }
   927 // WordSplitState::IsSpecialWord
   929 bool
   930 WordSplitState::IsSpecialWord()
   931 {
   932   // Search for email addresses. We simply define these as any sequence of
   933   // characters with an '@' character in the middle. The DOM word is already
   934   // split on whitepace, so we know that everything to the end is the address
   935   int32_t firstColon = -1;
   936   for (int32_t i = mDOMWordOffset;
   937        i < int32_t(mDOMWordText.Length()); i ++) {
   938     if (mDOMWordText[i] == '@') {
   939       // only accept this if there are unambiguous word characters (don't bother
   940       // recursing to disambiguate apostrophes) on each side. This prevents
   941       // classifying, e.g. "@home" as an email address
   943       // Use this condition to only accept words with '@' in the middle of
   944       // them. It works, but the inlinespellcker doesn't like this. The problem
   945       // is that you type "fhsgfh@" that's a misspelled word followed by a
   946       // symbol, but when you type another letter "fhsgfh@g" that first word
   947       // need to be unmarked misspelled. It doesn't do this. it only checks the
   948       // current position for potentially removing a spelling range.
   949       if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
   950           i < (int32_t)mDOMWordText.Length() - 1 &&
   951           ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
   952         return true;
   953       }
   954     } else if (mDOMWordText[i] == ':' && firstColon < 0) {
   955       firstColon = i;
   957       // If the first colon is followed by a slash, consider it a URL
   958       // This will catch things like asdf://foo.com
   959       if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
   960           mDOMWordText[firstColon + 1] == '/') {
   961         return true;
   962       }
   963     }
   964   }
   966   // Check the text before the first colon against some known protocols. It
   967   // is impossible to check against all protocols, especially since you can
   968   // plug in new protocols. We also don't want to waste time here checking
   969   // against a lot of obscure protocols.
   970   if (firstColon > mDOMWordOffset) {
   971     nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
   972                       firstColon - mDOMWordOffset));
   973     if (protocol.EqualsIgnoreCase("http") ||
   974         protocol.EqualsIgnoreCase("https") ||
   975         protocol.EqualsIgnoreCase("news") ||
   976         protocol.EqualsIgnoreCase("file") ||
   977         protocol.EqualsIgnoreCase("javascript") ||
   978         protocol.EqualsIgnoreCase("data") ||
   979         protocol.EqualsIgnoreCase("ftp")) {
   980       return true;
   981     }
   982   }
   984   // not anything special
   985   return false;
   986 }
   988 // WordSplitState::ShouldSkipWord
   990 bool
   991 WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)
   992 {
   993   int32_t last = aStart + aLength;
   995   // check to see if the word contains a digit
   996   for (int32_t i = aStart; i < last; i ++) {
   997     if (unicode::GetGenCategory(mDOMWordText[i]) == nsIUGenCategory::kNumber) {
   998       return true;
   999     }
  1002   // not special
  1003   return false;
  1006 // mozInlineSpellWordUtil::SplitDOMWord
  1008 void
  1009 mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd)
  1011   WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
  1012   state.mCurCharClass = state.ClassifyCharacter(0, true);
  1014   state.AdvanceThroughSeparators();
  1015   if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT &&
  1016       state.IsSpecialWord()) {
  1017     int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset;
  1018     mRealWords.AppendElement(
  1019         RealWord(aStart + state.mDOMWordOffset, specialWordLength, false));
  1021     return;
  1024   while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
  1025     state.AdvanceThroughSeparators();
  1026     if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)
  1027       break;
  1029     // save the beginning of the word
  1030     int32_t wordOffset = state.mDOMWordOffset;
  1032     // find the end of the word
  1033     state.AdvanceThroughWord();
  1034     int32_t wordLen = state.mDOMWordOffset - wordOffset;
  1035     mRealWords.AppendElement(
  1036       RealWord(aStart + wordOffset, wordLen,
  1037                !state.ShouldSkipWord(wordOffset, wordLen)));

mercurial