The Tor Browser: extensions/spellcheck/src/mozInlineSpellWordUtil.cpp@6474c204b198

Cloned from upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

     2 /* This Source Code Form is subject to the terms of the Mozilla Public

     3  * License, v. 2.0. If a copy of the MPL was not distributed with this

     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     6 #include "mozInlineSpellWordUtil.h"

     7 #include "nsDebug.h"

     8 #include "nsIAtom.h"

     9 #include "nsComponentManagerUtils.h"

    10 #include "nsIDOMCSSStyleDeclaration.h"

    11 #include "nsIDOMElement.h"

    12 #include "nsIDOMRange.h"

    13 #include "nsIEditor.h"

    14 #include "nsIDOMNode.h"

    15 #include "nsIDOMHTMLBRElement.h"

    16 #include "nsUnicharUtilCIID.h"

    17 #include "nsUnicodeProperties.h"

    18 #include "nsServiceManagerUtils.h"

    19 #include "nsIContent.h"

    20 #include "nsTextFragment.h"

    21 #include "mozilla/dom/Element.h"

    22 #include "nsRange.h"

    23 #include "nsContentUtils.h"

    24 #include "nsIFrame.h"

    25 #include <algorithm>

    27 using namespace mozilla;

    29 // IsIgnorableCharacter

    30 //

    31 //    These characters are ones that we should ignore in input.

    33 inline bool IsIgnorableCharacter(char16_t ch)

    34 {

    35   return (ch == 0xAD ||   // SOFT HYPHEN

    36           ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN

    37 }

    39 // IsConditionalPunctuation

    40 //

    41 //    Some characters (like apostrophes) require characters on each side to be

    42 //    part of a word, and are otherwise punctuation.

    44 inline bool IsConditionalPunctuation(char16_t ch)

    45 {

    46   return (ch == '\'' ||

    47           ch == 0x2019 || // RIGHT SINGLE QUOTATION MARK

    48           ch == 0x00B7); // MIDDLE DOT

    49 }

    51 // mozInlineSpellWordUtil::Init

    53 nsresult

    54 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)

    55 {

    56   nsresult rv;

    58   // getting the editor can fail commonly because the editor was detached, so

    59   // don't assert

    60   nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);

    61   if (NS_FAILED(rv))

    62     return rv;

    64   nsCOMPtr<nsIDOMDocument> domDoc;

    65   rv = editor->GetDocument(getter_AddRefs(domDoc));

    66   NS_ENSURE_SUCCESS(rv, rv);

    67   NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER);

    69   mDOMDocument = domDoc;

    70   mDocument = do_QueryInterface(domDoc);

    72   // Find the root node for the editor. For contenteditable we'll need something

    73   // cleverer here.

    74   nsCOMPtr<nsIDOMElement> rootElt;

    75   rv = editor->GetRootElement(getter_AddRefs(rootElt));

    76   NS_ENSURE_SUCCESS(rv, rv);

    78   nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt);

    79   mRootNode = rootNode;

    80   NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");

    81   return NS_OK;

    82 }

    84 static inline bool

    85 IsTextNode(nsINode* aNode)

    86 {

    87   return aNode->IsNodeOfType(nsINode::eTEXT);

    88 }

    90 typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);

    92 // Find the next node in the DOM tree in preorder.

    93 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is

    94 // why we can't just use GetNextNode here, sadly.

    95 static nsINode*

    96 FindNextNode(nsINode* aNode, nsINode* aRoot,

    97              OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure)

    98 {

    99   NS_PRECONDITION(aNode, "Null starting node?");

   101   nsINode* next = aNode->GetFirstChild();

   102   if (next)

   103     return next;

   105   // Don't look at siblings or otherwise outside of aRoot

   106   if (aNode == aRoot)

   107     return nullptr;

   109   next = aNode->GetNextSibling();

   110   if (next)

   111     return next;

   113   // Go up

   114   for (;;) {

   115     if (aOnLeaveNode) {

   116       aOnLeaveNode(aNode, aClosure);

   117     }

   119     next = aNode->GetParent();

   120     if (next == aRoot || ! next)

   121       return nullptr;

   122     aNode = next;

   124     next = aNode->GetNextSibling();

   125     if (next)

   126       return next;

   127   }

   128 }

   130 // aNode is not a text node. Find the first text node starting at aNode/aOffset

   131 // in a preorder DOM traversal.

   132 static nsINode*

   133 FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot)

   134 {

   135   NS_PRECONDITION(aNode, "Null starting node?");

   136   NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");

   138   nsINode* checkNode;

   139   // Need to start at the aOffset'th child

   140   nsIContent* child = aNode->GetChildAt(aOffset);

   142   if (child) {

   143     checkNode = child;

   144   } else {

   145     // aOffset was beyond the end of the child list.

   146     // goto next node after the last descendant of aNode in

   147     // a preorder DOM traversal.

   148     checkNode = aNode->GetNextNonChildNode(aRoot);

   149   }

   151   while (checkNode && !IsTextNode(checkNode)) {

   152     checkNode = checkNode->GetNextNode(aRoot);

   153   }

   154   return checkNode;

   155 }

   157 // mozInlineSpellWordUtil::SetEnd

   158 //

   159 //    We have two ranges "hard" and "soft". The hard boundary is simply

   160 //    the scope of the root node. The soft boundary is that which is set

   161 //    by the caller of this class by calling this function. If this function is

   162 //    not called, the soft boundary is the same as the hard boundary.

   163 //

   164 //    When we reach the soft boundary (mSoftEnd), we keep

   165 //    going until we reach the end of a word. This allows the caller to set the

   166 //    end of the range to anything, and we will always check whole multiples of

   167 //    words. When we reach the hard boundary we stop no matter what.

   168 //

   169 //    There is no beginning soft boundary. This is because we only go to the

   170 //    previous node once, when finding the previous word boundary in

   171 //    SetPosition(). You might think of the soft boundary as being this initial

   172 //    position.

   174 nsresult

   175 mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset)

   176 {

   177   NS_PRECONDITION(aEndNode, "Null end node?");

   179   NS_ASSERTION(mRootNode, "Not initialized");

   181   InvalidateWords();

   183   if (!IsTextNode(aEndNode)) {

   184     // End at the start of the first text node after aEndNode/aEndOffset.

   185     aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);

   186     aEndOffset = 0;

   187   }

   188   mSoftEnd = NodeOffset(aEndNode, aEndOffset);

   189   return NS_OK;

   190 }

   192 nsresult

   193 mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset)

   194 {

   195   InvalidateWords();

   197   if (!IsTextNode(aNode)) {

   198     // Start at the start of the first text node after aNode/aOffset.

   199     aNode = FindNextTextNode(aNode, aOffset, mRootNode);

   200     aOffset = 0;

   201   }

   202   mSoftBegin = NodeOffset(aNode, aOffset);

   204   EnsureWords();

   206   int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);

   207   if (textOffset < 0)

   208     return NS_OK;

   209   mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);

   210   return NS_OK;

   211 }

   213 void

   214 mozInlineSpellWordUtil::EnsureWords()

   215 {

   216   if (mSoftTextValid)

   217     return;

   218   BuildSoftText();

   219   BuildRealWords();

   220   mSoftTextValid = true;

   221 }

   223 nsresult

   224 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange)

   225 {

   226   NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);

   227   NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);

   228   return MakeRange(begin, end, aRange);

   229 }

   231 // mozInlineSpellWordUtil::GetRangeForWord

   233 nsresult

   234 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,

   235                                         int32_t aWordOffset,

   236                                         nsRange** aRange)

   237 {

   238   // Set our soft end and start

   239   nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode);

   240   NodeOffset pt = NodeOffset(wordNode, aWordOffset);

   242   InvalidateWords();

   243   mSoftBegin = mSoftEnd = pt;

   244   EnsureWords();

   246   int32_t offset = MapDOMPositionToSoftTextOffset(pt);

   247   if (offset < 0)

   248     return MakeRange(pt, pt, aRange);

   249   int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);

   250   if (wordIndex < 0)

   251     return MakeRange(pt, pt, aRange);

   252   return MakeRangeForWord(mRealWords[wordIndex], aRange);

   253 }

   255 // This is to fix characters that the spellchecker may not like

   256 static void

   257 NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput)

   258 {

   259   aOutput.Truncate();

   260   for (int32_t i = 0; i < aLen; i++) {

   261     char16_t ch = aInput.CharAt(i + aPos);

   263     // remove ignorable characters from the word

   264     if (IsIgnorableCharacter(ch))

   265       continue;

   267     // the spellchecker doesn't handle curly apostrophes in all languages

   268     if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK

   269       ch = '\'';

   270     }

   272     aOutput.Append(ch);

   273   }

   274 }

   276 // mozInlineSpellWordUtil::GetNextWord

   277 //

   278 //    FIXME-optimization: we shouldn't have to generate a range every single

   279 //    time. It would be better if the inline spellchecker didn't require a

   280 //    range unless the word was misspelled. This may or may not be possible.

   282 nsresult

   283 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange,

   284                                     bool* aSkipChecking)

   285 {

   286 #ifdef DEBUG_SPELLCHECK

   287   printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);

   288 #endif

   290   if (mNextWordIndex < 0 ||

   291       mNextWordIndex >= int32_t(mRealWords.Length())) {

   292     mNextWordIndex = -1;

   293     *aRange = nullptr;

   294     *aSkipChecking = true;

   295     return NS_OK;

   296   }

   298   const RealWord& word = mRealWords[mNextWordIndex];

   299   nsresult rv = MakeRangeForWord(word, aRange);

   300   NS_ENSURE_SUCCESS(rv, rv);

   301   ++mNextWordIndex;

   302   *aSkipChecking = !word.mCheckableWord;

   303   ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);

   305 #ifdef DEBUG_SPELLCHECK

   306   printf("GetNextWord returning: %s (skip=%d)\n",

   307          NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);

   308 #endif

   310   return NS_OK;

   311 }

   313 // mozInlineSpellWordUtil::MakeRange

   314 //

   315 //    Convenience function for creating a range over the current document.

   317 nsresult

   318 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,

   319                                   nsRange** aRange)

   320 {

   321   NS_ENSURE_ARG_POINTER(aBegin.mNode);

   322   if (!mDOMDocument)

   323     return NS_ERROR_NOT_INITIALIZED;

   325   nsRefPtr<nsRange> range = new nsRange(aBegin.mNode);

   326   nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset,

   327                            aEnd.mNode, aEnd.mOffset);

   328   NS_ENSURE_SUCCESS(rv, rv);

   329   range.forget(aRange);

   331   return NS_OK;

   332 }

   334 /*********** DOM text extraction ************/

   336 // IsDOMWordSeparator

   337 //

   338 //    Determines if the given character should be considered as a DOM Word

   339 //    separator. Basically, this is whitespace, although it could also have

   340 //    certain punctuation that we know ALWAYS breaks words. This is important.

   341 //    For example, we can't have any punctuation that could appear in a URL

   342 //    or email address in this, because those need to always fit into a single

   343 //    DOM word.

   345 static bool

   346 IsDOMWordSeparator(char16_t ch)

   347 {

   348   // simple spaces

   349   if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')

   350     return true;

   352   // complex spaces - check only if char isn't ASCII (uncommon)

   353   if (ch >= 0xA0 &&

   354       (ch == 0x00A0 ||  // NO-BREAK SPACE

   355        ch == 0x2002 ||  // EN SPACE

   356        ch == 0x2003 ||  // EM SPACE

   357        ch == 0x2009 ||  // THIN SPACE

   358        ch == 0x3000))   // IDEOGRAPHIC SPACE

   359     return true;

   361   // otherwise not a space

   362   return false;

   363 }

   365 static inline bool

   366 IsBRElement(nsINode* aNode)

   367 {

   368   return aNode->IsElement() &&

   369          aNode->AsElement()->IsHTML(nsGkAtoms::br);

   370 }

   372 /**

   373  * Given a TextNode, checks to see if there's a DOM word separator before

   374  * aBeforeOffset within it. This function does not modify aSeparatorOffset when

   375  * it returns false.

   376  *

   377  * @param aNode the TextNode to check.

   378  * @param aBeforeOffset the offset in the TextNode before which we will search

   379  *        for the DOM separator. You can pass INT32_MAX to search the entire

   380  *        length of the string.

   381  * @param aSeparatorOffset will be set to the offset of the first separator it

   382  *        encounters. Will not be written to if no separator is found.

   383  * @returns True if it found a separator.

   384  */

   385 static bool

   386 TextNodeContainsDOMWordSeparator(nsINode* aNode,

   387                                  int32_t aBeforeOffset,

   388                                  int32_t* aSeparatorOffset)

   389 {

   390   // aNode is actually an nsIContent, since it's eTEXT

   391   nsIContent* content = static_cast<nsIContent*>(aNode);

   392   const nsTextFragment* textFragment = content->GetText();

   393   NS_ASSERTION(textFragment, "Where is our text?");

   394   for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) {

   395     if (IsDOMWordSeparator(textFragment->CharAt(i))) {

   396       // Be greedy, find as many separators as we can

   397       for (int32_t j = i - 1; j >= 0; --j) {

   398         if (IsDOMWordSeparator(textFragment->CharAt(j))) {

   399           i = j;

   400         } else {

   401           break;

   402         }

   403       }

   404       *aSeparatorOffset = i;

   405       return true;

   406     }

   407   }

   408   return false;

   409 }

   411 /**

   412  * Check if there's a DOM word separator before aBeforeOffset in this node.

   413  * Always returns true if it's a BR element.

   414  * aSeparatorOffset is set to the index of the first character in the last

   415  * separator if any is found (0 for BR elements).

   416  *

   417  * This function does not modify aSeparatorOffset when it returns false.

   418  */

   419 static bool

   420 ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,

   421                          int32_t* aSeparatorOffset)

   422 {

   423   if (IsBRElement(aNode)) {

   424     *aSeparatorOffset = 0;

   425     return true;

   426   }

   428   if (!IsTextNode(aNode))

   429     return false;

   431   return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset,

   432                                           aSeparatorOffset);

   433 }

   435 static bool

   436 IsBreakElement(nsINode* aNode)

   437 {

   438   if (!aNode->IsElement()) {

   439     return false;

   440   }

   442   dom::Element *element = aNode->AsElement();

   444   if (element->IsHTML(nsGkAtoms::br))

   445     return true;

   447   // If we don't have a frame, we don't consider ourselves a break

   448   // element.  In particular, words can span us.

   449   if (!element->GetPrimaryFrame())

   450     return false;

   452   // Anything that's not an inline element is a break element.

   453   // XXXbz should replaced inlines be break elements, though?

   454   return element->GetPrimaryFrame()->StyleDisplay()->mDisplay !=

   455     NS_STYLE_DISPLAY_INLINE;

   456 }

   458 struct CheckLeavingBreakElementClosure {

   459   bool          mLeftBreakElement;

   460 };

   462 static void

   463 CheckLeavingBreakElement(nsINode* aNode, void* aClosure)

   464 {

   465   CheckLeavingBreakElementClosure* cl =

   466     static_cast<CheckLeavingBreakElementClosure*>(aClosure);

   467   if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {

   468     cl->mLeftBreakElement = true;

   469   }

   470 }

   472 void

   473 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)

   474 {

   475   nsAutoString result;

   476   ::NormalizeWord(aWord, 0, aWord.Length(), result);

   477   aWord = result;

   478 }

   480 void

   481 mozInlineSpellWordUtil::BuildSoftText()

   482 {

   483   // First we have to work backwards from mSoftStart to find a text node

   484   // containing a DOM word separator, a non-inline-element

   485   // boundary, or the hard start node. That's where we'll start building the

   486   // soft string from.

   487   nsINode* node = mSoftBegin.mNode;

   488   int32_t firstOffsetInNode = 0;

   489   int32_t checkBeforeOffset = mSoftBegin.mOffset;

   490   while (node) {

   491     if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {

   492       if (node == mSoftBegin.mNode) {

   493         // If we find a word separator on the first node, look at the preceding

   494         // word on the text node as well.

   495         int32_t newOffset = 0;

   496         if (firstOffsetInNode > 0) {

   497           // Try to find the previous word boundary in the current node. If

   498           // we can't find one, start checking previous sibling nodes (if any

   499           // adjacent ones exist) to see if we can find any text nodes with

   500           // DOM word separators. We bail out as soon as we see a node that is

   501           // not a text node, or we run out of previous sibling nodes. In the

   502           // event that we simply cannot find any preceding word separator, the

   503           // offset is set to 0, and the soft text beginning node is set to the

   504           // "most previous" text node before the original starting node, or

   505           // kept at the original starting node if no previous text nodes exist.

   506           if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,

   507                                         &newOffset)) {

   508             nsINode* prevNode = node->GetPreviousSibling();

   509             while (prevNode && IsTextNode(prevNode)) {

   510               mSoftBegin.mNode = prevNode;

   511               if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,

   512                                                    &newOffset)) {

   513                 break;

   514               }

   515               prevNode = prevNode->GetPreviousSibling();

   516             }

   517           }

   518         }

   519         firstOffsetInNode = newOffset;

   520         mSoftBegin.mOffset = newOffset;

   521       }

   522       break;

   523     }

   524     checkBeforeOffset = INT32_MAX;

   525     if (IsBreakElement(node)) {

   526       // Since GetPreviousContent follows tree *preorder*, we're about to traverse

   527       // up out of 'node'. Since node induces breaks (e.g., it's a block),

   528       // don't bother trying to look outside it, just stop now.

   529       break;

   530     }

   531     // GetPreviousContent below expects mRootNode to be an ancestor of node.

   532     if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {

   533       break;

   534     }

   535     node = node->GetPreviousContent(mRootNode);

   536   }

   538   // Now build up the string moving forward through the DOM until we reach

   539   // the soft end and *then* see a DOM word separator, a non-inline-element

   540   // boundary, or the hard end node.

   541   mSoftText.Truncate();

   542   mSoftTextDOMMapping.Clear();

   543   bool seenSoftEnd = false;

   544   // Leave this outside the loop so large heap string allocations can be reused

   545   // across iterations

   546   while (node) {

   547     if (node == mSoftEnd.mNode) {

   548       seenSoftEnd = true;

   549     }

   551     bool exit = false;

   552     if (IsTextNode(node)) {

   553       nsIContent* content = static_cast<nsIContent*>(node);

   554       NS_ASSERTION(content, "Where is our content?");

   555       const nsTextFragment* textFragment = content->GetText();

   556       NS_ASSERTION(textFragment, "Where is our text?");

   557       int32_t lastOffsetInNode = textFragment->GetLength();

   559       if (seenSoftEnd) {

   560         // check whether we can stop after this

   561         for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;

   562              i < int32_t(textFragment->GetLength()); ++i) {

   563           if (IsDOMWordSeparator(textFragment->CharAt(i))) {

   564             exit = true;

   565             // stop at the first separator after the soft end point

   566             lastOffsetInNode = i;

   567             break;

   568           }

   569         }

   570       }

   572       if (firstOffsetInNode < lastOffsetInNode) {

   573         int32_t len = lastOffsetInNode - firstOffsetInNode;

   574         mSoftTextDOMMapping.AppendElement(

   575           DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));

   577         bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,

   578                                          mozilla::fallible_t());

   579         if (!ok) {

   580             // probably out of memory, remove from mSoftTextDOMMapping

   581             mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1);

   582             exit = true;

   583         }

   584       }

   586       firstOffsetInNode = 0;

   587     }

   589     if (exit)

   590       break;

   592     CheckLeavingBreakElementClosure closure = { false };

   593     node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);

   594     if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {

   595       // We left, or are entering, a break element (e.g., block). Maybe we can

   596       // stop now.

   597       if (seenSoftEnd)

   598         break;

   599       // Record the break

   600       mSoftText.Append(' ');

   601     }

   602   }

   604 #ifdef DEBUG_SPELLCHECK

   605   printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());

   606 #endif

   607 }

   609 void

   610 mozInlineSpellWordUtil::BuildRealWords()

   611 {

   612   // This is pretty simple. We just have to walk mSoftText, tokenizing it

   613   // into "real words".

   614   // We do an outer traversal of words delimited by IsDOMWordSeparator, calling

   615   // SplitDOMWord on each of those DOM words

   616   int32_t wordStart = -1;

   617   mRealWords.Clear();

   618   for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) {

   619     if (IsDOMWordSeparator(mSoftText.CharAt(i))) {

   620       if (wordStart >= 0) {

   621         SplitDOMWord(wordStart, i);

   622         wordStart = -1;

   623       }

   624     } else {

   625       if (wordStart < 0) {

   626         wordStart = i;

   627       }

   628     }

   629   }

   630   if (wordStart >= 0) {

   631     SplitDOMWord(wordStart, mSoftText.Length());

   632   }

   633 }

   635 /*********** DOM/realwords<->mSoftText mapping functions ************/

   637 int32_t

   638 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)

   639 {

   640   if (!mSoftTextValid) {

   641     NS_ERROR("Soft text must be valid if we're to map into it");

   642     return -1;

   643   }

   645   for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) {

   646     const DOMTextMapping& map = mSoftTextDOMMapping[i];

   647     if (map.mNodeOffset.mNode == aNodeOffset.mNode) {

   648       // Allow offsets at either end of the string, in particular, allow the

   649       // offset that's at the end of the contributed string

   650       int32_t offsetInContributedString =

   651         aNodeOffset.mOffset - map.mNodeOffset.mOffset;

   652       if (offsetInContributedString >= 0 &&

   653           offsetInContributedString <= map.mLength)

   654         return map.mSoftTextOffset + offsetInContributedString;

   655       return -1;

   656     }

   657   }

   658   return -1;

   659 }

   661 mozInlineSpellWordUtil::NodeOffset

   662 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,

   663                                                        DOMMapHint aHint)

   664 {

   665   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");

   666   if (!mSoftTextValid)

   667     return NodeOffset(nullptr, -1);

   669   // The invariant is that the range start..end includes the last mapping,

   670   // if any, such that mSoftTextOffset <= aSoftTextOffset

   671   int32_t start = 0;

   672   int32_t end = mSoftTextDOMMapping.Length();

   673   while (end - start >= 2) {

   674     int32_t mid = (start + end)/2;

   675     const DOMTextMapping& map = mSoftTextDOMMapping[mid];

   676     if (map.mSoftTextOffset > aSoftTextOffset) {

   677       end = mid;

   678     } else {

   679       start = mid;

   680     }

   681   }

   683   if (start >= end)

   684     return NodeOffset(nullptr, -1);

   686   // 'start' is now the last mapping, if any, such that

   687   // mSoftTextOffset <= aSoftTextOffset.

   688   // If we're doing HINT_END, then we may want to return the end of the

   689   // the previous mapping instead of the start of this mapping

   690   if (aHint == HINT_END && start > 0) {

   691     const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];

   692     if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)

   693       return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);

   694   }

   696   // We allow ourselves to return the end of this mapping even if we're

   697   // doing HINT_START. This will only happen if there is no mapping which this

   698   // point is the start of. I'm not 100% sure this is OK...

   699   const DOMTextMapping& map = mSoftTextDOMMapping[start];

   700   int32_t offset = aSoftTextOffset - map.mSoftTextOffset;

   701   if (offset >= 0 && offset <= map.mLength)

   702     return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);

   704   return NodeOffset(nullptr, -1);

   705 }

   707 int32_t

   708 mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset,

   709     DOMMapHint aHint, bool aSearchForward)

   710 {

   711   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");

   712   if (!mSoftTextValid)

   713     return -1;

   715   // The invariant is that the range start..end includes the last word,

   716   // if any, such that mSoftTextOffset <= aSoftTextOffset

   717   int32_t start = 0;

   718   int32_t end = mRealWords.Length();

   719   while (end - start >= 2) {

   720     int32_t mid = (start + end)/2;

   721     const RealWord& word = mRealWords[mid];

   722     if (word.mSoftTextOffset > aSoftTextOffset) {

   723       end = mid;

   724     } else {

   725       start = mid;

   726     }

   727   }

   729   if (start >= end)

   730     return -1;

   732   // 'start' is now the last word, if any, such that

   733   // mSoftTextOffset <= aSoftTextOffset.

   734   // If we're doing HINT_END, then we may want to return the end of the

   735   // the previous word instead of the start of this word

   736   if (aHint == HINT_END && start > 0) {

   737     const RealWord& word = mRealWords[start - 1];

   738     if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)

   739       return start - 1;

   740   }

   742   // We allow ourselves to return the end of this word even if we're

   743   // doing HINT_START. This will only happen if there is no word which this

   744   // point is the start of. I'm not 100% sure this is OK...

   745   const RealWord& word = mRealWords[start];

   746   int32_t offset = aSoftTextOffset - word.mSoftTextOffset;

   747   if (offset >= 0 && offset <= word.mLength)

   748     return start;

   750   if (aSearchForward) {

   751     if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {

   752       // All words have mSoftTextOffset > aSoftTextOffset

   753       return 0;

   754     }

   755     // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.

   756     // Word start+1, if it exists, will be the first with

   757     // mSoftTextOffset > aSoftTextOffset.

   758     if (start + 1 < int32_t(mRealWords.Length()))

   759       return start + 1;

   760   }

   762   return -1;

   763 }

   765 /*********** Word Splitting ************/

   767 // classifies a given character in the DOM word

   768 enum CharClass {

   769   CHAR_CLASS_WORD,

   770   CHAR_CLASS_SEPARATOR,

   771   CHAR_CLASS_END_OF_INPUT };

   773 // Encapsulates DOM-word to real-word splitting

   774 struct MOZ_STACK_CLASS WordSplitState

   775 {

   776   mozInlineSpellWordUtil*    mWordUtil;

   777   const nsDependentSubstring mDOMWordText;

   778   int32_t                    mDOMWordOffset;

   779   CharClass                  mCurCharClass;

   781   WordSplitState(mozInlineSpellWordUtil* aWordUtil,

   782                  const nsString& aString, int32_t aStart, int32_t aLen)

   783     : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),

   784       mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}

   786   CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;

   787   void Advance();

   788   void AdvanceThroughSeparators();

   789   void AdvanceThroughWord();

   791   // Finds special words like email addresses and URLs that may start at the

   792   // current position, and returns their length, or 0 if not found. This allows

   793   // arbitrary word breaking rules to be used for these special entities, as

   794   // long as they can not contain whitespace.

   795   bool IsSpecialWord();

   797   // Similar to IsSpecialWord except that this takes a split word as

   798   // input. This checks for things that do not require special word-breaking

   799   // rules.

   800   bool ShouldSkipWord(int32_t aStart, int32_t aLength);

   801 };

   803 // WordSplitState::ClassifyCharacter

   805 CharClass

   806 WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const

   807 {

   808   NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),

   809                "Index out of range");

   810   if (aIndex == int32_t(mDOMWordText.Length()))

   811     return CHAR_CLASS_SEPARATOR;

   813   // this will classify the character, we want to treat "ignorable" characters

   814   // such as soft hyphens, and also ZWJ and ZWNJ as word characters.

   815   nsIUGenCategory::nsUGenCategory

   816     charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);

   817   if (charCategory == nsIUGenCategory::kLetter ||

   818       IsIgnorableCharacter(mDOMWordText[aIndex]) ||

   819       mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||

   820       mDOMWordText[aIndex] == 0x200D /* ZWJ */)

   821     return CHAR_CLASS_WORD;

   823   // If conditional punctuation is surrounded immediately on both sides by word

   824   // characters it also counts as a word character.

   825   if (IsConditionalPunctuation(mDOMWordText[aIndex])) {

   826     if (!aRecurse) {

   827       // not allowed to look around, this punctuation counts like a separator

   828       return CHAR_CLASS_SEPARATOR;

   829     }

   831     // check the left-hand character

   832     if (aIndex == 0)

   833       return CHAR_CLASS_SEPARATOR;

   834     if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)

   835       return CHAR_CLASS_SEPARATOR;

   836     // If the previous charatcer is a word-char, make sure that it's not a

   837     // special dot character.

   838     if (mDOMWordText[aIndex - 1] == '.')

   839       return CHAR_CLASS_SEPARATOR;

   841     // now we know left char is a word-char, check the right-hand character

   842     if (aIndex == int32_t(mDOMWordText.Length()) - 1)

   843       return CHAR_CLASS_SEPARATOR;

   844     if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)

   845       return CHAR_CLASS_SEPARATOR;

   846     // If the next charatcer is a word-char, make sure that it's not a

   847     // special dot character.

   848     if (mDOMWordText[aIndex + 1] == '.')

   849       return CHAR_CLASS_SEPARATOR;

   851     // char on either side is a word, this counts as a word

   852     return CHAR_CLASS_WORD;

   853   }

   855   // The dot character, if appearing at the end of a word, should

   856   // be considered part of that word.  Example: "etc.", or

   857   // abbreviations

   858   if (aIndex > 0 &&

   859       mDOMWordText[aIndex] == '.' &&

   860       mDOMWordText[aIndex - 1] != '.' &&

   861       ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {

   862     return CHAR_CLASS_WORD;

   863   }

   865   // all other punctuation

   866   if (charCategory == nsIUGenCategory::kSeparator ||

   867       charCategory == nsIUGenCategory::kOther ||

   868       charCategory == nsIUGenCategory::kPunctuation ||

   869       charCategory == nsIUGenCategory::kSymbol) {

   870     // Don't break on hyphens, as hunspell handles them on its own.

   871     if (aIndex > 0 &&

   872         mDOMWordText[aIndex] == '-' &&

   873         mDOMWordText[aIndex - 1] != '-' &&

   874         ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {

   875       // A hyphen is only meaningful as a separator inside a word

   876       // if the previous and next characters are a word character.

   877       if (aIndex == int32_t(mDOMWordText.Length()) - 1)

   878         return CHAR_CLASS_SEPARATOR;

   879       if (mDOMWordText[aIndex + 1] != '.' &&

   880           ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)

   881         return CHAR_CLASS_WORD;

   882     }

   883     return CHAR_CLASS_SEPARATOR;

   884   }

   886   // any other character counts as a word

   887   return CHAR_CLASS_WORD;

   888 }

   891 // WordSplitState::Advance

   893 void

   894 WordSplitState::Advance()

   895 {

   896   NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");

   897   NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),

   898                "Length beyond end");

   900   mDOMWordOffset ++;

   901   if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())

   902     mCurCharClass = CHAR_CLASS_END_OF_INPUT;

   903   else

   904     mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);

   905 }

   908 // WordSplitState::AdvanceThroughSeparators

   910 void

   911 WordSplitState::AdvanceThroughSeparators()

   912 {

   913   while (mCurCharClass == CHAR_CLASS_SEPARATOR)

   914     Advance();

   915 }

   917 // WordSplitState::AdvanceThroughWord

   919 void

   920 WordSplitState::AdvanceThroughWord()

   921 {

   922   while (mCurCharClass == CHAR_CLASS_WORD)

   923     Advance();

   924 }

   927 // WordSplitState::IsSpecialWord

   929 bool

   930 WordSplitState::IsSpecialWord()

   931 {

   932   // Search for email addresses. We simply define these as any sequence of

   933   // characters with an '@' character in the middle. The DOM word is already

   934   // split on whitepace, so we know that everything to the end is the address

   935   int32_t firstColon = -1;

   936   for (int32_t i = mDOMWordOffset;

   937        i < int32_t(mDOMWordText.Length()); i ++) {

   938     if (mDOMWordText[i] == '@') {

   939       // only accept this if there are unambiguous word characters (don't bother

   940       // recursing to disambiguate apostrophes) on each side. This prevents

   941       // classifying, e.g. "@home" as an email address

   943       // Use this condition to only accept words with '@' in the middle of

   944       // them. It works, but the inlinespellcker doesn't like this. The problem

   945       // is that you type "fhsgfh@" that's a misspelled word followed by a

   946       // symbol, but when you type another letter "fhsgfh@g" that first word

   947       // need to be unmarked misspelled. It doesn't do this. it only checks the

   948       // current position for potentially removing a spelling range.

   949       if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&

   950           i < (int32_t)mDOMWordText.Length() - 1 &&

   951           ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {

   952         return true;

   953       }

   954     } else if (mDOMWordText[i] == ':' && firstColon < 0) {

   955       firstColon = i;

   957       // If the first colon is followed by a slash, consider it a URL

   958       // This will catch things like asdf://foo.com

   959       if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&

   960           mDOMWordText[firstColon + 1] == '/') {

   961         return true;

   962       }

   963     }

   964   }

   966   // Check the text before the first colon against some known protocols. It

   967   // is impossible to check against all protocols, especially since you can

   968   // plug in new protocols. We also don't want to waste time here checking

   969   // against a lot of obscure protocols.

   970   if (firstColon > mDOMWordOffset) {

   971     nsString protocol(Substring(mDOMWordText, mDOMWordOffset,

   972                       firstColon - mDOMWordOffset));

   973     if (protocol.EqualsIgnoreCase("http") ||

   974         protocol.EqualsIgnoreCase("https") ||

   975         protocol.EqualsIgnoreCase("news") ||

   976         protocol.EqualsIgnoreCase("file") ||

   977         protocol.EqualsIgnoreCase("javascript") ||

   978         protocol.EqualsIgnoreCase("data") ||

   979         protocol.EqualsIgnoreCase("ftp")) {

   980       return true;

   981     }

   982   }

   984   // not anything special

   985   return false;

   986 }

   988 // WordSplitState::ShouldSkipWord

   990 bool

   991 WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)

   992 {

   993   int32_t last = aStart + aLength;

   995   // check to see if the word contains a digit

   996   for (int32_t i = aStart; i < last; i ++) {

   997     if (unicode::GetGenCategory(mDOMWordText[i]) == nsIUGenCategory::kNumber) {

   998       return true;

   999     }

  1000   }

  1002   // not special

  1003   return false;

  1004 }

  1006 // mozInlineSpellWordUtil::SplitDOMWord

  1008 void

  1009 mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd)

  1010 {

  1011   WordSplitState state(this, mSoftText, aStart, aEnd - aStart);

  1012   state.mCurCharClass = state.ClassifyCharacter(0, true);

  1014   state.AdvanceThroughSeparators();

  1015   if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT &&

  1016       state.IsSpecialWord()) {

  1017     int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset;

  1018     mRealWords.AppendElement(

  1019         RealWord(aStart + state.mDOMWordOffset, specialWordLength, false));

  1021     return;

  1022   }

  1024   while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {

  1025     state.AdvanceThroughSeparators();

  1026     if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)

  1027       break;

  1029     // save the beginning of the word

  1030     int32_t wordOffset = state.mDOMWordOffset;

  1032     // find the end of the word

  1033     state.AdvanceThroughWord();

  1034     int32_t wordLen = state.mDOMWordOffset - wordOffset;

  1035     mRealWords.AppendElement(

  1036       RealWord(aStart + wordOffset, wordLen,

  1037                !state.ShouldSkipWord(wordOffset, wordLen)));

  1038   }

  1039 }

The Tor Browser / file revision

extensions/spellcheck/src/mozInlineSpellWordUtil.cpp@6474c204b198

extensions/spellcheck/src/mozInlineSpellWordUtil.cpp