Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozInlineSpellWordUtil.h"
7 #include "nsDebug.h"
8 #include "nsIAtom.h"
9 #include "nsComponentManagerUtils.h"
10 #include "nsIDOMCSSStyleDeclaration.h"
11 #include "nsIDOMElement.h"
12 #include "nsIDOMRange.h"
13 #include "nsIEditor.h"
14 #include "nsIDOMNode.h"
15 #include "nsIDOMHTMLBRElement.h"
16 #include "nsUnicharUtilCIID.h"
17 #include "nsUnicodeProperties.h"
18 #include "nsServiceManagerUtils.h"
19 #include "nsIContent.h"
20 #include "nsTextFragment.h"
21 #include "mozilla/dom/Element.h"
22 #include "nsRange.h"
23 #include "nsContentUtils.h"
24 #include "nsIFrame.h"
25 #include <algorithm>
27 using namespace mozilla;
// IsIgnorableCharacter
//
//    Returns true for the characters that we should ignore in input
//    (the soft hyphens listed below), false for everything else.

inline bool IsIgnorableCharacter(char16_t ch)
{
  switch (ch) {
    case 0xAD:    // SOFT HYPHEN
    case 0x1806:  // MONGOLIAN TODO SOFT HYPHEN
      return true;
    default:
      return false;
  }
}
// IsConditionalPunctuation
//
//    Returns true for characters (such as apostrophes) that only count as
//    part of a word when there are word characters on each side of them;
//    in any other position they are punctuation.

inline bool IsConditionalPunctuation(char16_t ch)
{
  switch (ch) {
    case '\'':    // APOSTROPHE
    case 0x2019:  // RIGHT SINGLE QUOTATION MARK
    case 0x00B7:  // MIDDLE DOT
      return true;
    default:
      return false;
  }
}
51 // mozInlineSpellWordUtil::Init
53 nsresult
54 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
55 {
56 nsresult rv;
58 // getting the editor can fail commonly because the editor was detached, so
59 // don't assert
60 nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
61 if (NS_FAILED(rv))
62 return rv;
64 nsCOMPtr<nsIDOMDocument> domDoc;
65 rv = editor->GetDocument(getter_AddRefs(domDoc));
66 NS_ENSURE_SUCCESS(rv, rv);
67 NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER);
69 mDOMDocument = domDoc;
70 mDocument = do_QueryInterface(domDoc);
72 // Find the root node for the editor. For contenteditable we'll need something
73 // cleverer here.
74 nsCOMPtr<nsIDOMElement> rootElt;
75 rv = editor->GetRootElement(getter_AddRefs(rootElt));
76 NS_ENSURE_SUCCESS(rv, rv);
78 nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt);
79 mRootNode = rootNode;
80 NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
81 return NS_OK;
82 }
84 static inline bool
85 IsTextNode(nsINode* aNode)
86 {
87 return aNode->IsNodeOfType(nsINode::eTEXT);
88 }
90 typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
92 // Find the next node in the DOM tree in preorder.
93 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
94 // why we can't just use GetNextNode here, sadly.
95 static nsINode*
96 FindNextNode(nsINode* aNode, nsINode* aRoot,
97 OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure)
98 {
99 NS_PRECONDITION(aNode, "Null starting node?");
101 nsINode* next = aNode->GetFirstChild();
102 if (next)
103 return next;
105 // Don't look at siblings or otherwise outside of aRoot
106 if (aNode == aRoot)
107 return nullptr;
109 next = aNode->GetNextSibling();
110 if (next)
111 return next;
113 // Go up
114 for (;;) {
115 if (aOnLeaveNode) {
116 aOnLeaveNode(aNode, aClosure);
117 }
119 next = aNode->GetParent();
120 if (next == aRoot || ! next)
121 return nullptr;
122 aNode = next;
124 next = aNode->GetNextSibling();
125 if (next)
126 return next;
127 }
128 }
130 // aNode is not a text node. Find the first text node starting at aNode/aOffset
131 // in a preorder DOM traversal.
132 static nsINode*
133 FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot)
134 {
135 NS_PRECONDITION(aNode, "Null starting node?");
136 NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");
138 nsINode* checkNode;
139 // Need to start at the aOffset'th child
140 nsIContent* child = aNode->GetChildAt(aOffset);
142 if (child) {
143 checkNode = child;
144 } else {
145 // aOffset was beyond the end of the child list.
146 // goto next node after the last descendant of aNode in
147 // a preorder DOM traversal.
148 checkNode = aNode->GetNextNonChildNode(aRoot);
149 }
151 while (checkNode && !IsTextNode(checkNode)) {
152 checkNode = checkNode->GetNextNode(aRoot);
153 }
154 return checkNode;
155 }
157 // mozInlineSpellWordUtil::SetEnd
158 //
159 // We have two ranges "hard" and "soft". The hard boundary is simply
160 // the scope of the root node. The soft boundary is that which is set
161 // by the caller of this class by calling this function. If this function is
162 // not called, the soft boundary is the same as the hard boundary.
163 //
164 // When we reach the soft boundary (mSoftEnd), we keep
165 // going until we reach the end of a word. This allows the caller to set the
166 // end of the range to anything, and we will always check whole multiples of
167 // words. When we reach the hard boundary we stop no matter what.
168 //
169 // There is no beginning soft boundary. This is because we only go to the
170 // previous node once, when finding the previous word boundary in
171 // SetPosition(). You might think of the soft boundary as being this initial
172 // position.
174 nsresult
175 mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset)
176 {
177 NS_PRECONDITION(aEndNode, "Null end node?");
179 NS_ASSERTION(mRootNode, "Not initialized");
181 InvalidateWords();
183 if (!IsTextNode(aEndNode)) {
184 // End at the start of the first text node after aEndNode/aEndOffset.
185 aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
186 aEndOffset = 0;
187 }
188 mSoftEnd = NodeOffset(aEndNode, aEndOffset);
189 return NS_OK;
190 }
192 nsresult
193 mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset)
194 {
195 InvalidateWords();
197 if (!IsTextNode(aNode)) {
198 // Start at the start of the first text node after aNode/aOffset.
199 aNode = FindNextTextNode(aNode, aOffset, mRootNode);
200 aOffset = 0;
201 }
202 mSoftBegin = NodeOffset(aNode, aOffset);
204 EnsureWords();
206 int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
207 if (textOffset < 0)
208 return NS_OK;
209 mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
210 return NS_OK;
211 }
213 void
214 mozInlineSpellWordUtil::EnsureWords()
215 {
216 if (mSoftTextValid)
217 return;
218 BuildSoftText();
219 BuildRealWords();
220 mSoftTextValid = true;
221 }
223 nsresult
224 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange)
225 {
226 NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
227 NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
228 return MakeRange(begin, end, aRange);
229 }
231 // mozInlineSpellWordUtil::GetRangeForWord
233 nsresult
234 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
235 int32_t aWordOffset,
236 nsRange** aRange)
237 {
238 // Set our soft end and start
239 nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode);
240 NodeOffset pt = NodeOffset(wordNode, aWordOffset);
242 InvalidateWords();
243 mSoftBegin = mSoftEnd = pt;
244 EnsureWords();
246 int32_t offset = MapDOMPositionToSoftTextOffset(pt);
247 if (offset < 0)
248 return MakeRange(pt, pt, aRange);
249 int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
250 if (wordIndex < 0)
251 return MakeRange(pt, pt, aRange);
252 return MakeRangeForWord(mRealWords[wordIndex], aRange);
253 }
255 // This is to fix characters that the spellchecker may not like
256 static void
257 NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput)
258 {
259 aOutput.Truncate();
260 for (int32_t i = 0; i < aLen; i++) {
261 char16_t ch = aInput.CharAt(i + aPos);
263 // remove ignorable characters from the word
264 if (IsIgnorableCharacter(ch))
265 continue;
267 // the spellchecker doesn't handle curly apostrophes in all languages
268 if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
269 ch = '\'';
270 }
272 aOutput.Append(ch);
273 }
274 }
276 // mozInlineSpellWordUtil::GetNextWord
277 //
278 // FIXME-optimization: we shouldn't have to generate a range every single
279 // time. It would be better if the inline spellchecker didn't require a
280 // range unless the word was misspelled. This may or may not be possible.
282 nsresult
283 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange,
284 bool* aSkipChecking)
285 {
286 #ifdef DEBUG_SPELLCHECK
287 printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
288 #endif
290 if (mNextWordIndex < 0 ||
291 mNextWordIndex >= int32_t(mRealWords.Length())) {
292 mNextWordIndex = -1;
293 *aRange = nullptr;
294 *aSkipChecking = true;
295 return NS_OK;
296 }
298 const RealWord& word = mRealWords[mNextWordIndex];
299 nsresult rv = MakeRangeForWord(word, aRange);
300 NS_ENSURE_SUCCESS(rv, rv);
301 ++mNextWordIndex;
302 *aSkipChecking = !word.mCheckableWord;
303 ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
305 #ifdef DEBUG_SPELLCHECK
306 printf("GetNextWord returning: %s (skip=%d)\n",
307 NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
308 #endif
310 return NS_OK;
311 }
313 // mozInlineSpellWordUtil::MakeRange
314 //
315 // Convenience function for creating a range over the current document.
317 nsresult
318 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
319 nsRange** aRange)
320 {
321 NS_ENSURE_ARG_POINTER(aBegin.mNode);
322 if (!mDOMDocument)
323 return NS_ERROR_NOT_INITIALIZED;
325 nsRefPtr<nsRange> range = new nsRange(aBegin.mNode);
326 nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset,
327 aEnd.mNode, aEnd.mOffset);
328 NS_ENSURE_SUCCESS(rv, rv);
329 range.forget(aRange);
331 return NS_OK;
332 }
334 /*********** DOM text extraction ************/
// IsDOMWordSeparator
//
//    Tells whether the given character should be considered a DOM word
//    separator. Basically this is whitespace, although it could also have
//    certain punctuation that we know ALWAYS breaks words. This is
//    important: no punctuation that could appear in a URL or email address
//    may be listed here, because those need to always fit into a single
//    DOM word.

static bool
IsDOMWordSeparator(char16_t ch)
{
  switch (ch) {
    // simple spaces
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    // complex (non-ASCII) spaces
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2002:  // EN SPACE
    case 0x2003:  // EM SPACE
    case 0x2009:  // THIN SPACE
    case 0x3000:  // IDEOGRAPHIC SPACE
      return true;

    // otherwise not a space
    default:
      return false;
  }
}
365 static inline bool
366 IsBRElement(nsINode* aNode)
367 {
368 return aNode->IsElement() &&
369 aNode->AsElement()->IsHTML(nsGkAtoms::br);
370 }
372 /**
373 * Given a TextNode, checks to see if there's a DOM word separator before
374 * aBeforeOffset within it. This function does not modify aSeparatorOffset when
375 * it returns false.
376 *
377 * @param aNode the TextNode to check.
378 * @param aBeforeOffset the offset in the TextNode before which we will search
379 * for the DOM separator. You can pass INT32_MAX to search the entire
380 * length of the string.
381 * @param aSeparatorOffset will be set to the offset of the first separator it
382 * encounters. Will not be written to if no separator is found.
383 * @returns True if it found a separator.
384 */
385 static bool
386 TextNodeContainsDOMWordSeparator(nsINode* aNode,
387 int32_t aBeforeOffset,
388 int32_t* aSeparatorOffset)
389 {
390 // aNode is actually an nsIContent, since it's eTEXT
391 nsIContent* content = static_cast<nsIContent*>(aNode);
392 const nsTextFragment* textFragment = content->GetText();
393 NS_ASSERTION(textFragment, "Where is our text?");
394 for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) {
395 if (IsDOMWordSeparator(textFragment->CharAt(i))) {
396 // Be greedy, find as many separators as we can
397 for (int32_t j = i - 1; j >= 0; --j) {
398 if (IsDOMWordSeparator(textFragment->CharAt(j))) {
399 i = j;
400 } else {
401 break;
402 }
403 }
404 *aSeparatorOffset = i;
405 return true;
406 }
407 }
408 return false;
409 }
411 /**
412 * Check if there's a DOM word separator before aBeforeOffset in this node.
413 * Always returns true if it's a BR element.
414 * aSeparatorOffset is set to the index of the first character in the last
415 * separator if any is found (0 for BR elements).
416 *
417 * This function does not modify aSeparatorOffset when it returns false.
418 */
419 static bool
420 ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
421 int32_t* aSeparatorOffset)
422 {
423 if (IsBRElement(aNode)) {
424 *aSeparatorOffset = 0;
425 return true;
426 }
428 if (!IsTextNode(aNode))
429 return false;
431 return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset,
432 aSeparatorOffset);
433 }
435 static bool
436 IsBreakElement(nsINode* aNode)
437 {
438 if (!aNode->IsElement()) {
439 return false;
440 }
442 dom::Element *element = aNode->AsElement();
444 if (element->IsHTML(nsGkAtoms::br))
445 return true;
447 // If we don't have a frame, we don't consider ourselves a break
448 // element. In particular, words can span us.
449 if (!element->GetPrimaryFrame())
450 return false;
452 // Anything that's not an inline element is a break element.
453 // XXXbz should replaced inlines be break elements, though?
454 return element->GetPrimaryFrame()->StyleDisplay()->mDisplay !=
455 NS_STYLE_DISPLAY_INLINE;
456 }
// Closure data for CheckLeavingBreakElement. Kept as a plain aggregate so
// that it can be brace-initialized by its user.
struct CheckLeavingBreakElementClosure
{
  // Set to true once the traversal has left a break element.
  bool mLeftBreakElement;
};
462 static void
463 CheckLeavingBreakElement(nsINode* aNode, void* aClosure)
464 {
465 CheckLeavingBreakElementClosure* cl =
466 static_cast<CheckLeavingBreakElementClosure*>(aClosure);
467 if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
468 cl->mLeftBreakElement = true;
469 }
470 }
472 void
473 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
474 {
475 nsAutoString result;
476 ::NormalizeWord(aWord, 0, aWord.Length(), result);
477 aWord = result;
478 }
// mozInlineSpellWordUtil::BuildSoftText
//
// Rebuilds mSoftText and mSoftTextDOMMapping in two phases:
//  1. Walk backwards from mSoftBegin to decide where the soft text starts.
//     This may move mSoftBegin itself to an earlier position.
//  2. Walk forwards from that start, appending the text of each text node
//     to mSoftText and recording one DOMTextMapping per contribution. A
//     single ' ' is appended for each break-element boundary crossed before
//     the soft end has been seen. The walk stops at the first DOM word
//     separator or break-element boundary after the soft end, when the
//     traversal under mRootNode runs out, or when appending text fails.
480 void
481 mozInlineSpellWordUtil::BuildSoftText()
482 {
483 // First we have to work backwards from mSoftBegin to find a text node
484 // containing a DOM word separator, a non-inline-element
485 // boundary, or the hard start node. That's where we'll start building the
486 // soft string from.
487 nsINode* node = mSoftBegin.mNode;
// Offset within 'node' at which the first contribution to the soft text
// starts; reset to 0 after the first text node has been processed.
488 int32_t firstOffsetInNode = 0;
// Only the starting node is searched up to mSoftBegin.mOffset; every
// earlier node is searched over its whole length (INT32_MAX below).
489 int32_t checkBeforeOffset = mSoftBegin.mOffset;
490 while (node) {
491 if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
492 if (node == mSoftBegin.mNode) {
493 // If we find a word separator on the first node, look at the preceding
494 // word on the text node as well.
495 int32_t newOffset = 0;
496 if (firstOffsetInNode > 0) {
497 // Try to find the previous word boundary in the current node. If
498 // we can't find one, start checking previous sibling nodes (if any
499 // adjacent ones exist) to see if we can find any text nodes with
500 // DOM word separators. We bail out as soon as we see a node that is
501 // not a text node, or we run out of previous sibling nodes. In the
502 // event that we simply cannot find any preceding word separator, the
503 // offset is set to 0, and the soft text beginning node is set to the
504 // "most previous" text node before the original starting node, or
505 // kept at the original starting node if no previous text nodes exist.
506 if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
507 &newOffset)) {
508 nsINode* prevNode = node->GetPreviousSibling();
509 while (prevNode && IsTextNode(prevNode)) {
510 mSoftBegin.mNode = prevNode;
511 if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,
512 &newOffset)) {
513 break;
514 }
515 prevNode = prevNode->GetPreviousSibling();
516 }
517 }
518 }
519 firstOffsetInNode = newOffset;
520 mSoftBegin.mOffset = newOffset;
521 }
522 break;
523 }
524 checkBeforeOffset = INT32_MAX;
525 if (IsBreakElement(node)) {
526 // Since GetPreviousContent follows tree *preorder*, we're about to traverse
527 // up out of 'node'. Since node induces breaks (e.g., it's a block),
528 // don't bother trying to look outside it, just stop now.
529 break;
530 }
531 // GetPreviousContent below expects mRootNode to be an ancestor of node.
532 if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {
533 break;
534 }
535 node = node->GetPreviousContent(mRootNode);
536 }
538 // Now build up the string moving forward through the DOM until we reach
539 // the soft end and *then* see a DOM word separator, a non-inline-element
540 // boundary, or the hard end node.
541 mSoftText.Truncate();
542 mSoftTextDOMMapping.Clear();
543 bool seenSoftEnd = false;
544 // Text is appended straight into mSoftText below; no separate scratch
545 // string is kept across iterations.
546 while (node) {
547 if (node == mSoftEnd.mNode) {
548 seenSoftEnd = true;
549 }
551 bool exit = false;
552 if (IsTextNode(node)) {
553 nsIContent* content = static_cast<nsIContent*>(node);
554 NS_ASSERTION(content, "Where is our content?");
555 const nsTextFragment* textFragment = content->GetText();
556 NS_ASSERTION(textFragment, "Where is our text?");
557 int32_t lastOffsetInNode = textFragment->GetLength();
559 if (seenSoftEnd) {
560 // check whether we can stop after this
561 for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
562 i < int32_t(textFragment->GetLength()); ++i) {
563 if (IsDOMWordSeparator(textFragment->CharAt(i))) {
564 exit = true;
565 // stop at the first separator after the soft end point
566 lastOffsetInNode = i;
567 break;
568 }
569 }
570 }
// Contribute [firstOffsetInNode, lastOffsetInNode) of this text node, and
// record where that slice lands in mSoftText.
572 if (firstOffsetInNode < lastOffsetInNode) {
573 int32_t len = lastOffsetInNode - firstOffsetInNode;
574 mSoftTextDOMMapping.AppendElement(
575 DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
577 bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,
578 mozilla::fallible_t());
579 if (!ok) {
580 // probably out of memory, remove from mSoftTextDOMMapping
581 mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1);
582 exit = true;
583 }
584 }
// Only the first text node starts at a non-zero offset.
586 firstOffsetInNode = 0;
587 }
589 if (exit)
590 break;
592 CheckLeavingBreakElementClosure closure = { false };
593 node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
594 if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
595 // We left, or are entering, a break element (e.g., block). Maybe we can
596 // stop now.
597 if (seenSoftEnd)
598 break;
599 // Record the break
600 mSoftText.Append(' ');
601 }
602 }
604 #ifdef DEBUG_SPELLCHECK
605 printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
606 #endif
607 }
609 void
610 mozInlineSpellWordUtil::BuildRealWords()
611 {
612 // This is pretty simple. We just have to walk mSoftText, tokenizing it
613 // into "real words".
614 // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
615 // SplitDOMWord on each of those DOM words
616 int32_t wordStart = -1;
617 mRealWords.Clear();
618 for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) {
619 if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
620 if (wordStart >= 0) {
621 SplitDOMWord(wordStart, i);
622 wordStart = -1;
623 }
624 } else {
625 if (wordStart < 0) {
626 wordStart = i;
627 }
628 }
629 }
630 if (wordStart >= 0) {
631 SplitDOMWord(wordStart, mSoftText.Length());
632 }
633 }
635 /*********** DOM/realwords<->mSoftText mapping functions ************/
637 int32_t
638 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
639 {
640 if (!mSoftTextValid) {
641 NS_ERROR("Soft text must be valid if we're to map into it");
642 return -1;
643 }
645 for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) {
646 const DOMTextMapping& map = mSoftTextDOMMapping[i];
647 if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
648 // Allow offsets at either end of the string, in particular, allow the
649 // offset that's at the end of the contributed string
650 int32_t offsetInContributedString =
651 aNodeOffset.mOffset - map.mNodeOffset.mOffset;
652 if (offsetInContributedString >= 0 &&
653 offsetInContributedString <= map.mLength)
654 return map.mSoftTextOffset + offsetInContributedString;
655 return -1;
656 }
657 }
658 return -1;
659 }
661 mozInlineSpellWordUtil::NodeOffset
662 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,
663 DOMMapHint aHint)
664 {
665 NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
666 if (!mSoftTextValid)
667 return NodeOffset(nullptr, -1);
669 // The invariant is that the range start..end includes the last mapping,
670 // if any, such that mSoftTextOffset <= aSoftTextOffset
671 int32_t start = 0;
672 int32_t end = mSoftTextDOMMapping.Length();
673 while (end - start >= 2) {
674 int32_t mid = (start + end)/2;
675 const DOMTextMapping& map = mSoftTextDOMMapping[mid];
676 if (map.mSoftTextOffset > aSoftTextOffset) {
677 end = mid;
678 } else {
679 start = mid;
680 }
681 }
683 if (start >= end)
684 return NodeOffset(nullptr, -1);
686 // 'start' is now the last mapping, if any, such that
687 // mSoftTextOffset <= aSoftTextOffset.
688 // If we're doing HINT_END, then we may want to return the end of the
689 // the previous mapping instead of the start of this mapping
690 if (aHint == HINT_END && start > 0) {
691 const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];
692 if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
693 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
694 }
696 // We allow ourselves to return the end of this mapping even if we're
697 // doing HINT_START. This will only happen if there is no mapping which this
698 // point is the start of. I'm not 100% sure this is OK...
699 const DOMTextMapping& map = mSoftTextDOMMapping[start];
700 int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
701 if (offset >= 0 && offset <= map.mLength)
702 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
704 return NodeOffset(nullptr, -1);
705 }
707 int32_t
708 mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset,
709 DOMMapHint aHint, bool aSearchForward)
710 {
711 NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
712 if (!mSoftTextValid)
713 return -1;
715 // The invariant is that the range start..end includes the last word,
716 // if any, such that mSoftTextOffset <= aSoftTextOffset
717 int32_t start = 0;
718 int32_t end = mRealWords.Length();
719 while (end - start >= 2) {
720 int32_t mid = (start + end)/2;
721 const RealWord& word = mRealWords[mid];
722 if (word.mSoftTextOffset > aSoftTextOffset) {
723 end = mid;
724 } else {
725 start = mid;
726 }
727 }
729 if (start >= end)
730 return -1;
732 // 'start' is now the last word, if any, such that
733 // mSoftTextOffset <= aSoftTextOffset.
734 // If we're doing HINT_END, then we may want to return the end of the
735 // the previous word instead of the start of this word
736 if (aHint == HINT_END && start > 0) {
737 const RealWord& word = mRealWords[start - 1];
738 if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
739 return start - 1;
740 }
742 // We allow ourselves to return the end of this word even if we're
743 // doing HINT_START. This will only happen if there is no word which this
744 // point is the start of. I'm not 100% sure this is OK...
745 const RealWord& word = mRealWords[start];
746 int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
747 if (offset >= 0 && offset <= word.mLength)
748 return start;
750 if (aSearchForward) {
751 if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
752 // All words have mSoftTextOffset > aSoftTextOffset
753 return 0;
754 }
755 // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
756 // Word start+1, if it exists, will be the first with
757 // mSoftTextOffset > aSoftTextOffset.
758 if (start + 1 < int32_t(mRealWords.Length()))
759 return start + 1;
760 }
762 return -1;
763 }
765 /*********** Word Splitting ************/
// Classification of a position within the DOM word being split.
enum CharClass {
  // the character belongs to a word
  CHAR_CLASS_WORD,
  // the character separates words
  CHAR_CLASS_SEPARATOR,
  // there is no character: the position is at or past the end of the text
  CHAR_CLASS_END_OF_INPUT
};
773 // Encapsulates DOM-word to real-word splitting
774 struct MOZ_STACK_CLASS WordSplitState
775 {
// The word utility passed to the constructor.
776 mozInlineSpellWordUtil* mWordUtil;
// The DOM word being split: aLen characters of the constructor's string,
// starting at aStart.
777 const nsDependentSubstring mDOMWordText;
// Current position within mDOMWordText.
778 int32_t mDOMWordOffset;
// Class of the character at mDOMWordOffset. The constructor sets it to
// CHAR_CLASS_END_OF_INPUT; callers classify the first character themselves.
779 CharClass mCurCharClass;
781 WordSplitState(mozInlineSpellWordUtil* aWordUtil,
782 const nsString& aString, int32_t aStart, int32_t aLen)
783 : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
784 mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
// Classifies the character at aIndex. aRecurse says whether neighbouring
// characters may be inspected to resolve conditional punctuation.
786 CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
// Moves one character forward and updates mCurCharClass.
787 void Advance();
// Advances while the current character is a separator.
788 void AdvanceThroughSeparators();
// Advances while the current character is a word character.
789 void AdvanceThroughWord();
791 // Checks whether a special word like an email address or URL starts at the
792 // current position. Returns true if one is found, false otherwise. This allows
793 // arbitrary word breaking rules to be used for these special entities, as
794 // long as they can not contain whitespace.
795 bool IsSpecialWord();
797 // Similar to IsSpecialWord except that this takes a split word as
798 // input. This checks for things that do not require special word-breaking
799 // rules.
800 bool ShouldSkipWord(int32_t aStart, int32_t aLength);
801 };
803 // WordSplitState::ClassifyCharacter
//
//    Classifies the character at aIndex of mDOMWordText as a word character
//    or a separator. An index equal to the text length is classified as a
//    separator. aRecurse controls whether conditional punctuation may look
//    at its neighbours: when false, such punctuation is always a separator.
//    Every nested call made from here passes aRecurse=false, so the
//    recursion is at most one level deep.
805 CharClass
806 WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
807 {
808 NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
809 "Index out of range");
810 if (aIndex == int32_t(mDOMWordText.Length()))
811 return CHAR_CLASS_SEPARATOR;
813 // this will classify the character, we want to treat "ignorable" characters
814 // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
815 nsIUGenCategory::nsUGenCategory
816 charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
817 if (charCategory == nsIUGenCategory::kLetter ||
818 IsIgnorableCharacter(mDOMWordText[aIndex]) ||
819 mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
820 mDOMWordText[aIndex] == 0x200D /* ZWJ */)
821 return CHAR_CLASS_WORD;
823 // If conditional punctuation is surrounded immediately on both sides by word
824 // characters it also counts as a word character.
825 if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
826 if (!aRecurse) {
827 // not allowed to look around, this punctuation counts like a separator
828 return CHAR_CLASS_SEPARATOR;
829 }
831 // check the left-hand character
832 if (aIndex == 0)
833 return CHAR_CLASS_SEPARATOR;
834 if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
835 return CHAR_CLASS_SEPARATOR;
836 // If the previous character is a word-char, make sure that it's not a
837 // special dot character.
838 if (mDOMWordText[aIndex - 1] == '.')
839 return CHAR_CLASS_SEPARATOR;
841 // now we know left char is a word-char, check the right-hand character
842 if (aIndex == int32_t(mDOMWordText.Length()) - 1)
843 return CHAR_CLASS_SEPARATOR;
844 if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
845 return CHAR_CLASS_SEPARATOR;
846 // If the next character is a word-char, make sure that it's not a
847 // special dot character.
848 if (mDOMWordText[aIndex + 1] == '.')
849 return CHAR_CLASS_SEPARATOR;
851 // char on either side is a word, this counts as a word
852 return CHAR_CLASS_WORD;
853 }
855 // The dot character, if appearing at the end of a word, should
856 // be considered part of that word. Example: "etc.", or
857 // abbreviations
// NOTE(review): the condition below accepts the dot when the preceding
// character is NOT classified as a word character, which reads as the
// opposite of the sentence above -- confirm the intended behaviour before
// changing it.
858 if (aIndex > 0 &&
859 mDOMWordText[aIndex] == '.' &&
860 mDOMWordText[aIndex - 1] != '.' &&
861 ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
862 return CHAR_CLASS_WORD;
863 }
865 // all other punctuation
866 if (charCategory == nsIUGenCategory::kSeparator ||
867 charCategory == nsIUGenCategory::kOther ||
868 charCategory == nsIUGenCategory::kPunctuation ||
869 charCategory == nsIUGenCategory::kSymbol) {
870 // Don't break on hyphens, as hunspell handles them on its own.
871 if (aIndex > 0 &&
872 mDOMWordText[aIndex] == '-' &&
873 mDOMWordText[aIndex - 1] != '-' &&
874 ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
875 // A hyphen is only meaningful as a separator inside a word
876 // if the previous and next characters are a word character.
877 if (aIndex == int32_t(mDOMWordText.Length()) - 1)
878 return CHAR_CLASS_SEPARATOR;
879 if (mDOMWordText[aIndex + 1] != '.' &&
880 ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
881 return CHAR_CLASS_WORD;
882 }
883 return CHAR_CLASS_SEPARATOR;
884 }
886 // any other character counts as a word
887 return CHAR_CLASS_WORD;
888 }
891 // WordSplitState::Advance
893 void
894 WordSplitState::Advance()
895 {
896 NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
897 NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
898 "Length beyond end");
900 mDOMWordOffset ++;
901 if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
902 mCurCharClass = CHAR_CLASS_END_OF_INPUT;
903 else
904 mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
905 }
908 // WordSplitState::AdvanceThroughSeparators
910 void
911 WordSplitState::AdvanceThroughSeparators()
912 {
913 while (mCurCharClass == CHAR_CLASS_SEPARATOR)
914 Advance();
915 }
917 // WordSplitState::AdvanceThroughWord
919 void
920 WordSplitState::AdvanceThroughWord()
921 {
922 while (mCurCharClass == CHAR_CLASS_WORD)
923 Advance();
924 }
927 // WordSplitState::IsSpecialWord
929 bool
930 WordSplitState::IsSpecialWord()
931 {
932 // Search for email addresses. We simply define these as any sequence of
933 // characters with an '@' character in the middle. The DOM word is already
934 // split on whitepace, so we know that everything to the end is the address
935 int32_t firstColon = -1;
936 for (int32_t i = mDOMWordOffset;
937 i < int32_t(mDOMWordText.Length()); i ++) {
938 if (mDOMWordText[i] == '@') {
939 // only accept this if there are unambiguous word characters (don't bother
940 // recursing to disambiguate apostrophes) on each side. This prevents
941 // classifying, e.g. "@home" as an email address
943 // Use this condition to only accept words with '@' in the middle of
944 // them. It works, but the inlinespellcker doesn't like this. The problem
945 // is that you type "fhsgfh@" that's a misspelled word followed by a
946 // symbol, but when you type another letter "fhsgfh@g" that first word
947 // need to be unmarked misspelled. It doesn't do this. it only checks the
948 // current position for potentially removing a spelling range.
949 if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
950 i < (int32_t)mDOMWordText.Length() - 1 &&
951 ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
952 return true;
953 }
954 } else if (mDOMWordText[i] == ':' && firstColon < 0) {
955 firstColon = i;
957 // If the first colon is followed by a slash, consider it a URL
958 // This will catch things like asdf://foo.com
959 if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
960 mDOMWordText[firstColon + 1] == '/') {
961 return true;
962 }
963 }
964 }
966 // Check the text before the first colon against some known protocols. It
967 // is impossible to check against all protocols, especially since you can
968 // plug in new protocols. We also don't want to waste time here checking
969 // against a lot of obscure protocols.
970 if (firstColon > mDOMWordOffset) {
971 nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
972 firstColon - mDOMWordOffset));
973 if (protocol.EqualsIgnoreCase("http") ||
974 protocol.EqualsIgnoreCase("https") ||
975 protocol.EqualsIgnoreCase("news") ||
976 protocol.EqualsIgnoreCase("file") ||
977 protocol.EqualsIgnoreCase("javascript") ||
978 protocol.EqualsIgnoreCase("data") ||
979 protocol.EqualsIgnoreCase("ftp")) {
980 return true;
981 }
982 }
984 // not anything special
985 return false;
986 }
988 // WordSplitState::ShouldSkipWord
990 bool
991 WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)
992 {
993 int32_t last = aStart + aLength;
995 // check to see if the word contains a digit
996 for (int32_t i = aStart; i < last; i ++) {
997 if (unicode::GetGenCategory(mDOMWordText[i]) == nsIUGenCategory::kNumber) {
998 return true;
999 }
1000 }
1002 // not special
1003 return false;
1004 }
1006 // mozInlineSpellWordUtil::SplitDOMWord
1008 void
1009 mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd)
1010 {
1011 WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
1012 state.mCurCharClass = state.ClassifyCharacter(0, true);
1014 state.AdvanceThroughSeparators();
1015 if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT &&
1016 state.IsSpecialWord()) {
1017 int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset;
1018 mRealWords.AppendElement(
1019 RealWord(aStart + state.mDOMWordOffset, specialWordLength, false));
1021 return;
1022 }
1024 while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
1025 state.AdvanceThroughSeparators();
1026 if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)
1027 break;
1029 // save the beginning of the word
1030 int32_t wordOffset = state.mDOMWordOffset;
1032 // find the end of the word
1033 state.AdvanceThroughWord();
1034 int32_t wordLen = state.mDOMWordOffset - wordOffset;
1035 mRealWords.AppendElement(
1036 RealWord(aStart + wordOffset, wordLen,
1037 !state.ShouldSkipWord(wordOffset, wordLen)));
1038 }
1039 }