extensions/spellcheck/src/mozInlineSpellWordUtil.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include "mozInlineSpellWordUtil.h"
michael@0 7 #include "nsDebug.h"
michael@0 8 #include "nsIAtom.h"
michael@0 9 #include "nsComponentManagerUtils.h"
michael@0 10 #include "nsIDOMCSSStyleDeclaration.h"
michael@0 11 #include "nsIDOMElement.h"
michael@0 12 #include "nsIDOMRange.h"
michael@0 13 #include "nsIEditor.h"
michael@0 14 #include "nsIDOMNode.h"
michael@0 15 #include "nsIDOMHTMLBRElement.h"
michael@0 16 #include "nsUnicharUtilCIID.h"
michael@0 17 #include "nsUnicodeProperties.h"
michael@0 18 #include "nsServiceManagerUtils.h"
michael@0 19 #include "nsIContent.h"
michael@0 20 #include "nsTextFragment.h"
michael@0 21 #include "mozilla/dom/Element.h"
michael@0 22 #include "nsRange.h"
michael@0 23 #include "nsContentUtils.h"
michael@0 24 #include "nsIFrame.h"
michael@0 25 #include <algorithm>
michael@0 26
michael@0 27 using namespace mozilla;
michael@0 28
michael@0 29 // IsIgnorableCharacter
michael@0 30 //
michael@0 31 // These characters are ones that we should ignore in input.
michael@0 32
// Returns true when |ch| is one of the soft-hyphen characters that are
// ignored in input. Declared static so the helper stays local to this
// translation unit, like the other small helpers in this file.
static inline bool IsIgnorableCharacter(char16_t ch)
{
  return (ch == 0xAD ||   // SOFT HYPHEN
          ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
}
michael@0 38
michael@0 39 // IsConditionalPunctuation
michael@0 40 //
michael@0 41 // Some characters (like apostrophes) require characters on each side to be
michael@0 42 // part of a word, and are otherwise punctuation.
michael@0 43
// Returns true when |ch| is punctuation that only counts as part of a word
// if it has word characters on both sides. Declared static so the helper
// stays local to this translation unit, like the other small helpers in this
// file.
static inline bool IsConditionalPunctuation(char16_t ch)
{
  return (ch == '\'' ||
          ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
          ch == 0x00B7);   // MIDDLE DOT
}
michael@0 50
michael@0 51 // mozInlineSpellWordUtil::Init
michael@0 52
michael@0 53 nsresult
michael@0 54 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
michael@0 55 {
michael@0 56 nsresult rv;
michael@0 57
michael@0 58 // getting the editor can fail commonly because the editor was detached, so
michael@0 59 // don't assert
michael@0 60 nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
michael@0 61 if (NS_FAILED(rv))
michael@0 62 return rv;
michael@0 63
michael@0 64 nsCOMPtr<nsIDOMDocument> domDoc;
michael@0 65 rv = editor->GetDocument(getter_AddRefs(domDoc));
michael@0 66 NS_ENSURE_SUCCESS(rv, rv);
michael@0 67 NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER);
michael@0 68
michael@0 69 mDOMDocument = domDoc;
michael@0 70 mDocument = do_QueryInterface(domDoc);
michael@0 71
michael@0 72 // Find the root node for the editor. For contenteditable we'll need something
michael@0 73 // cleverer here.
michael@0 74 nsCOMPtr<nsIDOMElement> rootElt;
michael@0 75 rv = editor->GetRootElement(getter_AddRefs(rootElt));
michael@0 76 NS_ENSURE_SUCCESS(rv, rv);
michael@0 77
michael@0 78 nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt);
michael@0 79 mRootNode = rootNode;
michael@0 80 NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
michael@0 81 return NS_OK;
michael@0 82 }
michael@0 83
michael@0 84 static inline bool
michael@0 85 IsTextNode(nsINode* aNode)
michael@0 86 {
michael@0 87 return aNode->IsNodeOfType(nsINode::eTEXT);
michael@0 88 }
michael@0 89
michael@0 90 typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
michael@0 91
michael@0 92 // Find the next node in the DOM tree in preorder.
michael@0 93 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
michael@0 94 // why we can't just use GetNextNode here, sadly.
michael@0 95 static nsINode*
michael@0 96 FindNextNode(nsINode* aNode, nsINode* aRoot,
michael@0 97 OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure)
michael@0 98 {
michael@0 99 NS_PRECONDITION(aNode, "Null starting node?");
michael@0 100
michael@0 101 nsINode* next = aNode->GetFirstChild();
michael@0 102 if (next)
michael@0 103 return next;
michael@0 104
michael@0 105 // Don't look at siblings or otherwise outside of aRoot
michael@0 106 if (aNode == aRoot)
michael@0 107 return nullptr;
michael@0 108
michael@0 109 next = aNode->GetNextSibling();
michael@0 110 if (next)
michael@0 111 return next;
michael@0 112
michael@0 113 // Go up
michael@0 114 for (;;) {
michael@0 115 if (aOnLeaveNode) {
michael@0 116 aOnLeaveNode(aNode, aClosure);
michael@0 117 }
michael@0 118
michael@0 119 next = aNode->GetParent();
michael@0 120 if (next == aRoot || ! next)
michael@0 121 return nullptr;
michael@0 122 aNode = next;
michael@0 123
michael@0 124 next = aNode->GetNextSibling();
michael@0 125 if (next)
michael@0 126 return next;
michael@0 127 }
michael@0 128 }
michael@0 129
michael@0 130 // aNode is not a text node. Find the first text node starting at aNode/aOffset
michael@0 131 // in a preorder DOM traversal.
michael@0 132 static nsINode*
michael@0 133 FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot)
michael@0 134 {
michael@0 135 NS_PRECONDITION(aNode, "Null starting node?");
michael@0 136 NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");
michael@0 137
michael@0 138 nsINode* checkNode;
michael@0 139 // Need to start at the aOffset'th child
michael@0 140 nsIContent* child = aNode->GetChildAt(aOffset);
michael@0 141
michael@0 142 if (child) {
michael@0 143 checkNode = child;
michael@0 144 } else {
michael@0 145 // aOffset was beyond the end of the child list.
michael@0 146 // goto next node after the last descendant of aNode in
michael@0 147 // a preorder DOM traversal.
michael@0 148 checkNode = aNode->GetNextNonChildNode(aRoot);
michael@0 149 }
michael@0 150
michael@0 151 while (checkNode && !IsTextNode(checkNode)) {
michael@0 152 checkNode = checkNode->GetNextNode(aRoot);
michael@0 153 }
michael@0 154 return checkNode;
michael@0 155 }
michael@0 156
michael@0 157 // mozInlineSpellWordUtil::SetEnd
michael@0 158 //
michael@0 159 // We have two ranges "hard" and "soft". The hard boundary is simply
michael@0 160 // the scope of the root node. The soft boundary is that which is set
michael@0 161 // by the caller of this class by calling this function. If this function is
michael@0 162 // not called, the soft boundary is the same as the hard boundary.
michael@0 163 //
michael@0 164 // When we reach the soft boundary (mSoftEnd), we keep
michael@0 165 // going until we reach the end of a word. This allows the caller to set the
michael@0 166 // end of the range to anything, and we will always check whole multiples of
michael@0 167 // words. When we reach the hard boundary we stop no matter what.
michael@0 168 //
michael@0 169 // There is no beginning soft boundary. This is because we only go to the
michael@0 170 // previous node once, when finding the previous word boundary in
michael@0 171 // SetPosition(). You might think of the soft boundary as being this initial
michael@0 172 // position.
michael@0 173
michael@0 174 nsresult
michael@0 175 mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset)
michael@0 176 {
michael@0 177 NS_PRECONDITION(aEndNode, "Null end node?");
michael@0 178
michael@0 179 NS_ASSERTION(mRootNode, "Not initialized");
michael@0 180
michael@0 181 InvalidateWords();
michael@0 182
michael@0 183 if (!IsTextNode(aEndNode)) {
michael@0 184 // End at the start of the first text node after aEndNode/aEndOffset.
michael@0 185 aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
michael@0 186 aEndOffset = 0;
michael@0 187 }
michael@0 188 mSoftEnd = NodeOffset(aEndNode, aEndOffset);
michael@0 189 return NS_OK;
michael@0 190 }
michael@0 191
michael@0 192 nsresult
michael@0 193 mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset)
michael@0 194 {
michael@0 195 InvalidateWords();
michael@0 196
michael@0 197 if (!IsTextNode(aNode)) {
michael@0 198 // Start at the start of the first text node after aNode/aOffset.
michael@0 199 aNode = FindNextTextNode(aNode, aOffset, mRootNode);
michael@0 200 aOffset = 0;
michael@0 201 }
michael@0 202 mSoftBegin = NodeOffset(aNode, aOffset);
michael@0 203
michael@0 204 EnsureWords();
michael@0 205
michael@0 206 int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
michael@0 207 if (textOffset < 0)
michael@0 208 return NS_OK;
michael@0 209 mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
michael@0 210 return NS_OK;
michael@0 211 }
michael@0 212
michael@0 213 void
michael@0 214 mozInlineSpellWordUtil::EnsureWords()
michael@0 215 {
michael@0 216 if (mSoftTextValid)
michael@0 217 return;
michael@0 218 BuildSoftText();
michael@0 219 BuildRealWords();
michael@0 220 mSoftTextValid = true;
michael@0 221 }
michael@0 222
michael@0 223 nsresult
michael@0 224 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange)
michael@0 225 {
michael@0 226 NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
michael@0 227 NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
michael@0 228 return MakeRange(begin, end, aRange);
michael@0 229 }
michael@0 230
michael@0 231 // mozInlineSpellWordUtil::GetRangeForWord
michael@0 232
michael@0 233 nsresult
michael@0 234 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
michael@0 235 int32_t aWordOffset,
michael@0 236 nsRange** aRange)
michael@0 237 {
michael@0 238 // Set our soft end and start
michael@0 239 nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode);
michael@0 240 NodeOffset pt = NodeOffset(wordNode, aWordOffset);
michael@0 241
michael@0 242 InvalidateWords();
michael@0 243 mSoftBegin = mSoftEnd = pt;
michael@0 244 EnsureWords();
michael@0 245
michael@0 246 int32_t offset = MapDOMPositionToSoftTextOffset(pt);
michael@0 247 if (offset < 0)
michael@0 248 return MakeRange(pt, pt, aRange);
michael@0 249 int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
michael@0 250 if (wordIndex < 0)
michael@0 251 return MakeRange(pt, pt, aRange);
michael@0 252 return MakeRangeForWord(mRealWords[wordIndex], aRange);
michael@0 253 }
michael@0 254
michael@0 255 // This is to fix characters that the spellchecker may not like
michael@0 256 static void
michael@0 257 NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput)
michael@0 258 {
michael@0 259 aOutput.Truncate();
michael@0 260 for (int32_t i = 0; i < aLen; i++) {
michael@0 261 char16_t ch = aInput.CharAt(i + aPos);
michael@0 262
michael@0 263 // remove ignorable characters from the word
michael@0 264 if (IsIgnorableCharacter(ch))
michael@0 265 continue;
michael@0 266
michael@0 267 // the spellchecker doesn't handle curly apostrophes in all languages
michael@0 268 if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
michael@0 269 ch = '\'';
michael@0 270 }
michael@0 271
michael@0 272 aOutput.Append(ch);
michael@0 273 }
michael@0 274 }
michael@0 275
michael@0 276 // mozInlineSpellWordUtil::GetNextWord
michael@0 277 //
michael@0 278 // FIXME-optimization: we shouldn't have to generate a range every single
michael@0 279 // time. It would be better if the inline spellchecker didn't require a
michael@0 280 // range unless the word was misspelled. This may or may not be possible.
michael@0 281
michael@0 282 nsresult
michael@0 283 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange,
michael@0 284 bool* aSkipChecking)
michael@0 285 {
michael@0 286 #ifdef DEBUG_SPELLCHECK
michael@0 287 printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
michael@0 288 #endif
michael@0 289
michael@0 290 if (mNextWordIndex < 0 ||
michael@0 291 mNextWordIndex >= int32_t(mRealWords.Length())) {
michael@0 292 mNextWordIndex = -1;
michael@0 293 *aRange = nullptr;
michael@0 294 *aSkipChecking = true;
michael@0 295 return NS_OK;
michael@0 296 }
michael@0 297
michael@0 298 const RealWord& word = mRealWords[mNextWordIndex];
michael@0 299 nsresult rv = MakeRangeForWord(word, aRange);
michael@0 300 NS_ENSURE_SUCCESS(rv, rv);
michael@0 301 ++mNextWordIndex;
michael@0 302 *aSkipChecking = !word.mCheckableWord;
michael@0 303 ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
michael@0 304
michael@0 305 #ifdef DEBUG_SPELLCHECK
michael@0 306 printf("GetNextWord returning: %s (skip=%d)\n",
michael@0 307 NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
michael@0 308 #endif
michael@0 309
michael@0 310 return NS_OK;
michael@0 311 }
michael@0 312
michael@0 313 // mozInlineSpellWordUtil::MakeRange
michael@0 314 //
michael@0 315 // Convenience function for creating a range over the current document.
michael@0 316
michael@0 317 nsresult
michael@0 318 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
michael@0 319 nsRange** aRange)
michael@0 320 {
michael@0 321 NS_ENSURE_ARG_POINTER(aBegin.mNode);
michael@0 322 if (!mDOMDocument)
michael@0 323 return NS_ERROR_NOT_INITIALIZED;
michael@0 324
michael@0 325 nsRefPtr<nsRange> range = new nsRange(aBegin.mNode);
michael@0 326 nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset,
michael@0 327 aEnd.mNode, aEnd.mOffset);
michael@0 328 NS_ENSURE_SUCCESS(rv, rv);
michael@0 329 range.forget(aRange);
michael@0 330
michael@0 331 return NS_OK;
michael@0 332 }
michael@0 333
michael@0 334 /*********** DOM text extraction ************/
michael@0 335
michael@0 336 // IsDOMWordSeparator
michael@0 337 //
michael@0 338 // Determines if the given character should be considered as a DOM Word
michael@0 339 // separator. Basically, this is whitespace, although it could also have
michael@0 340 // certain punctuation that we know ALWAYS breaks words. This is important.
michael@0 341 // For example, we can't have any punctuation that could appear in a URL
michael@0 342 // or email address in this, because those need to always fit into a single
michael@0 343 // DOM word.
michael@0 344
static bool
IsDOMWordSeparator(char16_t ch)
{
  switch (ch) {
    // simple spaces
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    // complex (non-ASCII) spaces
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2002:  // EN SPACE
    case 0x2003:  // EM SPACE
    case 0x2009:  // THIN SPACE
    case 0x3000:  // IDEOGRAPHIC SPACE
      return true;

    // everything else is not a space
    default:
      return false;
  }
}
michael@0 364
michael@0 365 static inline bool
michael@0 366 IsBRElement(nsINode* aNode)
michael@0 367 {
michael@0 368 return aNode->IsElement() &&
michael@0 369 aNode->AsElement()->IsHTML(nsGkAtoms::br);
michael@0 370 }
michael@0 371
michael@0 372 /**
michael@0 373 * Given a TextNode, checks to see if there's a DOM word separator before
michael@0 374 * aBeforeOffset within it. This function does not modify aSeparatorOffset when
michael@0 375 * it returns false.
michael@0 376 *
michael@0 377 * @param aNode the TextNode to check.
michael@0 378 * @param aBeforeOffset the offset in the TextNode before which we will search
michael@0 379 * for the DOM separator. You can pass INT32_MAX to search the entire
michael@0 380 * length of the string.
michael@0 381 * @param aSeparatorOffset will be set to the offset of the first separator it
michael@0 382 * encounters. Will not be written to if no separator is found.
michael@0 383 * @returns True if it found a separator.
michael@0 384 */
michael@0 385 static bool
michael@0 386 TextNodeContainsDOMWordSeparator(nsINode* aNode,
michael@0 387 int32_t aBeforeOffset,
michael@0 388 int32_t* aSeparatorOffset)
michael@0 389 {
michael@0 390 // aNode is actually an nsIContent, since it's eTEXT
michael@0 391 nsIContent* content = static_cast<nsIContent*>(aNode);
michael@0 392 const nsTextFragment* textFragment = content->GetText();
michael@0 393 NS_ASSERTION(textFragment, "Where is our text?");
michael@0 394 for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) {
michael@0 395 if (IsDOMWordSeparator(textFragment->CharAt(i))) {
michael@0 396 // Be greedy, find as many separators as we can
michael@0 397 for (int32_t j = i - 1; j >= 0; --j) {
michael@0 398 if (IsDOMWordSeparator(textFragment->CharAt(j))) {
michael@0 399 i = j;
michael@0 400 } else {
michael@0 401 break;
michael@0 402 }
michael@0 403 }
michael@0 404 *aSeparatorOffset = i;
michael@0 405 return true;
michael@0 406 }
michael@0 407 }
michael@0 408 return false;
michael@0 409 }
michael@0 410
michael@0 411 /**
michael@0 412 * Check if there's a DOM word separator before aBeforeOffset in this node.
michael@0 413 * Always returns true if it's a BR element.
michael@0 414 * aSeparatorOffset is set to the index of the first character in the last
michael@0 415 * separator if any is found (0 for BR elements).
michael@0 416 *
michael@0 417 * This function does not modify aSeparatorOffset when it returns false.
michael@0 418 */
michael@0 419 static bool
michael@0 420 ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
michael@0 421 int32_t* aSeparatorOffset)
michael@0 422 {
michael@0 423 if (IsBRElement(aNode)) {
michael@0 424 *aSeparatorOffset = 0;
michael@0 425 return true;
michael@0 426 }
michael@0 427
michael@0 428 if (!IsTextNode(aNode))
michael@0 429 return false;
michael@0 430
michael@0 431 return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset,
michael@0 432 aSeparatorOffset);
michael@0 433 }
michael@0 434
michael@0 435 static bool
michael@0 436 IsBreakElement(nsINode* aNode)
michael@0 437 {
michael@0 438 if (!aNode->IsElement()) {
michael@0 439 return false;
michael@0 440 }
michael@0 441
michael@0 442 dom::Element *element = aNode->AsElement();
michael@0 443
michael@0 444 if (element->IsHTML(nsGkAtoms::br))
michael@0 445 return true;
michael@0 446
michael@0 447 // If we don't have a frame, we don't consider ourselves a break
michael@0 448 // element. In particular, words can span us.
michael@0 449 if (!element->GetPrimaryFrame())
michael@0 450 return false;
michael@0 451
michael@0 452 // Anything that's not an inline element is a break element.
michael@0 453 // XXXbz should replaced inlines be break elements, though?
michael@0 454 return element->GetPrimaryFrame()->StyleDisplay()->mDisplay !=
michael@0 455 NS_STYLE_DISPLAY_INLINE;
michael@0 456 }
michael@0 457
michael@0 458 struct CheckLeavingBreakElementClosure {
michael@0 459 bool mLeftBreakElement;
michael@0 460 };
michael@0 461
michael@0 462 static void
michael@0 463 CheckLeavingBreakElement(nsINode* aNode, void* aClosure)
michael@0 464 {
michael@0 465 CheckLeavingBreakElementClosure* cl =
michael@0 466 static_cast<CheckLeavingBreakElementClosure*>(aClosure);
michael@0 467 if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
michael@0 468 cl->mLeftBreakElement = true;
michael@0 469 }
michael@0 470 }
michael@0 471
michael@0 472 void
michael@0 473 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
michael@0 474 {
michael@0 475 nsAutoString result;
michael@0 476 ::NormalizeWord(aWord, 0, aWord.Length(), result);
michael@0 477 aWord = result;
michael@0 478 }
michael@0 479
michael@0 480 void
michael@0 481 mozInlineSpellWordUtil::BuildSoftText()
michael@0 482 {
michael@0 483 // First we have to work backwards from mSoftStart to find a text node
michael@0 484 // containing a DOM word separator, a non-inline-element
michael@0 485 // boundary, or the hard start node. That's where we'll start building the
michael@0 486 // soft string from.
michael@0 487 nsINode* node = mSoftBegin.mNode;
michael@0 488 int32_t firstOffsetInNode = 0;
michael@0 489 int32_t checkBeforeOffset = mSoftBegin.mOffset;
michael@0 490 while (node) {
michael@0 491 if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
michael@0 492 if (node == mSoftBegin.mNode) {
michael@0 493 // If we find a word separator on the first node, look at the preceding
michael@0 494 // word on the text node as well.
michael@0 495 int32_t newOffset = 0;
michael@0 496 if (firstOffsetInNode > 0) {
michael@0 497 // Try to find the previous word boundary in the current node. If
michael@0 498 // we can't find one, start checking previous sibling nodes (if any
michael@0 499 // adjacent ones exist) to see if we can find any text nodes with
michael@0 500 // DOM word separators. We bail out as soon as we see a node that is
michael@0 501 // not a text node, or we run out of previous sibling nodes. In the
michael@0 502 // event that we simply cannot find any preceding word separator, the
michael@0 503 // offset is set to 0, and the soft text beginning node is set to the
michael@0 504 // "most previous" text node before the original starting node, or
michael@0 505 // kept at the original starting node if no previous text nodes exist.
michael@0 506 if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
michael@0 507 &newOffset)) {
michael@0 508 nsINode* prevNode = node->GetPreviousSibling();
michael@0 509 while (prevNode && IsTextNode(prevNode)) {
michael@0 510 mSoftBegin.mNode = prevNode;
michael@0 511 if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,
michael@0 512 &newOffset)) {
michael@0 513 break;
michael@0 514 }
michael@0 515 prevNode = prevNode->GetPreviousSibling();
michael@0 516 }
michael@0 517 }
michael@0 518 }
michael@0 519 firstOffsetInNode = newOffset;
michael@0 520 mSoftBegin.mOffset = newOffset;
michael@0 521 }
michael@0 522 break;
michael@0 523 }
michael@0 524 checkBeforeOffset = INT32_MAX;
michael@0 525 if (IsBreakElement(node)) {
michael@0 526 // Since GetPreviousContent follows tree *preorder*, we're about to traverse
michael@0 527 // up out of 'node'. Since node induces breaks (e.g., it's a block),
michael@0 528 // don't bother trying to look outside it, just stop now.
michael@0 529 break;
michael@0 530 }
michael@0 531 // GetPreviousContent below expects mRootNode to be an ancestor of node.
michael@0 532 if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {
michael@0 533 break;
michael@0 534 }
michael@0 535 node = node->GetPreviousContent(mRootNode);
michael@0 536 }
michael@0 537
michael@0 538 // Now build up the string moving forward through the DOM until we reach
michael@0 539 // the soft end and *then* see a DOM word separator, a non-inline-element
michael@0 540 // boundary, or the hard end node.
michael@0 541 mSoftText.Truncate();
michael@0 542 mSoftTextDOMMapping.Clear();
michael@0 543 bool seenSoftEnd = false;
michael@0 544 // Leave this outside the loop so large heap string allocations can be reused
michael@0 545 // across iterations
michael@0 546 while (node) {
michael@0 547 if (node == mSoftEnd.mNode) {
michael@0 548 seenSoftEnd = true;
michael@0 549 }
michael@0 550
michael@0 551 bool exit = false;
michael@0 552 if (IsTextNode(node)) {
michael@0 553 nsIContent* content = static_cast<nsIContent*>(node);
michael@0 554 NS_ASSERTION(content, "Where is our content?");
michael@0 555 const nsTextFragment* textFragment = content->GetText();
michael@0 556 NS_ASSERTION(textFragment, "Where is our text?");
michael@0 557 int32_t lastOffsetInNode = textFragment->GetLength();
michael@0 558
michael@0 559 if (seenSoftEnd) {
michael@0 560 // check whether we can stop after this
michael@0 561 for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
michael@0 562 i < int32_t(textFragment->GetLength()); ++i) {
michael@0 563 if (IsDOMWordSeparator(textFragment->CharAt(i))) {
michael@0 564 exit = true;
michael@0 565 // stop at the first separator after the soft end point
michael@0 566 lastOffsetInNode = i;
michael@0 567 break;
michael@0 568 }
michael@0 569 }
michael@0 570 }
michael@0 571
michael@0 572 if (firstOffsetInNode < lastOffsetInNode) {
michael@0 573 int32_t len = lastOffsetInNode - firstOffsetInNode;
michael@0 574 mSoftTextDOMMapping.AppendElement(
michael@0 575 DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
michael@0 576
michael@0 577 bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,
michael@0 578 mozilla::fallible_t());
michael@0 579 if (!ok) {
michael@0 580 // probably out of memory, remove from mSoftTextDOMMapping
michael@0 581 mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1);
michael@0 582 exit = true;
michael@0 583 }
michael@0 584 }
michael@0 585
michael@0 586 firstOffsetInNode = 0;
michael@0 587 }
michael@0 588
michael@0 589 if (exit)
michael@0 590 break;
michael@0 591
michael@0 592 CheckLeavingBreakElementClosure closure = { false };
michael@0 593 node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
michael@0 594 if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
michael@0 595 // We left, or are entering, a break element (e.g., block). Maybe we can
michael@0 596 // stop now.
michael@0 597 if (seenSoftEnd)
michael@0 598 break;
michael@0 599 // Record the break
michael@0 600 mSoftText.Append(' ');
michael@0 601 }
michael@0 602 }
michael@0 603
michael@0 604 #ifdef DEBUG_SPELLCHECK
michael@0 605 printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
michael@0 606 #endif
michael@0 607 }
michael@0 608
michael@0 609 void
michael@0 610 mozInlineSpellWordUtil::BuildRealWords()
michael@0 611 {
michael@0 612 // This is pretty simple. We just have to walk mSoftText, tokenizing it
michael@0 613 // into "real words".
michael@0 614 // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
michael@0 615 // SplitDOMWord on each of those DOM words
michael@0 616 int32_t wordStart = -1;
michael@0 617 mRealWords.Clear();
michael@0 618 for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) {
michael@0 619 if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
michael@0 620 if (wordStart >= 0) {
michael@0 621 SplitDOMWord(wordStart, i);
michael@0 622 wordStart = -1;
michael@0 623 }
michael@0 624 } else {
michael@0 625 if (wordStart < 0) {
michael@0 626 wordStart = i;
michael@0 627 }
michael@0 628 }
michael@0 629 }
michael@0 630 if (wordStart >= 0) {
michael@0 631 SplitDOMWord(wordStart, mSoftText.Length());
michael@0 632 }
michael@0 633 }
michael@0 634
michael@0 635 /*********** DOM/realwords<->mSoftText mapping functions ************/
michael@0 636
michael@0 637 int32_t
michael@0 638 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
michael@0 639 {
michael@0 640 if (!mSoftTextValid) {
michael@0 641 NS_ERROR("Soft text must be valid if we're to map into it");
michael@0 642 return -1;
michael@0 643 }
michael@0 644
michael@0 645 for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) {
michael@0 646 const DOMTextMapping& map = mSoftTextDOMMapping[i];
michael@0 647 if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
michael@0 648 // Allow offsets at either end of the string, in particular, allow the
michael@0 649 // offset that's at the end of the contributed string
michael@0 650 int32_t offsetInContributedString =
michael@0 651 aNodeOffset.mOffset - map.mNodeOffset.mOffset;
michael@0 652 if (offsetInContributedString >= 0 &&
michael@0 653 offsetInContributedString <= map.mLength)
michael@0 654 return map.mSoftTextOffset + offsetInContributedString;
michael@0 655 return -1;
michael@0 656 }
michael@0 657 }
michael@0 658 return -1;
michael@0 659 }
michael@0 660
michael@0 661 mozInlineSpellWordUtil::NodeOffset
michael@0 662 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,
michael@0 663 DOMMapHint aHint)
michael@0 664 {
michael@0 665 NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
michael@0 666 if (!mSoftTextValid)
michael@0 667 return NodeOffset(nullptr, -1);
michael@0 668
michael@0 669 // The invariant is that the range start..end includes the last mapping,
michael@0 670 // if any, such that mSoftTextOffset <= aSoftTextOffset
michael@0 671 int32_t start = 0;
michael@0 672 int32_t end = mSoftTextDOMMapping.Length();
michael@0 673 while (end - start >= 2) {
michael@0 674 int32_t mid = (start + end)/2;
michael@0 675 const DOMTextMapping& map = mSoftTextDOMMapping[mid];
michael@0 676 if (map.mSoftTextOffset > aSoftTextOffset) {
michael@0 677 end = mid;
michael@0 678 } else {
michael@0 679 start = mid;
michael@0 680 }
michael@0 681 }
michael@0 682
michael@0 683 if (start >= end)
michael@0 684 return NodeOffset(nullptr, -1);
michael@0 685
michael@0 686 // 'start' is now the last mapping, if any, such that
michael@0 687 // mSoftTextOffset <= aSoftTextOffset.
michael@0 688 // If we're doing HINT_END, then we may want to return the end of the
michael@0 689 // the previous mapping instead of the start of this mapping
michael@0 690 if (aHint == HINT_END && start > 0) {
michael@0 691 const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];
michael@0 692 if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
michael@0 693 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
michael@0 694 }
michael@0 695
michael@0 696 // We allow ourselves to return the end of this mapping even if we're
michael@0 697 // doing HINT_START. This will only happen if there is no mapping which this
michael@0 698 // point is the start of. I'm not 100% sure this is OK...
michael@0 699 const DOMTextMapping& map = mSoftTextDOMMapping[start];
michael@0 700 int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
michael@0 701 if (offset >= 0 && offset <= map.mLength)
michael@0 702 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
michael@0 703
michael@0 704 return NodeOffset(nullptr, -1);
michael@0 705 }
michael@0 706
michael@0 707 int32_t
michael@0 708 mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset,
michael@0 709 DOMMapHint aHint, bool aSearchForward)
michael@0 710 {
michael@0 711 NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
michael@0 712 if (!mSoftTextValid)
michael@0 713 return -1;
michael@0 714
michael@0 715 // The invariant is that the range start..end includes the last word,
michael@0 716 // if any, such that mSoftTextOffset <= aSoftTextOffset
michael@0 717 int32_t start = 0;
michael@0 718 int32_t end = mRealWords.Length();
michael@0 719 while (end - start >= 2) {
michael@0 720 int32_t mid = (start + end)/2;
michael@0 721 const RealWord& word = mRealWords[mid];
michael@0 722 if (word.mSoftTextOffset > aSoftTextOffset) {
michael@0 723 end = mid;
michael@0 724 } else {
michael@0 725 start = mid;
michael@0 726 }
michael@0 727 }
michael@0 728
michael@0 729 if (start >= end)
michael@0 730 return -1;
michael@0 731
michael@0 732 // 'start' is now the last word, if any, such that
michael@0 733 // mSoftTextOffset <= aSoftTextOffset.
michael@0 734 // If we're doing HINT_END, then we may want to return the end of the
michael@0 735 // the previous word instead of the start of this word
michael@0 736 if (aHint == HINT_END && start > 0) {
michael@0 737 const RealWord& word = mRealWords[start - 1];
michael@0 738 if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
michael@0 739 return start - 1;
michael@0 740 }
michael@0 741
michael@0 742 // We allow ourselves to return the end of this word even if we're
michael@0 743 // doing HINT_START. This will only happen if there is no word which this
michael@0 744 // point is the start of. I'm not 100% sure this is OK...
michael@0 745 const RealWord& word = mRealWords[start];
michael@0 746 int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
michael@0 747 if (offset >= 0 && offset <= word.mLength)
michael@0 748 return start;
michael@0 749
michael@0 750 if (aSearchForward) {
michael@0 751 if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
michael@0 752 // All words have mSoftTextOffset > aSoftTextOffset
michael@0 753 return 0;
michael@0 754 }
michael@0 755 // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
michael@0 756 // Word start+1, if it exists, will be the first with
michael@0 757 // mSoftTextOffset > aSoftTextOffset.
michael@0 758 if (start + 1 < int32_t(mRealWords.Length()))
michael@0 759 return start + 1;
michael@0 760 }
michael@0 761
michael@0 762 return -1;
michael@0 763 }
michael@0 764
michael@0 765 /*********** Word Splitting ************/
michael@0 766
// Classification assigned to a position in the DOM word while it is being
// split into real words.
enum CharClass {
  // The character belongs to a word.
  CHAR_CLASS_WORD,
  // The character separates words.
  CHAR_CLASS_SEPARATOR,
  // The current position has moved past the last character.
  CHAR_CLASS_END_OF_INPUT
};
michael@0 772
michael@0 773 // Encapsulates DOM-word to real-word splitting
michael@0 774 struct MOZ_STACK_CLASS WordSplitState
michael@0 775 {
michael@0 776 mozInlineSpellWordUtil* mWordUtil;
michael@0 777 const nsDependentSubstring mDOMWordText;
michael@0 778 int32_t mDOMWordOffset;
michael@0 779 CharClass mCurCharClass;
michael@0 780
michael@0 781 WordSplitState(mozInlineSpellWordUtil* aWordUtil,
michael@0 782 const nsString& aString, int32_t aStart, int32_t aLen)
michael@0 783 : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
michael@0 784 mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
michael@0 785
michael@0 786 CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
michael@0 787 void Advance();
michael@0 788 void AdvanceThroughSeparators();
michael@0 789 void AdvanceThroughWord();
michael@0 790
michael@0 791 // Finds special words like email addresses and URLs that may start at the
michael@0 792 // current position, and returns their length, or 0 if not found. This allows
michael@0 793 // arbitrary word breaking rules to be used for these special entities, as
michael@0 794 // long as they can not contain whitespace.
michael@0 795 bool IsSpecialWord();
michael@0 796
michael@0 797 // Similar to IsSpecialWord except that this takes a split word as
michael@0 798 // input. This checks for things that do not require special word-breaking
michael@0 799 // rules.
michael@0 800 bool ShouldSkipWord(int32_t aStart, int32_t aLength);
michael@0 801 };
michael@0 802
michael@0 803 // WordSplitState::ClassifyCharacter
michael@0 804
michael@0 805 CharClass
michael@0 806 WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
michael@0 807 {
michael@0 808 NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
michael@0 809 "Index out of range");
michael@0 810 if (aIndex == int32_t(mDOMWordText.Length()))
michael@0 811 return CHAR_CLASS_SEPARATOR;
michael@0 812
michael@0 813 // this will classify the character, we want to treat "ignorable" characters
michael@0 814 // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
michael@0 815 nsIUGenCategory::nsUGenCategory
michael@0 816 charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
michael@0 817 if (charCategory == nsIUGenCategory::kLetter ||
michael@0 818 IsIgnorableCharacter(mDOMWordText[aIndex]) ||
michael@0 819 mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
michael@0 820 mDOMWordText[aIndex] == 0x200D /* ZWJ */)
michael@0 821 return CHAR_CLASS_WORD;
michael@0 822
michael@0 823 // If conditional punctuation is surrounded immediately on both sides by word
michael@0 824 // characters it also counts as a word character.
michael@0 825 if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
michael@0 826 if (!aRecurse) {
michael@0 827 // not allowed to look around, this punctuation counts like a separator
michael@0 828 return CHAR_CLASS_SEPARATOR;
michael@0 829 }
michael@0 830
michael@0 831 // check the left-hand character
michael@0 832 if (aIndex == 0)
michael@0 833 return CHAR_CLASS_SEPARATOR;
michael@0 834 if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
michael@0 835 return CHAR_CLASS_SEPARATOR;
michael@0 836 // If the previous charatcer is a word-char, make sure that it's not a
michael@0 837 // special dot character.
michael@0 838 if (mDOMWordText[aIndex - 1] == '.')
michael@0 839 return CHAR_CLASS_SEPARATOR;
michael@0 840
michael@0 841 // now we know left char is a word-char, check the right-hand character
michael@0 842 if (aIndex == int32_t(mDOMWordText.Length()) - 1)
michael@0 843 return CHAR_CLASS_SEPARATOR;
michael@0 844 if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
michael@0 845 return CHAR_CLASS_SEPARATOR;
michael@0 846 // If the next charatcer is a word-char, make sure that it's not a
michael@0 847 // special dot character.
michael@0 848 if (mDOMWordText[aIndex + 1] == '.')
michael@0 849 return CHAR_CLASS_SEPARATOR;
michael@0 850
michael@0 851 // char on either side is a word, this counts as a word
michael@0 852 return CHAR_CLASS_WORD;
michael@0 853 }
michael@0 854
michael@0 855 // The dot character, if appearing at the end of a word, should
michael@0 856 // be considered part of that word. Example: "etc.", or
michael@0 857 // abbreviations
michael@0 858 if (aIndex > 0 &&
michael@0 859 mDOMWordText[aIndex] == '.' &&
michael@0 860 mDOMWordText[aIndex - 1] != '.' &&
michael@0 861 ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
michael@0 862 return CHAR_CLASS_WORD;
michael@0 863 }
michael@0 864
michael@0 865 // all other punctuation
michael@0 866 if (charCategory == nsIUGenCategory::kSeparator ||
michael@0 867 charCategory == nsIUGenCategory::kOther ||
michael@0 868 charCategory == nsIUGenCategory::kPunctuation ||
michael@0 869 charCategory == nsIUGenCategory::kSymbol) {
michael@0 870 // Don't break on hyphens, as hunspell handles them on its own.
michael@0 871 if (aIndex > 0 &&
michael@0 872 mDOMWordText[aIndex] == '-' &&
michael@0 873 mDOMWordText[aIndex - 1] != '-' &&
michael@0 874 ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
michael@0 875 // A hyphen is only meaningful as a separator inside a word
michael@0 876 // if the previous and next characters are a word character.
michael@0 877 if (aIndex == int32_t(mDOMWordText.Length()) - 1)
michael@0 878 return CHAR_CLASS_SEPARATOR;
michael@0 879 if (mDOMWordText[aIndex + 1] != '.' &&
michael@0 880 ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
michael@0 881 return CHAR_CLASS_WORD;
michael@0 882 }
michael@0 883 return CHAR_CLASS_SEPARATOR;
michael@0 884 }
michael@0 885
michael@0 886 // any other character counts as a word
michael@0 887 return CHAR_CLASS_WORD;
michael@0 888 }
michael@0 889
michael@0 890
michael@0 891 // WordSplitState::Advance
michael@0 892
michael@0 893 void
michael@0 894 WordSplitState::Advance()
michael@0 895 {
michael@0 896 NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
michael@0 897 NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
michael@0 898 "Length beyond end");
michael@0 899
michael@0 900 mDOMWordOffset ++;
michael@0 901 if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
michael@0 902 mCurCharClass = CHAR_CLASS_END_OF_INPUT;
michael@0 903 else
michael@0 904 mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
michael@0 905 }
michael@0 906
michael@0 907
michael@0 908 // WordSplitState::AdvanceThroughSeparators
michael@0 909
michael@0 910 void
michael@0 911 WordSplitState::AdvanceThroughSeparators()
michael@0 912 {
michael@0 913 while (mCurCharClass == CHAR_CLASS_SEPARATOR)
michael@0 914 Advance();
michael@0 915 }
michael@0 916
michael@0 917 // WordSplitState::AdvanceThroughWord
michael@0 918
michael@0 919 void
michael@0 920 WordSplitState::AdvanceThroughWord()
michael@0 921 {
michael@0 922 while (mCurCharClass == CHAR_CLASS_WORD)
michael@0 923 Advance();
michael@0 924 }
michael@0 925
michael@0 926
michael@0 927 // WordSplitState::IsSpecialWord
michael@0 928
michael@0 929 bool
michael@0 930 WordSplitState::IsSpecialWord()
michael@0 931 {
michael@0 932 // Search for email addresses. We simply define these as any sequence of
michael@0 933 // characters with an '@' character in the middle. The DOM word is already
michael@0 934 // split on whitepace, so we know that everything to the end is the address
michael@0 935 int32_t firstColon = -1;
michael@0 936 for (int32_t i = mDOMWordOffset;
michael@0 937 i < int32_t(mDOMWordText.Length()); i ++) {
michael@0 938 if (mDOMWordText[i] == '@') {
michael@0 939 // only accept this if there are unambiguous word characters (don't bother
michael@0 940 // recursing to disambiguate apostrophes) on each side. This prevents
michael@0 941 // classifying, e.g. "@home" as an email address
michael@0 942
michael@0 943 // Use this condition to only accept words with '@' in the middle of
michael@0 944 // them. It works, but the inlinespellcker doesn't like this. The problem
michael@0 945 // is that you type "fhsgfh@" that's a misspelled word followed by a
michael@0 946 // symbol, but when you type another letter "fhsgfh@g" that first word
michael@0 947 // need to be unmarked misspelled. It doesn't do this. it only checks the
michael@0 948 // current position for potentially removing a spelling range.
michael@0 949 if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
michael@0 950 i < (int32_t)mDOMWordText.Length() - 1 &&
michael@0 951 ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
michael@0 952 return true;
michael@0 953 }
michael@0 954 } else if (mDOMWordText[i] == ':' && firstColon < 0) {
michael@0 955 firstColon = i;
michael@0 956
michael@0 957 // If the first colon is followed by a slash, consider it a URL
michael@0 958 // This will catch things like asdf://foo.com
michael@0 959 if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
michael@0 960 mDOMWordText[firstColon + 1] == '/') {
michael@0 961 return true;
michael@0 962 }
michael@0 963 }
michael@0 964 }
michael@0 965
michael@0 966 // Check the text before the first colon against some known protocols. It
michael@0 967 // is impossible to check against all protocols, especially since you can
michael@0 968 // plug in new protocols. We also don't want to waste time here checking
michael@0 969 // against a lot of obscure protocols.
michael@0 970 if (firstColon > mDOMWordOffset) {
michael@0 971 nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
michael@0 972 firstColon - mDOMWordOffset));
michael@0 973 if (protocol.EqualsIgnoreCase("http") ||
michael@0 974 protocol.EqualsIgnoreCase("https") ||
michael@0 975 protocol.EqualsIgnoreCase("news") ||
michael@0 976 protocol.EqualsIgnoreCase("file") ||
michael@0 977 protocol.EqualsIgnoreCase("javascript") ||
michael@0 978 protocol.EqualsIgnoreCase("data") ||
michael@0 979 protocol.EqualsIgnoreCase("ftp")) {
michael@0 980 return true;
michael@0 981 }
michael@0 982 }
michael@0 983
michael@0 984 // not anything special
michael@0 985 return false;
michael@0 986 }
michael@0 987
michael@0 988 // WordSplitState::ShouldSkipWord
michael@0 989
michael@0 990 bool
michael@0 991 WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)
michael@0 992 {
michael@0 993 int32_t last = aStart + aLength;
michael@0 994
michael@0 995 // check to see if the word contains a digit
michael@0 996 for (int32_t i = aStart; i < last; i ++) {
michael@0 997 if (unicode::GetGenCategory(mDOMWordText[i]) == nsIUGenCategory::kNumber) {
michael@0 998 return true;
michael@0 999 }
michael@0 1000 }
michael@0 1001
michael@0 1002 // not special
michael@0 1003 return false;
michael@0 1004 }
michael@0 1005
michael@0 1006 // mozInlineSpellWordUtil::SplitDOMWord
michael@0 1007
michael@0 1008 void
michael@0 1009 mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd)
michael@0 1010 {
michael@0 1011 WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
michael@0 1012 state.mCurCharClass = state.ClassifyCharacter(0, true);
michael@0 1013
michael@0 1014 state.AdvanceThroughSeparators();
michael@0 1015 if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT &&
michael@0 1016 state.IsSpecialWord()) {
michael@0 1017 int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset;
michael@0 1018 mRealWords.AppendElement(
michael@0 1019 RealWord(aStart + state.mDOMWordOffset, specialWordLength, false));
michael@0 1020
michael@0 1021 return;
michael@0 1022 }
michael@0 1023
michael@0 1024 while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
michael@0 1025 state.AdvanceThroughSeparators();
michael@0 1026 if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)
michael@0 1027 break;
michael@0 1028
michael@0 1029 // save the beginning of the word
michael@0 1030 int32_t wordOffset = state.mDOMWordOffset;
michael@0 1031
michael@0 1032 // find the end of the word
michael@0 1033 state.AdvanceThroughWord();
michael@0 1034 int32_t wordLen = state.mDOMWordOffset - wordOffset;
michael@0 1035 mRealWords.AppendElement(
michael@0 1036 RealWord(aStart + wordOffset, wordLen,
michael@0 1037 !state.ShouldSkipWord(wordOffset, wordLen)));
michael@0 1038 }
michael@0 1039 }

mercurial