|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "mozInlineSpellWordUtil.h" |
|
7 #include "nsDebug.h" |
|
8 #include "nsIAtom.h" |
|
9 #include "nsComponentManagerUtils.h" |
|
10 #include "nsIDOMCSSStyleDeclaration.h" |
|
11 #include "nsIDOMElement.h" |
|
12 #include "nsIDOMRange.h" |
|
13 #include "nsIEditor.h" |
|
14 #include "nsIDOMNode.h" |
|
15 #include "nsIDOMHTMLBRElement.h" |
|
16 #include "nsUnicharUtilCIID.h" |
|
17 #include "nsUnicodeProperties.h" |
|
18 #include "nsServiceManagerUtils.h" |
|
19 #include "nsIContent.h" |
|
20 #include "nsTextFragment.h" |
|
21 #include "mozilla/dom/Element.h" |
|
22 #include "nsRange.h" |
|
23 #include "nsContentUtils.h" |
|
24 #include "nsIFrame.h" |
|
25 #include <algorithm> |
|
26 |
|
27 using namespace mozilla; |
|
28 |
|
// IsIgnorableCharacter
//
//    Reports whether ch is one of the characters that should be ignored in
//    input: U+00AD or U+1806.

inline bool IsIgnorableCharacter(char16_t ch)
{
  switch (ch) {
    case 0xAD:    // SOFT HYPHEN
    case 0x1806:  // MONGOLIAN TODO SOFT HYPHEN
      return true;
    default:
      return false;
  }
}
|
38 |
|
// IsConditionalPunctuation
//
//    Reports whether ch is a character (such as an apostrophe) that is part of
//    a word only when it has word characters on each side, and is punctuation
//    otherwise.

inline bool IsConditionalPunctuation(char16_t ch)
{
  switch (ch) {
    case '\'':    // APOSTROPHE
    case 0x2019:  // RIGHT SINGLE QUOTATION MARK
    case 0x00B7:  // MIDDLE DOT
      return true;
    default:
      return false;
  }
}
|
50 |
|
51 // mozInlineSpellWordUtil::Init |
|
52 |
|
53 nsresult |
|
54 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor) |
|
55 { |
|
56 nsresult rv; |
|
57 |
|
58 // getting the editor can fail commonly because the editor was detached, so |
|
59 // don't assert |
|
60 nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv); |
|
61 if (NS_FAILED(rv)) |
|
62 return rv; |
|
63 |
|
64 nsCOMPtr<nsIDOMDocument> domDoc; |
|
65 rv = editor->GetDocument(getter_AddRefs(domDoc)); |
|
66 NS_ENSURE_SUCCESS(rv, rv); |
|
67 NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER); |
|
68 |
|
69 mDOMDocument = domDoc; |
|
70 mDocument = do_QueryInterface(domDoc); |
|
71 |
|
72 // Find the root node for the editor. For contenteditable we'll need something |
|
73 // cleverer here. |
|
74 nsCOMPtr<nsIDOMElement> rootElt; |
|
75 rv = editor->GetRootElement(getter_AddRefs(rootElt)); |
|
76 NS_ENSURE_SUCCESS(rv, rv); |
|
77 |
|
78 nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt); |
|
79 mRootNode = rootNode; |
|
80 NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!"); |
|
81 return NS_OK; |
|
82 } |
|
83 |
|
84 static inline bool |
|
85 IsTextNode(nsINode* aNode) |
|
86 { |
|
87 return aNode->IsNodeOfType(nsINode::eTEXT); |
|
88 } |
|
89 |
|
90 typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure); |
|
91 |
|
92 // Find the next node in the DOM tree in preorder. |
|
93 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is |
|
94 // why we can't just use GetNextNode here, sadly. |
|
95 static nsINode* |
|
96 FindNextNode(nsINode* aNode, nsINode* aRoot, |
|
97 OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) |
|
98 { |
|
99 NS_PRECONDITION(aNode, "Null starting node?"); |
|
100 |
|
101 nsINode* next = aNode->GetFirstChild(); |
|
102 if (next) |
|
103 return next; |
|
104 |
|
105 // Don't look at siblings or otherwise outside of aRoot |
|
106 if (aNode == aRoot) |
|
107 return nullptr; |
|
108 |
|
109 next = aNode->GetNextSibling(); |
|
110 if (next) |
|
111 return next; |
|
112 |
|
113 // Go up |
|
114 for (;;) { |
|
115 if (aOnLeaveNode) { |
|
116 aOnLeaveNode(aNode, aClosure); |
|
117 } |
|
118 |
|
119 next = aNode->GetParent(); |
|
120 if (next == aRoot || ! next) |
|
121 return nullptr; |
|
122 aNode = next; |
|
123 |
|
124 next = aNode->GetNextSibling(); |
|
125 if (next) |
|
126 return next; |
|
127 } |
|
128 } |
|
129 |
|
130 // aNode is not a text node. Find the first text node starting at aNode/aOffset |
|
131 // in a preorder DOM traversal. |
|
132 static nsINode* |
|
133 FindNextTextNode(nsINode* aNode, int32_t aOffset, nsINode* aRoot) |
|
134 { |
|
135 NS_PRECONDITION(aNode, "Null starting node?"); |
|
136 NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node"); |
|
137 |
|
138 nsINode* checkNode; |
|
139 // Need to start at the aOffset'th child |
|
140 nsIContent* child = aNode->GetChildAt(aOffset); |
|
141 |
|
142 if (child) { |
|
143 checkNode = child; |
|
144 } else { |
|
145 // aOffset was beyond the end of the child list. |
|
146 // goto next node after the last descendant of aNode in |
|
147 // a preorder DOM traversal. |
|
148 checkNode = aNode->GetNextNonChildNode(aRoot); |
|
149 } |
|
150 |
|
151 while (checkNode && !IsTextNode(checkNode)) { |
|
152 checkNode = checkNode->GetNextNode(aRoot); |
|
153 } |
|
154 return checkNode; |
|
155 } |
|
156 |
|
157 // mozInlineSpellWordUtil::SetEnd |
|
158 // |
|
159 // We have two ranges "hard" and "soft". The hard boundary is simply |
|
160 // the scope of the root node. The soft boundary is that which is set |
|
161 // by the caller of this class by calling this function. If this function is |
|
162 // not called, the soft boundary is the same as the hard boundary. |
|
163 // |
|
164 // When we reach the soft boundary (mSoftEnd), we keep |
|
165 // going until we reach the end of a word. This allows the caller to set the |
|
166 // end of the range to anything, and we will always check whole multiples of |
|
167 // words. When we reach the hard boundary we stop no matter what. |
|
168 // |
|
169 // There is no beginning soft boundary. This is because we only go to the |
|
170 // previous node once, when finding the previous word boundary in |
|
171 // SetPosition(). You might think of the soft boundary as being this initial |
|
172 // position. |
|
173 |
|
174 nsresult |
|
175 mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, int32_t aEndOffset) |
|
176 { |
|
177 NS_PRECONDITION(aEndNode, "Null end node?"); |
|
178 |
|
179 NS_ASSERTION(mRootNode, "Not initialized"); |
|
180 |
|
181 InvalidateWords(); |
|
182 |
|
183 if (!IsTextNode(aEndNode)) { |
|
184 // End at the start of the first text node after aEndNode/aEndOffset. |
|
185 aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode); |
|
186 aEndOffset = 0; |
|
187 } |
|
188 mSoftEnd = NodeOffset(aEndNode, aEndOffset); |
|
189 return NS_OK; |
|
190 } |
|
191 |
|
192 nsresult |
|
193 mozInlineSpellWordUtil::SetPosition(nsINode* aNode, int32_t aOffset) |
|
194 { |
|
195 InvalidateWords(); |
|
196 |
|
197 if (!IsTextNode(aNode)) { |
|
198 // Start at the start of the first text node after aNode/aOffset. |
|
199 aNode = FindNextTextNode(aNode, aOffset, mRootNode); |
|
200 aOffset = 0; |
|
201 } |
|
202 mSoftBegin = NodeOffset(aNode, aOffset); |
|
203 |
|
204 EnsureWords(); |
|
205 |
|
206 int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin); |
|
207 if (textOffset < 0) |
|
208 return NS_OK; |
|
209 mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true); |
|
210 return NS_OK; |
|
211 } |
|
212 |
|
213 void |
|
214 mozInlineSpellWordUtil::EnsureWords() |
|
215 { |
|
216 if (mSoftTextValid) |
|
217 return; |
|
218 BuildSoftText(); |
|
219 BuildRealWords(); |
|
220 mSoftTextValid = true; |
|
221 } |
|
222 |
|
223 nsresult |
|
224 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange) |
|
225 { |
|
226 NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN); |
|
227 NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END); |
|
228 return MakeRange(begin, end, aRange); |
|
229 } |
|
230 |
|
231 // mozInlineSpellWordUtil::GetRangeForWord |
|
232 |
|
233 nsresult |
|
234 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode, |
|
235 int32_t aWordOffset, |
|
236 nsRange** aRange) |
|
237 { |
|
238 // Set our soft end and start |
|
239 nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode); |
|
240 NodeOffset pt = NodeOffset(wordNode, aWordOffset); |
|
241 |
|
242 InvalidateWords(); |
|
243 mSoftBegin = mSoftEnd = pt; |
|
244 EnsureWords(); |
|
245 |
|
246 int32_t offset = MapDOMPositionToSoftTextOffset(pt); |
|
247 if (offset < 0) |
|
248 return MakeRange(pt, pt, aRange); |
|
249 int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false); |
|
250 if (wordIndex < 0) |
|
251 return MakeRange(pt, pt, aRange); |
|
252 return MakeRangeForWord(mRealWords[wordIndex], aRange); |
|
253 } |
|
254 |
|
255 // This is to fix characters that the spellchecker may not like |
|
256 static void |
|
257 NormalizeWord(const nsSubstring& aInput, int32_t aPos, int32_t aLen, nsAString& aOutput) |
|
258 { |
|
259 aOutput.Truncate(); |
|
260 for (int32_t i = 0; i < aLen; i++) { |
|
261 char16_t ch = aInput.CharAt(i + aPos); |
|
262 |
|
263 // remove ignorable characters from the word |
|
264 if (IsIgnorableCharacter(ch)) |
|
265 continue; |
|
266 |
|
267 // the spellchecker doesn't handle curly apostrophes in all languages |
|
268 if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK |
|
269 ch = '\''; |
|
270 } |
|
271 |
|
272 aOutput.Append(ch); |
|
273 } |
|
274 } |
|
275 |
|
276 // mozInlineSpellWordUtil::GetNextWord |
|
277 // |
|
278 // FIXME-optimization: we shouldn't have to generate a range every single |
|
279 // time. It would be better if the inline spellchecker didn't require a |
|
280 // range unless the word was misspelled. This may or may not be possible. |
|
281 |
|
282 nsresult |
|
283 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange, |
|
284 bool* aSkipChecking) |
|
285 { |
|
286 #ifdef DEBUG_SPELLCHECK |
|
287 printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex); |
|
288 #endif |
|
289 |
|
290 if (mNextWordIndex < 0 || |
|
291 mNextWordIndex >= int32_t(mRealWords.Length())) { |
|
292 mNextWordIndex = -1; |
|
293 *aRange = nullptr; |
|
294 *aSkipChecking = true; |
|
295 return NS_OK; |
|
296 } |
|
297 |
|
298 const RealWord& word = mRealWords[mNextWordIndex]; |
|
299 nsresult rv = MakeRangeForWord(word, aRange); |
|
300 NS_ENSURE_SUCCESS(rv, rv); |
|
301 ++mNextWordIndex; |
|
302 *aSkipChecking = !word.mCheckableWord; |
|
303 ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText); |
|
304 |
|
305 #ifdef DEBUG_SPELLCHECK |
|
306 printf("GetNextWord returning: %s (skip=%d)\n", |
|
307 NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking); |
|
308 #endif |
|
309 |
|
310 return NS_OK; |
|
311 } |
|
312 |
|
313 // mozInlineSpellWordUtil::MakeRange |
|
314 // |
|
315 // Convenience function for creating a range over the current document. |
|
316 |
|
317 nsresult |
|
318 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd, |
|
319 nsRange** aRange) |
|
320 { |
|
321 NS_ENSURE_ARG_POINTER(aBegin.mNode); |
|
322 if (!mDOMDocument) |
|
323 return NS_ERROR_NOT_INITIALIZED; |
|
324 |
|
325 nsRefPtr<nsRange> range = new nsRange(aBegin.mNode); |
|
326 nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset, |
|
327 aEnd.mNode, aEnd.mOffset); |
|
328 NS_ENSURE_SUCCESS(rv, rv); |
|
329 range.forget(aRange); |
|
330 |
|
331 return NS_OK; |
|
332 } |
|
333 |
|
334 /*********** DOM text extraction ************/ |
|
335 |
|
// IsDOMWordSeparator
//
//    Determines if the given character should be considered as a DOM Word
//    separator. Basically, this is whitespace, although it could also have
//    certain punctuation that we know ALWAYS breaks words. This is important.
//    For example, we can't have any punctuation that could appear in a URL
//    or email address in this, because those need to always fit into a single
//    DOM word.

static bool
IsDOMWordSeparator(char16_t ch)
{
  switch (ch) {
    // simple spaces
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    // complex (non-ASCII) spaces
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2002:  // EN SPACE
    case 0x2003:  // EM SPACE
    case 0x2009:  // THIN SPACE
    case 0x3000:  // IDEOGRAPHIC SPACE
      return true;

    // otherwise not a space
    default:
      return false;
  }
}
|
364 |
|
365 static inline bool |
|
366 IsBRElement(nsINode* aNode) |
|
367 { |
|
368 return aNode->IsElement() && |
|
369 aNode->AsElement()->IsHTML(nsGkAtoms::br); |
|
370 } |
|
371 |
|
372 /** |
|
373 * Given a TextNode, checks to see if there's a DOM word separator before |
|
374 * aBeforeOffset within it. This function does not modify aSeparatorOffset when |
|
375 * it returns false. |
|
376 * |
|
377 * @param aNode the TextNode to check. |
|
378 * @param aBeforeOffset the offset in the TextNode before which we will search |
|
379 * for the DOM separator. You can pass INT32_MAX to search the entire |
|
380 * length of the string. |
|
381 * @param aSeparatorOffset will be set to the offset of the first separator it |
|
382 * encounters. Will not be written to if no separator is found. |
|
383 * @returns True if it found a separator. |
|
384 */ |
|
385 static bool |
|
386 TextNodeContainsDOMWordSeparator(nsINode* aNode, |
|
387 int32_t aBeforeOffset, |
|
388 int32_t* aSeparatorOffset) |
|
389 { |
|
390 // aNode is actually an nsIContent, since it's eTEXT |
|
391 nsIContent* content = static_cast<nsIContent*>(aNode); |
|
392 const nsTextFragment* textFragment = content->GetText(); |
|
393 NS_ASSERTION(textFragment, "Where is our text?"); |
|
394 for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) { |
|
395 if (IsDOMWordSeparator(textFragment->CharAt(i))) { |
|
396 // Be greedy, find as many separators as we can |
|
397 for (int32_t j = i - 1; j >= 0; --j) { |
|
398 if (IsDOMWordSeparator(textFragment->CharAt(j))) { |
|
399 i = j; |
|
400 } else { |
|
401 break; |
|
402 } |
|
403 } |
|
404 *aSeparatorOffset = i; |
|
405 return true; |
|
406 } |
|
407 } |
|
408 return false; |
|
409 } |
|
410 |
|
411 /** |
|
412 * Check if there's a DOM word separator before aBeforeOffset in this node. |
|
413 * Always returns true if it's a BR element. |
|
414 * aSeparatorOffset is set to the index of the first character in the last |
|
415 * separator if any is found (0 for BR elements). |
|
416 * |
|
417 * This function does not modify aSeparatorOffset when it returns false. |
|
418 */ |
|
419 static bool |
|
420 ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset, |
|
421 int32_t* aSeparatorOffset) |
|
422 { |
|
423 if (IsBRElement(aNode)) { |
|
424 *aSeparatorOffset = 0; |
|
425 return true; |
|
426 } |
|
427 |
|
428 if (!IsTextNode(aNode)) |
|
429 return false; |
|
430 |
|
431 return TextNodeContainsDOMWordSeparator(aNode, aBeforeOffset, |
|
432 aSeparatorOffset); |
|
433 } |
|
434 |
|
435 static bool |
|
436 IsBreakElement(nsINode* aNode) |
|
437 { |
|
438 if (!aNode->IsElement()) { |
|
439 return false; |
|
440 } |
|
441 |
|
442 dom::Element *element = aNode->AsElement(); |
|
443 |
|
444 if (element->IsHTML(nsGkAtoms::br)) |
|
445 return true; |
|
446 |
|
447 // If we don't have a frame, we don't consider ourselves a break |
|
448 // element. In particular, words can span us. |
|
449 if (!element->GetPrimaryFrame()) |
|
450 return false; |
|
451 |
|
452 // Anything that's not an inline element is a break element. |
|
453 // XXXbz should replaced inlines be break elements, though? |
|
454 return element->GetPrimaryFrame()->StyleDisplay()->mDisplay != |
|
455 NS_STYLE_DISPLAY_INLINE; |
|
456 } |
|
457 |
|
458 struct CheckLeavingBreakElementClosure { |
|
459 bool mLeftBreakElement; |
|
460 }; |
|
461 |
|
462 static void |
|
463 CheckLeavingBreakElement(nsINode* aNode, void* aClosure) |
|
464 { |
|
465 CheckLeavingBreakElementClosure* cl = |
|
466 static_cast<CheckLeavingBreakElementClosure*>(aClosure); |
|
467 if (!cl->mLeftBreakElement && IsBreakElement(aNode)) { |
|
468 cl->mLeftBreakElement = true; |
|
469 } |
|
470 } |
|
471 |
|
472 void |
|
473 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord) |
|
474 { |
|
475 nsAutoString result; |
|
476 ::NormalizeWord(aWord, 0, aWord.Length(), result); |
|
477 aWord = result; |
|
478 } |
|
479 |
|
480 void |
|
481 mozInlineSpellWordUtil::BuildSoftText() |
|
482 { |
|
483 // First we have to work backwards from mSoftStart to find a text node |
|
484 // containing a DOM word separator, a non-inline-element |
|
485 // boundary, or the hard start node. That's where we'll start building the |
|
486 // soft string from. |
|
487 nsINode* node = mSoftBegin.mNode; |
|
488 int32_t firstOffsetInNode = 0; |
|
489 int32_t checkBeforeOffset = mSoftBegin.mOffset; |
|
490 while (node) { |
|
491 if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) { |
|
492 if (node == mSoftBegin.mNode) { |
|
493 // If we find a word separator on the first node, look at the preceding |
|
494 // word on the text node as well. |
|
495 int32_t newOffset = 0; |
|
496 if (firstOffsetInNode > 0) { |
|
497 // Try to find the previous word boundary in the current node. If |
|
498 // we can't find one, start checking previous sibling nodes (if any |
|
499 // adjacent ones exist) to see if we can find any text nodes with |
|
500 // DOM word separators. We bail out as soon as we see a node that is |
|
501 // not a text node, or we run out of previous sibling nodes. In the |
|
502 // event that we simply cannot find any preceding word separator, the |
|
503 // offset is set to 0, and the soft text beginning node is set to the |
|
504 // "most previous" text node before the original starting node, or |
|
505 // kept at the original starting node if no previous text nodes exist. |
|
506 if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1, |
|
507 &newOffset)) { |
|
508 nsINode* prevNode = node->GetPreviousSibling(); |
|
509 while (prevNode && IsTextNode(prevNode)) { |
|
510 mSoftBegin.mNode = prevNode; |
|
511 if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX, |
|
512 &newOffset)) { |
|
513 break; |
|
514 } |
|
515 prevNode = prevNode->GetPreviousSibling(); |
|
516 } |
|
517 } |
|
518 } |
|
519 firstOffsetInNode = newOffset; |
|
520 mSoftBegin.mOffset = newOffset; |
|
521 } |
|
522 break; |
|
523 } |
|
524 checkBeforeOffset = INT32_MAX; |
|
525 if (IsBreakElement(node)) { |
|
526 // Since GetPreviousContent follows tree *preorder*, we're about to traverse |
|
527 // up out of 'node'. Since node induces breaks (e.g., it's a block), |
|
528 // don't bother trying to look outside it, just stop now. |
|
529 break; |
|
530 } |
|
531 // GetPreviousContent below expects mRootNode to be an ancestor of node. |
|
532 if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) { |
|
533 break; |
|
534 } |
|
535 node = node->GetPreviousContent(mRootNode); |
|
536 } |
|
537 |
|
538 // Now build up the string moving forward through the DOM until we reach |
|
539 // the soft end and *then* see a DOM word separator, a non-inline-element |
|
540 // boundary, or the hard end node. |
|
541 mSoftText.Truncate(); |
|
542 mSoftTextDOMMapping.Clear(); |
|
543 bool seenSoftEnd = false; |
|
544 // Leave this outside the loop so large heap string allocations can be reused |
|
545 // across iterations |
|
546 while (node) { |
|
547 if (node == mSoftEnd.mNode) { |
|
548 seenSoftEnd = true; |
|
549 } |
|
550 |
|
551 bool exit = false; |
|
552 if (IsTextNode(node)) { |
|
553 nsIContent* content = static_cast<nsIContent*>(node); |
|
554 NS_ASSERTION(content, "Where is our content?"); |
|
555 const nsTextFragment* textFragment = content->GetText(); |
|
556 NS_ASSERTION(textFragment, "Where is our text?"); |
|
557 int32_t lastOffsetInNode = textFragment->GetLength(); |
|
558 |
|
559 if (seenSoftEnd) { |
|
560 // check whether we can stop after this |
|
561 for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0; |
|
562 i < int32_t(textFragment->GetLength()); ++i) { |
|
563 if (IsDOMWordSeparator(textFragment->CharAt(i))) { |
|
564 exit = true; |
|
565 // stop at the first separator after the soft end point |
|
566 lastOffsetInNode = i; |
|
567 break; |
|
568 } |
|
569 } |
|
570 } |
|
571 |
|
572 if (firstOffsetInNode < lastOffsetInNode) { |
|
573 int32_t len = lastOffsetInNode - firstOffsetInNode; |
|
574 mSoftTextDOMMapping.AppendElement( |
|
575 DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len)); |
|
576 |
|
577 bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len, |
|
578 mozilla::fallible_t()); |
|
579 if (!ok) { |
|
580 // probably out of memory, remove from mSoftTextDOMMapping |
|
581 mSoftTextDOMMapping.RemoveElementAt(mSoftTextDOMMapping.Length() - 1); |
|
582 exit = true; |
|
583 } |
|
584 } |
|
585 |
|
586 firstOffsetInNode = 0; |
|
587 } |
|
588 |
|
589 if (exit) |
|
590 break; |
|
591 |
|
592 CheckLeavingBreakElementClosure closure = { false }; |
|
593 node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure); |
|
594 if (closure.mLeftBreakElement || (node && IsBreakElement(node))) { |
|
595 // We left, or are entering, a break element (e.g., block). Maybe we can |
|
596 // stop now. |
|
597 if (seenSoftEnd) |
|
598 break; |
|
599 // Record the break |
|
600 mSoftText.Append(' '); |
|
601 } |
|
602 } |
|
603 |
|
604 #ifdef DEBUG_SPELLCHECK |
|
605 printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get()); |
|
606 #endif |
|
607 } |
|
608 |
|
609 void |
|
610 mozInlineSpellWordUtil::BuildRealWords() |
|
611 { |
|
612 // This is pretty simple. We just have to walk mSoftText, tokenizing it |
|
613 // into "real words". |
|
614 // We do an outer traversal of words delimited by IsDOMWordSeparator, calling |
|
615 // SplitDOMWord on each of those DOM words |
|
616 int32_t wordStart = -1; |
|
617 mRealWords.Clear(); |
|
618 for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) { |
|
619 if (IsDOMWordSeparator(mSoftText.CharAt(i))) { |
|
620 if (wordStart >= 0) { |
|
621 SplitDOMWord(wordStart, i); |
|
622 wordStart = -1; |
|
623 } |
|
624 } else { |
|
625 if (wordStart < 0) { |
|
626 wordStart = i; |
|
627 } |
|
628 } |
|
629 } |
|
630 if (wordStart >= 0) { |
|
631 SplitDOMWord(wordStart, mSoftText.Length()); |
|
632 } |
|
633 } |
|
634 |
|
635 /*********** DOM/realwords<->mSoftText mapping functions ************/ |
|
636 |
|
637 int32_t |
|
638 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset) |
|
639 { |
|
640 if (!mSoftTextValid) { |
|
641 NS_ERROR("Soft text must be valid if we're to map into it"); |
|
642 return -1; |
|
643 } |
|
644 |
|
645 for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) { |
|
646 const DOMTextMapping& map = mSoftTextDOMMapping[i]; |
|
647 if (map.mNodeOffset.mNode == aNodeOffset.mNode) { |
|
648 // Allow offsets at either end of the string, in particular, allow the |
|
649 // offset that's at the end of the contributed string |
|
650 int32_t offsetInContributedString = |
|
651 aNodeOffset.mOffset - map.mNodeOffset.mOffset; |
|
652 if (offsetInContributedString >= 0 && |
|
653 offsetInContributedString <= map.mLength) |
|
654 return map.mSoftTextOffset + offsetInContributedString; |
|
655 return -1; |
|
656 } |
|
657 } |
|
658 return -1; |
|
659 } |
|
660 |
|
661 mozInlineSpellWordUtil::NodeOffset |
|
662 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset, |
|
663 DOMMapHint aHint) |
|
664 { |
|
665 NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it"); |
|
666 if (!mSoftTextValid) |
|
667 return NodeOffset(nullptr, -1); |
|
668 |
|
669 // The invariant is that the range start..end includes the last mapping, |
|
670 // if any, such that mSoftTextOffset <= aSoftTextOffset |
|
671 int32_t start = 0; |
|
672 int32_t end = mSoftTextDOMMapping.Length(); |
|
673 while (end - start >= 2) { |
|
674 int32_t mid = (start + end)/2; |
|
675 const DOMTextMapping& map = mSoftTextDOMMapping[mid]; |
|
676 if (map.mSoftTextOffset > aSoftTextOffset) { |
|
677 end = mid; |
|
678 } else { |
|
679 start = mid; |
|
680 } |
|
681 } |
|
682 |
|
683 if (start >= end) |
|
684 return NodeOffset(nullptr, -1); |
|
685 |
|
686 // 'start' is now the last mapping, if any, such that |
|
687 // mSoftTextOffset <= aSoftTextOffset. |
|
688 // If we're doing HINT_END, then we may want to return the end of the |
|
689 // the previous mapping instead of the start of this mapping |
|
690 if (aHint == HINT_END && start > 0) { |
|
691 const DOMTextMapping& map = mSoftTextDOMMapping[start - 1]; |
|
692 if (map.mSoftTextOffset + map.mLength == aSoftTextOffset) |
|
693 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength); |
|
694 } |
|
695 |
|
696 // We allow ourselves to return the end of this mapping even if we're |
|
697 // doing HINT_START. This will only happen if there is no mapping which this |
|
698 // point is the start of. I'm not 100% sure this is OK... |
|
699 const DOMTextMapping& map = mSoftTextDOMMapping[start]; |
|
700 int32_t offset = aSoftTextOffset - map.mSoftTextOffset; |
|
701 if (offset >= 0 && offset <= map.mLength) |
|
702 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset); |
|
703 |
|
704 return NodeOffset(nullptr, -1); |
|
705 } |
|
706 |
|
707 int32_t |
|
708 mozInlineSpellWordUtil::FindRealWordContaining(int32_t aSoftTextOffset, |
|
709 DOMMapHint aHint, bool aSearchForward) |
|
710 { |
|
711 NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it"); |
|
712 if (!mSoftTextValid) |
|
713 return -1; |
|
714 |
|
715 // The invariant is that the range start..end includes the last word, |
|
716 // if any, such that mSoftTextOffset <= aSoftTextOffset |
|
717 int32_t start = 0; |
|
718 int32_t end = mRealWords.Length(); |
|
719 while (end - start >= 2) { |
|
720 int32_t mid = (start + end)/2; |
|
721 const RealWord& word = mRealWords[mid]; |
|
722 if (word.mSoftTextOffset > aSoftTextOffset) { |
|
723 end = mid; |
|
724 } else { |
|
725 start = mid; |
|
726 } |
|
727 } |
|
728 |
|
729 if (start >= end) |
|
730 return -1; |
|
731 |
|
732 // 'start' is now the last word, if any, such that |
|
733 // mSoftTextOffset <= aSoftTextOffset. |
|
734 // If we're doing HINT_END, then we may want to return the end of the |
|
735 // the previous word instead of the start of this word |
|
736 if (aHint == HINT_END && start > 0) { |
|
737 const RealWord& word = mRealWords[start - 1]; |
|
738 if (word.mSoftTextOffset + word.mLength == aSoftTextOffset) |
|
739 return start - 1; |
|
740 } |
|
741 |
|
742 // We allow ourselves to return the end of this word even if we're |
|
743 // doing HINT_START. This will only happen if there is no word which this |
|
744 // point is the start of. I'm not 100% sure this is OK... |
|
745 const RealWord& word = mRealWords[start]; |
|
746 int32_t offset = aSoftTextOffset - word.mSoftTextOffset; |
|
747 if (offset >= 0 && offset <= word.mLength) |
|
748 return start; |
|
749 |
|
750 if (aSearchForward) { |
|
751 if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) { |
|
752 // All words have mSoftTextOffset > aSoftTextOffset |
|
753 return 0; |
|
754 } |
|
755 // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset. |
|
756 // Word start+1, if it exists, will be the first with |
|
757 // mSoftTextOffset > aSoftTextOffset. |
|
758 if (start + 1 < int32_t(mRealWords.Length())) |
|
759 return start + 1; |
|
760 } |
|
761 |
|
762 return -1; |
|
763 } |
|
764 |
|
765 /*********** Word Splitting ************/ |
|
766 |
|
// Classification of a position in the DOM word while it is being split into
// real words.
enum CharClass {
  CHAR_CLASS_WORD,         // the character belongs to a word
  CHAR_CLASS_SEPARATOR,    // the character separates words
  CHAR_CLASS_END_OF_INPUT  // the position is past the last character
};
|
772 |
|
773 // Encapsulates DOM-word to real-word splitting |
|
774 struct MOZ_STACK_CLASS WordSplitState |
|
775 { |
|
776 mozInlineSpellWordUtil* mWordUtil; |
|
777 const nsDependentSubstring mDOMWordText; |
|
778 int32_t mDOMWordOffset; |
|
779 CharClass mCurCharClass; |
|
780 |
|
781 WordSplitState(mozInlineSpellWordUtil* aWordUtil, |
|
782 const nsString& aString, int32_t aStart, int32_t aLen) |
|
783 : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen), |
|
784 mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {} |
|
785 |
|
786 CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const; |
|
787 void Advance(); |
|
788 void AdvanceThroughSeparators(); |
|
789 void AdvanceThroughWord(); |
|
790 |
|
791 // Finds special words like email addresses and URLs that may start at the |
|
792 // current position, and returns their length, or 0 if not found. This allows |
|
793 // arbitrary word breaking rules to be used for these special entities, as |
|
794 // long as they can not contain whitespace. |
|
795 bool IsSpecialWord(); |
|
796 |
|
797 // Similar to IsSpecialWord except that this takes a split word as |
|
798 // input. This checks for things that do not require special word-breaking |
|
799 // rules. |
|
800 bool ShouldSkipWord(int32_t aStart, int32_t aLength); |
|
801 }; |
|
802 |
|
803 // WordSplitState::ClassifyCharacter |
|
804 |
|
805 CharClass |
|
806 WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const |
|
807 { |
|
808 NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()), |
|
809 "Index out of range"); |
|
810 if (aIndex == int32_t(mDOMWordText.Length())) |
|
811 return CHAR_CLASS_SEPARATOR; |
|
812 |
|
813 // this will classify the character, we want to treat "ignorable" characters |
|
814 // such as soft hyphens, and also ZWJ and ZWNJ as word characters. |
|
815 nsIUGenCategory::nsUGenCategory |
|
816 charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]); |
|
817 if (charCategory == nsIUGenCategory::kLetter || |
|
818 IsIgnorableCharacter(mDOMWordText[aIndex]) || |
|
819 mDOMWordText[aIndex] == 0x200C /* ZWNJ */ || |
|
820 mDOMWordText[aIndex] == 0x200D /* ZWJ */) |
|
821 return CHAR_CLASS_WORD; |
|
822 |
|
823 // If conditional punctuation is surrounded immediately on both sides by word |
|
824 // characters it also counts as a word character. |
|
825 if (IsConditionalPunctuation(mDOMWordText[aIndex])) { |
|
826 if (!aRecurse) { |
|
827 // not allowed to look around, this punctuation counts like a separator |
|
828 return CHAR_CLASS_SEPARATOR; |
|
829 } |
|
830 |
|
831 // check the left-hand character |
|
832 if (aIndex == 0) |
|
833 return CHAR_CLASS_SEPARATOR; |
|
834 if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) |
|
835 return CHAR_CLASS_SEPARATOR; |
|
836 // If the previous charatcer is a word-char, make sure that it's not a |
|
837 // special dot character. |
|
838 if (mDOMWordText[aIndex - 1] == '.') |
|
839 return CHAR_CLASS_SEPARATOR; |
|
840 |
|
841 // now we know left char is a word-char, check the right-hand character |
|
842 if (aIndex == int32_t(mDOMWordText.Length()) - 1) |
|
843 return CHAR_CLASS_SEPARATOR; |
|
844 if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD) |
|
845 return CHAR_CLASS_SEPARATOR; |
|
846 // If the next charatcer is a word-char, make sure that it's not a |
|
847 // special dot character. |
|
848 if (mDOMWordText[aIndex + 1] == '.') |
|
849 return CHAR_CLASS_SEPARATOR; |
|
850 |
|
851 // char on either side is a word, this counts as a word |
|
852 return CHAR_CLASS_WORD; |
|
853 } |
|
854 |
|
855 // The dot character, if appearing at the end of a word, should |
|
856 // be considered part of that word. Example: "etc.", or |
|
857 // abbreviations |
|
858 if (aIndex > 0 && |
|
859 mDOMWordText[aIndex] == '.' && |
|
860 mDOMWordText[aIndex - 1] != '.' && |
|
861 ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) { |
|
862 return CHAR_CLASS_WORD; |
|
863 } |
|
864 |
|
865 // all other punctuation |
|
866 if (charCategory == nsIUGenCategory::kSeparator || |
|
867 charCategory == nsIUGenCategory::kOther || |
|
868 charCategory == nsIUGenCategory::kPunctuation || |
|
869 charCategory == nsIUGenCategory::kSymbol) { |
|
870 // Don't break on hyphens, as hunspell handles them on its own. |
|
871 if (aIndex > 0 && |
|
872 mDOMWordText[aIndex] == '-' && |
|
873 mDOMWordText[aIndex - 1] != '-' && |
|
874 ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) { |
|
875 // A hyphen is only meaningful as a separator inside a word |
|
876 // if the previous and next characters are a word character. |
|
877 if (aIndex == int32_t(mDOMWordText.Length()) - 1) |
|
878 return CHAR_CLASS_SEPARATOR; |
|
879 if (mDOMWordText[aIndex + 1] != '.' && |
|
880 ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD) |
|
881 return CHAR_CLASS_WORD; |
|
882 } |
|
883 return CHAR_CLASS_SEPARATOR; |
|
884 } |
|
885 |
|
886 // any other character counts as a word |
|
887 return CHAR_CLASS_WORD; |
|
888 } |
|
889 |
|
890 |
|
891 // WordSplitState::Advance |
|
892 |
|
893 void |
|
894 WordSplitState::Advance() |
|
895 { |
|
896 NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index"); |
|
897 NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(), |
|
898 "Length beyond end"); |
|
899 |
|
900 mDOMWordOffset ++; |
|
901 if (mDOMWordOffset >= (int32_t)mDOMWordText.Length()) |
|
902 mCurCharClass = CHAR_CLASS_END_OF_INPUT; |
|
903 else |
|
904 mCurCharClass = ClassifyCharacter(mDOMWordOffset, true); |
|
905 } |
|
906 |
|
907 |
|
908 // WordSplitState::AdvanceThroughSeparators |
|
909 |
|
910 void |
|
911 WordSplitState::AdvanceThroughSeparators() |
|
912 { |
|
913 while (mCurCharClass == CHAR_CLASS_SEPARATOR) |
|
914 Advance(); |
|
915 } |
|
916 |
|
917 // WordSplitState::AdvanceThroughWord |
|
918 |
|
919 void |
|
920 WordSplitState::AdvanceThroughWord() |
|
921 { |
|
922 while (mCurCharClass == CHAR_CLASS_WORD) |
|
923 Advance(); |
|
924 } |
|
925 |
|
926 |
|
927 // WordSplitState::IsSpecialWord |
|
928 |
|
929 bool |
|
930 WordSplitState::IsSpecialWord() |
|
931 { |
|
932 // Search for email addresses. We simply define these as any sequence of |
|
933 // characters with an '@' character in the middle. The DOM word is already |
|
934 // split on whitepace, so we know that everything to the end is the address |
|
935 int32_t firstColon = -1; |
|
936 for (int32_t i = mDOMWordOffset; |
|
937 i < int32_t(mDOMWordText.Length()); i ++) { |
|
938 if (mDOMWordText[i] == '@') { |
|
939 // only accept this if there are unambiguous word characters (don't bother |
|
940 // recursing to disambiguate apostrophes) on each side. This prevents |
|
941 // classifying, e.g. "@home" as an email address |
|
942 |
|
943 // Use this condition to only accept words with '@' in the middle of |
|
944 // them. It works, but the inlinespellcker doesn't like this. The problem |
|
945 // is that you type "fhsgfh@" that's a misspelled word followed by a |
|
946 // symbol, but when you type another letter "fhsgfh@g" that first word |
|
947 // need to be unmarked misspelled. It doesn't do this. it only checks the |
|
948 // current position for potentially removing a spelling range. |
|
949 if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD && |
|
950 i < (int32_t)mDOMWordText.Length() - 1 && |
|
951 ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) { |
|
952 return true; |
|
953 } |
|
954 } else if (mDOMWordText[i] == ':' && firstColon < 0) { |
|
955 firstColon = i; |
|
956 |
|
957 // If the first colon is followed by a slash, consider it a URL |
|
958 // This will catch things like asdf://foo.com |
|
959 if (firstColon < (int32_t)mDOMWordText.Length() - 1 && |
|
960 mDOMWordText[firstColon + 1] == '/') { |
|
961 return true; |
|
962 } |
|
963 } |
|
964 } |
|
965 |
|
966 // Check the text before the first colon against some known protocols. It |
|
967 // is impossible to check against all protocols, especially since you can |
|
968 // plug in new protocols. We also don't want to waste time here checking |
|
969 // against a lot of obscure protocols. |
|
970 if (firstColon > mDOMWordOffset) { |
|
971 nsString protocol(Substring(mDOMWordText, mDOMWordOffset, |
|
972 firstColon - mDOMWordOffset)); |
|
973 if (protocol.EqualsIgnoreCase("http") || |
|
974 protocol.EqualsIgnoreCase("https") || |
|
975 protocol.EqualsIgnoreCase("news") || |
|
976 protocol.EqualsIgnoreCase("file") || |
|
977 protocol.EqualsIgnoreCase("javascript") || |
|
978 protocol.EqualsIgnoreCase("data") || |
|
979 protocol.EqualsIgnoreCase("ftp")) { |
|
980 return true; |
|
981 } |
|
982 } |
|
983 |
|
984 // not anything special |
|
985 return false; |
|
986 } |
|
987 |
|
988 // WordSplitState::ShouldSkipWord |
|
989 |
|
990 bool |
|
991 WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength) |
|
992 { |
|
993 int32_t last = aStart + aLength; |
|
994 |
|
995 // check to see if the word contains a digit |
|
996 for (int32_t i = aStart; i < last; i ++) { |
|
997 if (unicode::GetGenCategory(mDOMWordText[i]) == nsIUGenCategory::kNumber) { |
|
998 return true; |
|
999 } |
|
1000 } |
|
1001 |
|
1002 // not special |
|
1003 return false; |
|
1004 } |
|
1005 |
|
1006 // mozInlineSpellWordUtil::SplitDOMWord |
|
1007 |
|
1008 void |
|
1009 mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd) |
|
1010 { |
|
1011 WordSplitState state(this, mSoftText, aStart, aEnd - aStart); |
|
1012 state.mCurCharClass = state.ClassifyCharacter(0, true); |
|
1013 |
|
1014 state.AdvanceThroughSeparators(); |
|
1015 if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && |
|
1016 state.IsSpecialWord()) { |
|
1017 int32_t specialWordLength = state.mDOMWordText.Length() - state.mDOMWordOffset; |
|
1018 mRealWords.AppendElement( |
|
1019 RealWord(aStart + state.mDOMWordOffset, specialWordLength, false)); |
|
1020 |
|
1021 return; |
|
1022 } |
|
1023 |
|
1024 while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) { |
|
1025 state.AdvanceThroughSeparators(); |
|
1026 if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) |
|
1027 break; |
|
1028 |
|
1029 // save the beginning of the word |
|
1030 int32_t wordOffset = state.mDOMWordOffset; |
|
1031 |
|
1032 // find the end of the word |
|
1033 state.AdvanceThroughWord(); |
|
1034 int32_t wordLen = state.mDOMWordOffset - wordOffset; |
|
1035 mRealWords.AppendElement( |
|
1036 RealWord(aStart + wordOffset, wordLen, |
|
1037 !state.ShouldSkipWord(wordOffset, wordLen))); |
|
1038 } |
|
1039 } |