michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: #ifndef mozInlineSpellWordUtil_h
michael@0: #define mozInlineSpellWordUtil_h
michael@0: 
michael@0: #include "nsCOMPtr.h"
michael@0: #include "nsIDOMDocument.h"
michael@0: #include "nsIDocument.h"
michael@0: #include "nsString.h"
michael@0: #include "nsTArray.h"
michael@0: 
michael@0: //#define DEBUG_SPELLCHECK
michael@0: 
michael@0: class nsRange;
michael@0: class nsINode;
michael@0: 
michael@0: /**
michael@0:  *    This class extracts text from the DOM and builds it into a single string.
michael@0:  *    The string includes whitespace breaks wherever non-inline elements begin
michael@0:  *    and end. This string is broken into "real words", following somewhat
michael@0:  *    complex rules; for example substrings that look like URLs or
michael@0:  *    email addresses are treated as single words, but otherwise many kinds of
michael@0:  *    punctuation are treated as word separators. GetNextWord provides a way
michael@0:  *    to iterate over these "real words".
michael@0:  *
michael@0:  *    The basic operation is:
michael@0:  *
michael@0:  *    1. Call Init with the weak pointer to the editor that you're using.
michael@0:  *    2. Call SetEnd to set where you want to stop spellchecking. We'll stop
michael@0:  *       at the word boundary after that. If SetEnd is not called, we'll stop
michael@0:  *       at the end of the document's root element.
michael@0:  *    3. Call SetPosition to initialize the current position inside the
michael@0:  *       previously given range.
michael@0:  *    4. Call GetNextWord over and over until it returns false.
michael@0:  */
michael@0: 
michael@0: class mozInlineSpellWordUtil
michael@0: {
michael@0: public:
michael@0:   // A DOM position: a node plus an offset. mNode is a plain pointer, so this
michael@0:   // struct by itself does not hold a reference to the node.
michael@0:   struct NodeOffset {
michael@0:     nsINode* mNode;
michael@0:     int32_t  mOffset;
michael@0: 
michael@0:     NodeOffset(nsINode* aNode, int32_t aOffset) :
michael@0:       mNode(aNode), mOffset(aOffset) {}
michael@0:   };
michael@0: 
michael@0:   // Constructs an uninitialized utility: no root node, null begin/end
michael@0:   // positions, no next word (index -1) and the soft text marked invalid.
michael@0:   // Init must be called before the object is used.
michael@0:   mozInlineSpellWordUtil()
michael@0:     : mRootNode(nullptr),
michael@0:       mSoftBegin(nullptr, 0), mSoftEnd(nullptr, 0),
michael@0:       mNextWordIndex(-1), mSoftTextValid(false) {}
michael@0: 
michael@0:   // Initializes this object with the weak pointer to the editor in use.
michael@0:   nsresult Init(nsWeakPtr aWeakEditor);
michael@0: 
michael@0:   // Sets where spellchecking should stop. We'll stop at the word boundary
michael@0:   // after that position.
michael@0:   nsresult SetEnd(nsINode* aEndNode, int32_t aEndOffset);
michael@0: 
michael@0:   // Sets the current position; this should be inside the range. If we are in
michael@0:   // the middle of a word, we'll move to its start.
michael@0:   nsresult SetPosition(nsINode* aNode, int32_t aOffset);
michael@0: 
michael@0:   // Given a point inside or immediately following a word, this returns the
michael@0:   // DOM range that exactly encloses that word's characters. The current
michael@0:   // position will be at the end of the word. This will find the previous
michael@0:   // word if the current position is space, so if you care that the point is
michael@0:   // inside the word, you should check the range.
michael@0:   //
michael@0:   // THIS CHANGES THE CURRENT POSITION AND RANGE. It is designed to be called
michael@0:   // before you actually generate the range you are interested in and iterate
michael@0:   // the words in it.
michael@0:   nsresult GetRangeForWord(nsIDOMNode* aWordNode, int32_t aWordOffset,
michael@0:                            nsRange** aRange);
michael@0: 
michael@0:   // Moves to the next word in the range, and retrieves its text and range.
michael@0:   // An empty word and a nullptr range are returned when we are done checking.
michael@0:   // aSkipChecking will be set if the word is "special" and shouldn't be
michael@0:   // checked (e.g., an email address).
michael@0:   nsresult GetNextWord(nsAString& aText, nsRange** aRange,
michael@0:                        bool* aSkipChecking);
michael@0: 
michael@0:   // Call to normalize some punctuation. This function takes a mutable string
michael@0:   // reference so we can access characters directly.
michael@0:   static void NormalizeWord(nsSubstring& aWord);
michael@0: 
michael@0:   // Read-only accessors for the cached documents and the root node. None of
michael@0:   // them modify this object, so all three are const.
michael@0:   nsIDOMDocument* GetDOMDocument() const { return mDOMDocument; }
michael@0:   nsIDocument* GetDocument() const { return mDocument; }
michael@0:   nsINode* GetRootNode() const { return mRootNode; }
michael@0: 
michael@0: private:
michael@0: 
michael@0:   // cached stuff for the editor, set by Init
michael@0:   nsCOMPtr<nsIDOMDocument> mDOMDocument;
michael@0:   nsCOMPtr<nsIDocument>         mDocument;
michael@0: 
michael@0:   // range to check, see SetPosition and SetEnd
michael@0:   nsINode*    mRootNode;
michael@0:   NodeOffset  mSoftBegin;
michael@0:   NodeOffset  mSoftEnd;
michael@0: 
michael@0:   // DOM text covering the soft range, with newlines added at block boundaries
michael@0:   nsString mSoftText;
michael@0:   // A list of where we extracted text from, ordered by mSoftTextOffset. A given
michael@0:   // DOM node appears at most once in this list.
michael@0:   struct DOMTextMapping {
michael@0:     NodeOffset mNodeOffset;      // DOM position of this entry
michael@0:     int32_t    mSoftTextOffset;  // corresponding offset into mSoftText
michael@0:     int32_t    mLength;          // length of this entry
michael@0: 
michael@0:     DOMTextMapping(NodeOffset aNodeOffset, int32_t aSoftTextOffset, int32_t aLength)
michael@0:       : mNodeOffset(aNodeOffset), mSoftTextOffset(aSoftTextOffset),
michael@0:         mLength(aLength) {}
michael@0:   };
michael@0:   nsTArray<DOMTextMapping> mSoftTextDOMMapping;
michael@0: 
michael@0:   // A list of the "real words" in mSoftText, ordered by mSoftTextOffset
michael@0:   struct RealWord {
michael@0:     int32_t      mSoftTextOffset;  // offset of the word in mSoftText
michael@0:     int32_t      mLength;          // length of the word
michael@0:     bool mCheckableWord;
michael@0: 
michael@0:     RealWord(int32_t aOffset, int32_t aLength, bool aCheckable)
michael@0:       : mSoftTextOffset(aOffset), mLength(aLength), mCheckableWord(aCheckable) {}
michael@0:     // Offset just past the last character of the word (start + length).
michael@0:     int32_t EndOffset() const { return mSoftTextOffset + mLength; }
michael@0:   };
michael@0:   nsTArray<RealWord> mRealWords;
michael@0:   // Index into mRealWords; starts out as -1 in the constructor.
michael@0:   int32_t            mNextWordIndex;
michael@0: 
michael@0:   // Cleared by InvalidateWords; see EnsureWords.
michael@0:   bool mSoftTextValid;
michael@0: 
michael@0:   void InvalidateWords() { mSoftTextValid = false; }
michael@0:   void EnsureWords();
michael@0: 
michael@0:   int32_t MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset);
michael@0:   // Map an offset into mSoftText to a DOM position. Note that two DOM positions
michael@0:   // can map to the same mSoftText offset, e.g. given nodes A=aaaa and B=bbbb
michael@0:   // forming aaaabbbb, (A,4) and (B,0) give the same string offset. So,
michael@0:   // aHint controls which position we return ... if aHint is HINT_END
michael@0:   // then the position indicates the END of a range so we return (A,4). Otherwise
michael@0:   // the position indicates the START of a range so we return (B,0).
michael@0:   enum DOMMapHint { HINT_BEGIN, HINT_END };
michael@0:   NodeOffset MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,
michael@0:                                             DOMMapHint aHint);
michael@0:   // Finds the index of the real word containing aSoftTextOffset, or -1 if none.
michael@0:   // If it's exactly between two words, then if aHint is HINT_BEGIN, return the
michael@0:   // later word (favouring the assumption that it's the BEGINning of a word),
michael@0:   // otherwise return the earlier word (assuming it's the END of a word).
michael@0:   // If aSearchForward is true, then if we don't find a word at the given
michael@0:   // position, search forward until we do find a word and return that (if found).
michael@0:   int32_t FindRealWordContaining(int32_t aSoftTextOffset, DOMMapHint aHint,
michael@0:                                  bool aSearchForward);
michael@0: 
michael@0:   // build mSoftText and mSoftTextDOMMapping
michael@0:   void BuildSoftText();
michael@0:   // Build mRealWords array
michael@0:   void BuildRealWords();
michael@0: 
michael@0:   void SplitDOMWord(int32_t aStart, int32_t aEnd);
michael@0: 
michael@0:   // Convenience functions, object must be initialized
michael@0:   nsresult MakeRange(NodeOffset aBegin, NodeOffset aEnd, nsRange** aRange);
michael@0:   nsresult MakeRangeForWord(const RealWord& aWord, nsRange** aRange);
michael@0: };
michael@0: 
michael@0: #endif