michael@0: /** michael@0: ************************************************************************************ michael@0: * Copyright (C) 2006-2012, International Business Machines Corporation and others. * michael@0: * All Rights Reserved. * michael@0: ************************************************************************************ michael@0: */ michael@0: michael@0: #ifndef BRKENG_H michael@0: #define BRKENG_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/utext.h" michael@0: #include "unicode/uscript.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: class UnicodeSet; michael@0: class UStack; michael@0: class DictionaryMatcher; michael@0: michael@0: /******************************************************************* michael@0: * LanguageBreakEngine michael@0: */ michael@0: michael@0: /** michael@0: *
LanguageBreakEngines implement language-specific knowledge for michael@0: * finding text boundaries within a run of characters belonging to a michael@0: * specific set. The boundaries will be of a specific kind, e.g. word, michael@0: * line, etc.
michael@0: * michael@0: *LanguageBreakEngines should normally be implemented so as to michael@0: * be shared between threads without locking.
michael@0: */ michael@0: class LanguageBreakEngine : public UMemory { michael@0: public: michael@0: michael@0: /** michael@0: *Default constructor.
michael@0: * michael@0: */ michael@0: LanguageBreakEngine(); michael@0: michael@0: /** michael@0: *Virtual destructor.
michael@0: */ michael@0: virtual ~LanguageBreakEngine(); michael@0: michael@0: /** michael@0: *Indicate whether this engine handles a particular character for michael@0: * a particular kind of break.
michael@0: * michael@0: * @param c A character which begins a run that the engine might handle michael@0: * @param breakType The type of text break which the caller wants to determine michael@0: * @return TRUE if this engine handles the particular character and break michael@0: * type. michael@0: */ michael@0: virtual UBool handles(UChar32 c, int32_t breakType) const = 0; michael@0: michael@0: /** michael@0: *Find any breaks within a run in the supplied text.
michael@0: * michael@0: * @param text A UText representing the text. The michael@0: * iterator is left at the end of the run of characters which the engine michael@0: * is capable of handling. michael@0: * @param startPos The start of the run within the supplied text. michael@0: * @param endPos The end of the run within the supplied text. michael@0: * @param reverse Whether the caller is looking for breaks in a reverse michael@0: * direction. michael@0: * @param breakType The type of break desired, or -1. michael@0: * @param foundBreaks An allocated C array of the breaks found, if any michael@0: * @return The number of breaks found. michael@0: */ michael@0: virtual int32_t findBreaks( UText *text, michael@0: int32_t startPos, michael@0: int32_t endPos, michael@0: UBool reverse, michael@0: int32_t breakType, michael@0: UStack &foundBreaks ) const = 0; michael@0: michael@0: }; michael@0: michael@0: /******************************************************************* michael@0: * LanguageBreakFactory michael@0: */ michael@0: michael@0: /** michael@0: *LanguageBreakFactorys find and return a LanguageBreakEngine michael@0: * that can determine breaks for characters in a specific set, if michael@0: * such an object can be found.
michael@0: * michael@0: *If a LanguageBreakFactory is to be shared between threads, michael@0: * appropriate synchronization must be used; there is none internal michael@0: * to the factory.
michael@0: * michael@0: *A LanguageBreakEngine returned by a LanguageBreakFactory can michael@0: * normally be shared between threads without synchronization, unless michael@0: * the specific subclass of LanguageBreakFactory indicates otherwise.
michael@0: * michael@0: *A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine michael@0: * it returns when it itself is deleted, unless the specific subclass of michael@0: * LanguageBreakFactory indicates otherwise. Naturally, the factory should michael@0: * not be deleted until the LanguageBreakEngines it has returned are no michael@0: * longer needed.
michael@0: */ michael@0: class LanguageBreakFactory : public UMemory { michael@0: public: michael@0: michael@0: /** michael@0: *Default constructor.
michael@0: * michael@0: */ michael@0: LanguageBreakFactory(); michael@0: michael@0: /** michael@0: *Virtual destructor.
michael@0: */ michael@0: virtual ~LanguageBreakFactory(); michael@0: michael@0: /** michael@0: *Find and return a LanguageBreakEngine that can find the desired michael@0: * kind of break for the set of characters to which the supplied michael@0: * character belongs. It is up to the set of available engines to michael@0: * determine what the sets of characters are.
michael@0: * michael@0: * @param c A character that begins a run for which a LanguageBreakEngine is michael@0: * sought. michael@0: * @param breakType The kind of text break for which a LanguageBreakEngine is michael@0: * sought. michael@0: * @return A LanguageBreakEngine with the desired characteristics, or 0. michael@0: */ michael@0: virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; michael@0: michael@0: }; michael@0: michael@0: /******************************************************************* michael@0: * UnhandledEngine michael@0: */ michael@0: michael@0: /** michael@0: *UnhandledEngine is a special subclass of LanguageBreakEngine that michael@0: * handles characters that no other LanguageBreakEngine is available to michael@0: * handle. It is told the character and the type of break; at its michael@0: * discretion it may handle more than the specified character (e.g., michael@0: * the entire script to which that character belongs.
michael@0: * michael@0: *UnhandledEngines may not be shared between threads without michael@0: * external synchronization.
michael@0: */ michael@0: michael@0: class UnhandledEngine : public LanguageBreakEngine { michael@0: private: michael@0: michael@0: /** michael@0: * The sets of characters handled, for each break type michael@0: * @internal michael@0: */ michael@0: michael@0: UnicodeSet *fHandled[4]; michael@0: michael@0: public: michael@0: michael@0: /** michael@0: *Default constructor.
michael@0: * michael@0: */ michael@0: UnhandledEngine(UErrorCode &status); michael@0: michael@0: /** michael@0: *Virtual destructor.
michael@0: */ michael@0: virtual ~UnhandledEngine(); michael@0: michael@0: /** michael@0: *Indicate whether this engine handles a particular character for michael@0: * a particular kind of break.
michael@0: * michael@0: * @param c A character which begins a run that the engine might handle michael@0: * @param breakType The type of text break which the caller wants to determine michael@0: * @return TRUE if this engine handles the particular character and break michael@0: * type. michael@0: */ michael@0: virtual UBool handles(UChar32 c, int32_t breakType) const; michael@0: michael@0: /** michael@0: *Find any breaks within a run in the supplied text.
michael@0: * michael@0: * @param text A UText representing the text (TODO: UText). The michael@0: * iterator is left at the end of the run of characters which the engine michael@0: * is capable of handling. michael@0: * @param startPos The start of the run within the supplied text. michael@0: * @param endPos The end of the run within the supplied text. michael@0: * @param reverse Whether the caller is looking for breaks in a reverse michael@0: * direction. michael@0: * @param breakType The type of break desired, or -1. michael@0: * @param foundBreaks An allocated C array of the breaks found, if any michael@0: * @return The number of breaks found. michael@0: */ michael@0: virtual int32_t findBreaks( UText *text, michael@0: int32_t startPos, michael@0: int32_t endPos, michael@0: UBool reverse, michael@0: int32_t breakType, michael@0: UStack &foundBreaks ) const; michael@0: michael@0: /** michael@0: *Tell the engine to handle a particular character and break type.
michael@0: * michael@0: * @param c A character which the engine should handle michael@0: * @param breakType The type of text break for which the engine should handle c michael@0: */ michael@0: virtual void handleCharacter(UChar32 c, int32_t breakType); michael@0: michael@0: }; michael@0: michael@0: /******************************************************************* michael@0: * ICULanguageBreakFactory michael@0: */ michael@0: michael@0: /** michael@0: *ICULanguageBreakFactory is the default LanguageBreakFactory for michael@0: * ICU. It creates dictionary-based LanguageBreakEngines from dictionary michael@0: * data in the ICU data file.
michael@0: */ michael@0: class ICULanguageBreakFactory : public LanguageBreakFactory { michael@0: private: michael@0: michael@0: /** michael@0: * The stack of break engines created by this factory michael@0: * @internal michael@0: */ michael@0: michael@0: UStack *fEngines; michael@0: michael@0: public: michael@0: michael@0: /** michael@0: *Standard constructor.
michael@0: * michael@0: */ michael@0: ICULanguageBreakFactory(UErrorCode &status); michael@0: michael@0: /** michael@0: *Virtual destructor.
michael@0: */ michael@0: virtual ~ICULanguageBreakFactory(); michael@0: michael@0: /** michael@0: *Find and return a LanguageBreakEngine that can find the desired michael@0: * kind of break for the set of characters to which the supplied michael@0: * character belongs. It is up to the set of available engines to michael@0: * determine what the sets of characters are.
michael@0: * michael@0: * @param c A character that begins a run for which a LanguageBreakEngine is michael@0: * sought. michael@0: * @param breakType The kind of text break for which a LanguageBreakEngine is michael@0: * sought. michael@0: * @return A LanguageBreakEngine with the desired characteristics, or 0. michael@0: */ michael@0: virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); michael@0: michael@0: protected: michael@0: /** michael@0: *Create a LanguageBreakEngine for the set of characters to which michael@0: * the supplied character belongs, for the specified break type.
michael@0: * michael@0: * @param c A character that begins a run for which a LanguageBreakEngine is michael@0: * sought. michael@0: * @param breakType The kind of text break for which a LanguageBreakEngine is michael@0: * sought. michael@0: * @return A LanguageBreakEngine with the desired characteristics, or 0. michael@0: */ michael@0: virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); michael@0: michael@0: /** michael@0: *Create a DictionaryMatcher for the specified script and break type.
michael@0: * @param script An ISO 15924 script code that identifies the dictionary to be michael@0: * created. michael@0: * @param breakType The kind of text break for which a dictionary is michael@0: * sought. michael@0: * @return A DictionaryMatcher with the desired characteristics, or NULL. michael@0: */ michael@0: virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: /* BRKENG_H */ michael@0: #endif