michael@0: /** michael@0: ******************************************************************************* michael@0: * Copyright (C) 2006,2012-2013, International Business Machines Corporation * michael@0: * and others. All Rights Reserved. * michael@0: ******************************************************************************* michael@0: */ michael@0: michael@0: #ifndef DICTBE_H michael@0: #define DICTBE_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/utext.h" michael@0: michael@0: #include "brkeng.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: class DictionaryMatcher; michael@0: michael@0: /******************************************************************* michael@0: * DictionaryBreakEngine michael@0: */ michael@0: michael@0: /** michael@0: *

DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a michael@0: * dictionary to determine language-specific breaks.

michael@0: * michael@0: *

After it is constructed a DictionaryBreakEngine may be shared between michael@0: * threads without synchronization.

michael@0: */ michael@0: class DictionaryBreakEngine : public LanguageBreakEngine { michael@0: private: michael@0: /** michael@0: * The set of characters handled by this engine michael@0: * @internal michael@0: */ michael@0: michael@0: UnicodeSet fSet; michael@0: michael@0: /** michael@0: * The set of break types handled by this engine michael@0: * @internal michael@0: */ michael@0: michael@0: uint32_t fTypes; michael@0: michael@0: /** michael@0: *

Default constructor.

michael@0: * michael@0: */ michael@0: DictionaryBreakEngine(); michael@0: michael@0: public: michael@0: michael@0: /** michael@0: *

Constructor setting the break types handled.

michael@0: * michael@0: * @param breakTypes A bitmap of types handled by the engine. michael@0: */ michael@0: DictionaryBreakEngine( uint32_t breakTypes ); michael@0: michael@0: /** michael@0: *

Virtual destructor.

michael@0: */ michael@0: virtual ~DictionaryBreakEngine(); michael@0: michael@0: /** michael@0: *

Indicate whether this engine handles a particular character for michael@0: * a particular kind of break.

michael@0: * michael@0: * @param c A character which begins a run that the engine might handle michael@0: * @param breakType The type of text break which the caller wants to determine michael@0: * @return TRUE if this engine handles the particular character and break michael@0: * type. michael@0: */ michael@0: virtual UBool handles( UChar32 c, int32_t breakType ) const; michael@0: michael@0: /** michael@0: *

Find any breaks within a run in the supplied text.

michael@0: * michael@0: * @param text A UText representing the text. The iterator is left at michael@0: * the end of the run of characters which the engine is capable of handling michael@0: * that starts from the first (or last) character in the range. michael@0: * @param startPos The start of the run within the supplied text. michael@0: * @param endPos The end of the run within the supplied text. michael@0: * @param reverse Whether the caller is looking for breaks in a reverse michael@0: * direction. michael@0: * @param breakType The type of break desired, or -1. michael@0: * @param foundBreaks An allocated C array of the breaks found, if any michael@0: * @return The number of breaks found. michael@0: */ michael@0: virtual int32_t findBreaks( UText *text, michael@0: int32_t startPos, michael@0: int32_t endPos, michael@0: UBool reverse, michael@0: int32_t breakType, michael@0: UStack &foundBreaks ) const; michael@0: michael@0: protected: michael@0: michael@0: /** michael@0: *

Set the character set handled by this engine.

michael@0: * michael@0: * @param set A UnicodeSet of the set of characters handled by the engine michael@0: */ michael@0: virtual void setCharacters( const UnicodeSet &set ); michael@0: michael@0: /** michael@0: *

Set the break types handled by this engine.

michael@0: * michael@0: * @param breakTypes A bitmap of types handled by the engine. michael@0: */ michael@0: // virtual void setBreakTypes( uint32_t breakTypes ); michael@0: michael@0: /** michael@0: *

Divide up a range of known dictionary characters handled by this break engine.

michael@0: * michael@0: * @param text A UText representing the text michael@0: * @param rangeStart The start of the range of dictionary characters michael@0: * @param rangeEnd The end of the range of dictionary characters michael@0: * @param foundBreaks Output of C array of int32_t break positions, or 0 michael@0: * @return The number of breaks found michael@0: */ michael@0: virtual int32_t divideUpDictionaryRange( UText *text, michael@0: int32_t rangeStart, michael@0: int32_t rangeEnd, michael@0: UStack &foundBreaks ) const = 0; michael@0: michael@0: }; michael@0: michael@0: /******************************************************************* michael@0: * ThaiBreakEngine michael@0: */ michael@0: michael@0: /** michael@0: *

ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a michael@0: * dictionary and heuristics to determine Thai-specific breaks.

michael@0: * michael@0: *

After it is constructed a ThaiBreakEngine may be shared between michael@0: * threads without synchronization.

michael@0: */ michael@0: class ThaiBreakEngine : public DictionaryBreakEngine { michael@0: private: michael@0: /** michael@0: * The set of characters handled by this engine michael@0: * @internal michael@0: */ michael@0: michael@0: UnicodeSet fThaiWordSet; michael@0: UnicodeSet fEndWordSet; michael@0: UnicodeSet fBeginWordSet; michael@0: UnicodeSet fSuffixSet; michael@0: UnicodeSet fMarkSet; michael@0: DictionaryMatcher *fDictionary; michael@0: michael@0: public: michael@0: michael@0: /** michael@0: *

Default constructor.

michael@0: * michael@0: * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the michael@0: * engine is deleted. michael@0: */ michael@0: ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); michael@0: michael@0: /** michael@0: *

Virtual destructor.

michael@0: */ michael@0: virtual ~ThaiBreakEngine(); michael@0: michael@0: protected: michael@0: /** michael@0: *

Divide up a range of known dictionary characters handled by this break engine.

michael@0: * michael@0: * @param text A UText representing the text michael@0: * @param rangeStart The start of the range of dictionary characters michael@0: * @param rangeEnd The end of the range of dictionary characters michael@0: * @param foundBreaks Output of C array of int32_t break positions, or 0 michael@0: * @return The number of breaks found michael@0: */ michael@0: virtual int32_t divideUpDictionaryRange( UText *text, michael@0: int32_t rangeStart, michael@0: int32_t rangeEnd, michael@0: UStack &foundBreaks ) const; michael@0: michael@0: }; michael@0: michael@0: /******************************************************************* michael@0: * LaoBreakEngine michael@0: */ michael@0: michael@0: /** michael@0: *

LaoBreakEngine is a kind of DictionaryBreakEngine that uses a michael@0: * dictionary and heuristics to determine Lao-specific breaks.

michael@0: * michael@0: *

After it is constructed a LaoBreakEngine may be shared between michael@0: * threads without synchronization.

michael@0: */ michael@0: class LaoBreakEngine : public DictionaryBreakEngine { michael@0: private: michael@0: /** michael@0: * The set of characters handled by this engine michael@0: * @internal michael@0: */ michael@0: michael@0: UnicodeSet fLaoWordSet; michael@0: UnicodeSet fEndWordSet; michael@0: UnicodeSet fBeginWordSet; michael@0: UnicodeSet fMarkSet; michael@0: DictionaryMatcher *fDictionary; michael@0: michael@0: public: michael@0: michael@0: /** michael@0: *

Default constructor.

michael@0: * michael@0: * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the michael@0: * engine is deleted. michael@0: */ michael@0: LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); michael@0: michael@0: /** michael@0: *

Virtual destructor.

michael@0: */ michael@0: virtual ~LaoBreakEngine(); michael@0: michael@0: protected: michael@0: /** michael@0: *

Divide up a range of known dictionary characters handled by this break engine.

michael@0: * michael@0: * @param text A UText representing the text michael@0: * @param rangeStart The start of the range of dictionary characters michael@0: * @param rangeEnd The end of the range of dictionary characters michael@0: * @param foundBreaks Output of C array of int32_t break positions, or 0 michael@0: * @return The number of breaks found michael@0: */ michael@0: virtual int32_t divideUpDictionaryRange( UText *text, michael@0: int32_t rangeStart, michael@0: int32_t rangeEnd, michael@0: UStack &foundBreaks ) const; michael@0: michael@0: }; michael@0: michael@0: /******************************************************************* michael@0: * KhmerBreakEngine michael@0: */ michael@0: michael@0: /** michael@0: *

KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a michael@0: * DictionaryMatcher and heuristics to determine Khmer-specific breaks.

michael@0: * michael@0: *

After it is constructed a KhmerBreakEngine may be shared between michael@0: * threads without synchronization.

michael@0: */ michael@0: class KhmerBreakEngine : public DictionaryBreakEngine { michael@0: private: michael@0: /** michael@0: * The set of characters handled by this engine michael@0: * @internal michael@0: */ michael@0: michael@0: UnicodeSet fKhmerWordSet; michael@0: UnicodeSet fEndWordSet; michael@0: UnicodeSet fBeginWordSet; michael@0: UnicodeSet fMarkSet; michael@0: DictionaryMatcher *fDictionary; michael@0: michael@0: public: michael@0: michael@0: /** michael@0: *

Default constructor.

michael@0: * michael@0: * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the michael@0: * engine is deleted. michael@0: */ michael@0: KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); michael@0: michael@0: /** michael@0: *

Virtual destructor.

michael@0: */ michael@0: virtual ~KhmerBreakEngine(); michael@0: michael@0: protected: michael@0: /** michael@0: *

Divide up a range of known dictionary characters.

michael@0: * michael@0: * @param text A UText representing the text michael@0: * @param rangeStart The start of the range of dictionary characters michael@0: * @param rangeEnd The end of the range of dictionary characters michael@0: * @param foundBreaks Output of C array of int32_t break positions, or 0 michael@0: * @return The number of breaks found michael@0: */ michael@0: virtual int32_t divideUpDictionaryRange( UText *text, michael@0: int32_t rangeStart, michael@0: int32_t rangeEnd, michael@0: UStack &foundBreaks ) const; michael@0: michael@0: }; michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: /******************************************************************* michael@0: * CjkBreakEngine michael@0: */ michael@0: michael@0: //indicates language/script that the CjkBreakEngine will handle michael@0: enum LanguageType { michael@0: kKorean, michael@0: kChineseJapanese michael@0: }; michael@0: michael@0: /** michael@0: *

CjkBreakEngine is a kind of DictionaryBreakEngine that uses a michael@0: * dictionary with costs associated with each word and michael@0: * Viterbi decoding to determine CJK-specific breaks.

michael@0: */ michael@0: class CjkBreakEngine : public DictionaryBreakEngine { michael@0: protected: michael@0: /** michael@0: * The set of characters handled by this engine michael@0: * @internal michael@0: */ michael@0: UnicodeSet fHangulWordSet; michael@0: UnicodeSet fHanWordSet; michael@0: UnicodeSet fKatakanaWordSet; michael@0: UnicodeSet fHiraganaWordSet; michael@0: michael@0: DictionaryMatcher *fDictionary; michael@0: michael@0: public: michael@0: michael@0: /** michael@0: *

Default constructor.

michael@0: * michael@0: * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the michael@0: * engine is deleted. The DictionaryMatcher must contain costs for each word michael@0: * in order for the dictionary to work properly. michael@0: */ michael@0: CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); michael@0: michael@0: /** michael@0: *

Virtual destructor.

michael@0: */ michael@0: virtual ~CjkBreakEngine(); michael@0: michael@0: protected: michael@0: /** michael@0: *

Divide up a range of known dictionary characters handled by this break engine.

michael@0: * michael@0: * @param text A UText representing the text michael@0: * @param rangeStart The start of the range of dictionary characters michael@0: * @param rangeEnd The end of the range of dictionary characters michael@0: * @param foundBreaks Output of C array of int32_t break positions, or 0 michael@0: * @return The number of breaks found michael@0: */ michael@0: virtual int32_t divideUpDictionaryRange( UText *text, michael@0: int32_t rangeStart, michael@0: int32_t rangeEnd, michael@0: UStack &foundBreaks ) const; michael@0: michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: /* DICTBE_H */ michael@0: #endif