1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/brkeng.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,289 @@ 1.4 +/** 1.5 + ************************************************************************************ 1.6 + * Copyright (C) 2006-2012, International Business Machines Corporation and others. * 1.7 + * All Rights Reserved. * 1.8 + ************************************************************************************ 1.9 + */ 1.10 + 1.11 +#ifndef BRKENG_H 1.12 +#define BRKENG_H 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 +#include "unicode/uobject.h" 1.16 +#include "unicode/utext.h" 1.17 +#include "unicode/uscript.h" 1.18 + 1.19 +U_NAMESPACE_BEGIN 1.20 + 1.21 +class UnicodeSet; 1.22 +class UStack; 1.23 +class DictionaryMatcher; 1.24 + 1.25 +/******************************************************************* 1.26 + * LanguageBreakEngine 1.27 + */ 1.28 + 1.29 +/** 1.30 + * <p>LanguageBreakEngines implement language-specific knowledge for 1.31 + * finding text boundaries within a run of characters belonging to a 1.32 + * specific set. The boundaries will be of a specific kind, e.g. word, 1.33 + * line, etc.</p> 1.34 + * 1.35 + * <p>LanguageBreakEngines should normally be implemented so as to 1.36 + * be shared between threads without locking.</p> 1.37 + */ 1.38 +class LanguageBreakEngine : public UMemory { 1.39 + public: 1.40 + 1.41 + /** 1.42 + * <p>Default constructor.</p> 1.43 + * 1.44 + */ 1.45 + LanguageBreakEngine(); 1.46 + 1.47 + /** 1.48 + * <p>Virtual destructor.</p> 1.49 + */ 1.50 + virtual ~LanguageBreakEngine(); 1.51 + 1.52 + /** 1.53 + * <p>Indicate whether this engine handles a particular character for 1.54 + * a particular kind of break.</p> 1.55 + * 1.56 + * @param c A character which begins a run that the engine might handle 1.57 + * @param breakType The type of text break which the caller wants to determine 1.58 + * @return TRUE if this engine handles the particular character and break 1.59 + * type. 1.60 + */ 1.61 + virtual UBool handles(UChar32 c, int32_t breakType) const = 0; 1.62 + 1.63 + /** 1.64 + * <p>Find any breaks within a run in the supplied text.</p> 1.65 + * 1.66 + * @param text A UText representing the text. The 1.67 + * iterator is left at the end of the run of characters which the engine 1.68 + * is capable of handling. 1.69 + * @param startPos The start of the run within the supplied text. 1.70 + * @param endPos The end of the run within the supplied text. 1.71 + * @param reverse Whether the caller is looking for breaks in a reverse 1.72 + * direction. 1.73 + * @param breakType The type of break desired, or -1. 1.74 + * @param foundBreaks An allocated C array of the breaks found, if any 1.75 + * @return The number of breaks found. 1.76 + */ 1.77 + virtual int32_t findBreaks( UText *text, 1.78 + int32_t startPos, 1.79 + int32_t endPos, 1.80 + UBool reverse, 1.81 + int32_t breakType, 1.82 + UStack &foundBreaks ) const = 0; 1.83 + 1.84 +}; 1.85 + 1.86 +/******************************************************************* 1.87 + * LanguageBreakFactory 1.88 + */ 1.89 + 1.90 +/** 1.91 + * <p>LanguageBreakFactorys find and return a LanguageBreakEngine 1.92 + * that can determine breaks for characters in a specific set, if 1.93 + * such an object can be found.</p> 1.94 + * 1.95 + * <p>If a LanguageBreakFactory is to be shared between threads, 1.96 + * appropriate synchronization must be used; there is none internal 1.97 + * to the factory.</p> 1.98 + * 1.99 + * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can 1.100 + * normally be shared between threads without synchronization, unless 1.101 + * the specific subclass of LanguageBreakFactory indicates otherwise.</p> 1.102 + * 1.103 + * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine 1.104 + * it returns when it itself is deleted, unless the specific subclass of 1.105 + * LanguageBreakFactory indicates otherwise. Naturally, the factory should 1.106 + * not be deleted until the LanguageBreakEngines it has returned are no 1.107 + * longer needed.</p> 1.108 + */ 1.109 +class LanguageBreakFactory : public UMemory { 1.110 + public: 1.111 + 1.112 + /** 1.113 + * <p>Default constructor.</p> 1.114 + * 1.115 + */ 1.116 + LanguageBreakFactory(); 1.117 + 1.118 + /** 1.119 + * <p>Virtual destructor.</p> 1.120 + */ 1.121 + virtual ~LanguageBreakFactory(); 1.122 + 1.123 + /** 1.124 + * <p>Find and return a LanguageBreakEngine that can find the desired 1.125 + * kind of break for the set of characters to which the supplied 1.126 + * character belongs. It is up to the set of available engines to 1.127 + * determine what the sets of characters are.</p> 1.128 + * 1.129 + * @param c A character that begins a run for which a LanguageBreakEngine is 1.130 + * sought. 1.131 + * @param breakType The kind of text break for which a LanguageBreakEngine is 1.132 + * sought. 1.133 + * @return A LanguageBreakEngine with the desired characteristics, or 0. 1.134 + */ 1.135 + virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; 1.136 + 1.137 +}; 1.138 + 1.139 +/******************************************************************* 1.140 + * UnhandledEngine 1.141 + */ 1.142 + 1.143 +/** 1.144 + * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that 1.145 + * handles characters that no other LanguageBreakEngine is available to 1.146 + * handle. It is told the character and the type of break; at its 1.147 + * discretion it may handle more than the specified character (e.g., 1.148 + * the entire script to which that character belongs.</p> 1.149 + * 1.150 + * <p>UnhandledEngines may not be shared between threads without 1.151 + * external synchronization.</p> 1.152 + */ 1.153 + 1.154 +class UnhandledEngine : public LanguageBreakEngine { 1.155 + private: 1.156 + 1.157 + /** 1.158 + * The sets of characters handled, for each break type 1.159 + * @internal 1.160 + */ 1.161 + 1.162 + UnicodeSet *fHandled[4]; 1.163 + 1.164 + public: 1.165 + 1.166 + /** 1.167 + * <p>Default constructor.</p> 1.168 + * 1.169 + */ 1.170 + UnhandledEngine(UErrorCode &status); 1.171 + 1.172 + /** 1.173 + * <p>Virtual destructor.</p> 1.174 + */ 1.175 + virtual ~UnhandledEngine(); 1.176 + 1.177 + /** 1.178 + * <p>Indicate whether this engine handles a particular character for 1.179 + * a particular kind of break.</p> 1.180 + * 1.181 + * @param c A character which begins a run that the engine might handle 1.182 + * @param breakType The type of text break which the caller wants to determine 1.183 + * @return TRUE if this engine handles the particular character and break 1.184 + * type. 1.185 + */ 1.186 + virtual UBool handles(UChar32 c, int32_t breakType) const; 1.187 + 1.188 + /** 1.189 + * <p>Find any breaks within a run in the supplied text.</p> 1.190 + * 1.191 + * @param text A UText representing the text (TODO: UText). The 1.192 + * iterator is left at the end of the run of characters which the engine 1.193 + * is capable of handling. 1.194 + * @param startPos The start of the run within the supplied text. 1.195 + * @param endPos The end of the run within the supplied text. 1.196 + * @param reverse Whether the caller is looking for breaks in a reverse 1.197 + * direction. 1.198 + * @param breakType The type of break desired, or -1. 1.199 + * @param foundBreaks An allocated C array of the breaks found, if any 1.200 + * @return The number of breaks found. 1.201 + */ 1.202 + virtual int32_t findBreaks( UText *text, 1.203 + int32_t startPos, 1.204 + int32_t endPos, 1.205 + UBool reverse, 1.206 + int32_t breakType, 1.207 + UStack &foundBreaks ) const; 1.208 + 1.209 + /** 1.210 + * <p>Tell the engine to handle a particular character and break type.</p> 1.211 + * 1.212 + * @param c A character which the engine should handle 1.213 + * @param breakType The type of text break for which the engine should handle c 1.214 + */ 1.215 + virtual void handleCharacter(UChar32 c, int32_t breakType); 1.216 + 1.217 +}; 1.218 + 1.219 +/******************************************************************* 1.220 + * ICULanguageBreakFactory 1.221 + */ 1.222 + 1.223 +/** 1.224 + * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for 1.225 + * ICU. It creates dictionary-based LanguageBreakEngines from dictionary 1.226 + * data in the ICU data file.</p> 1.227 + */ 1.228 +class ICULanguageBreakFactory : public LanguageBreakFactory { 1.229 + private: 1.230 + 1.231 + /** 1.232 + * The stack of break engines created by this factory 1.233 + * @internal 1.234 + */ 1.235 + 1.236 + UStack *fEngines; 1.237 + 1.238 + public: 1.239 + 1.240 + /** 1.241 + * <p>Standard constructor.</p> 1.242 + * 1.243 + */ 1.244 + ICULanguageBreakFactory(UErrorCode &status); 1.245 + 1.246 + /** 1.247 + * <p>Virtual destructor.</p> 1.248 + */ 1.249 + virtual ~ICULanguageBreakFactory(); 1.250 + 1.251 + /** 1.252 + * <p>Find and return a LanguageBreakEngine that can find the desired 1.253 + * kind of break for the set of characters to which the supplied 1.254 + * character belongs. It is up to the set of available engines to 1.255 + * determine what the sets of characters are.</p> 1.256 + * 1.257 + * @param c A character that begins a run for which a LanguageBreakEngine is 1.258 + * sought. 1.259 + * @param breakType The kind of text break for which a LanguageBreakEngine is 1.260 + * sought. 1.261 + * @return A LanguageBreakEngine with the desired characteristics, or 0. 1.262 + */ 1.263 + virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); 1.264 + 1.265 +protected: 1.266 + /** 1.267 + * <p>Create a LanguageBreakEngine for the set of characters to which 1.268 + * the supplied character belongs, for the specified break type.</p> 1.269 + * 1.270 + * @param c A character that begins a run for which a LanguageBreakEngine is 1.271 + * sought. 1.272 + * @param breakType The kind of text break for which a LanguageBreakEngine is 1.273 + * sought. 1.274 + * @return A LanguageBreakEngine with the desired characteristics, or 0. 1.275 + */ 1.276 + virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); 1.277 + 1.278 + /** 1.279 + * <p>Create a DictionaryMatcher for the specified script and break type.</p> 1.280 + * @param script An ISO 15924 script code that identifies the dictionary to be 1.281 + * created. 1.282 + * @param breakType The kind of text break for which a dictionary is 1.283 + * sought. 1.284 + * @return A DictionaryMatcher with the desired characteristics, or NULL. 1.285 + */ 1.286 + virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); 1.287 +}; 1.288 + 1.289 +U_NAMESPACE_END 1.290 + 1.291 + /* BRKENG_H */ 1.292 +#endif