intl/icu/source/i18n/rbt_pars.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 1999-2011, International Business Machines Corporation
michael@0 4 * and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 11/17/99 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10 #ifndef RBT_PARS_H
michael@0 11 #define RBT_PARS_H
michael@0 12
michael@0 13 #include "unicode/utypes.h"
michael@0 14
michael@0 15 #if !UCONFIG_NO_TRANSLITERATION
michael@0 16 #ifdef __cplusplus
michael@0 17
michael@0 18 #include "unicode/uobject.h"
michael@0 19 #include "unicode/parseerr.h"
michael@0 20 #include "unicode/unorm.h"
michael@0 21 #include "rbt.h"
michael@0 22 #include "hash.h"
michael@0 23 #include "uvector.h"
michael@0 24
michael@0 25 U_NAMESPACE_BEGIN
michael@0 26
michael@0 27 class TransliterationRuleData;
michael@0 28 class UnicodeFunctor;
michael@0 29 class ParseData;
michael@0 30 class RuleHalf;
michael@0 31 class ParsePosition;
michael@0 32 class StringMatcher;
michael@0 33
michael@0 34 class TransliteratorParser : public UMemory {
michael@0 35
michael@0 36 public:
michael@0 37
michael@0 38 /**
michael@0 39 * PUBLIC data member. A Vector of TransliterationRuleData objects, one
michael@0 40 * for each discrete group of rules in the rule set.
michael@0 41 */
michael@0 42 UVector dataVector;
michael@0 43
michael@0 44 /**
michael@0 45 * PUBLIC data member.
michael@0 46 * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
michael@0 47 */
michael@0 48 UVector idBlockVector;
michael@0 49
michael@0 50 /**
michael@0 51 * PUBLIC data member containing the parsed compound filter, if any.
michael@0 52 */
michael@0 53 UnicodeSet* compoundFilter;
michael@0 54
michael@0 55 private:
michael@0 56
michael@0 57 /**
michael@0 58 * The current data object for which we are parsing rules.
michael@0 59 */
michael@0 60 TransliterationRuleData* curData;
michael@0 61
michael@0 62 UTransDirection direction; // NOTE(review): presumably the direction passed to parse() -- confirm.
michael@0 63
michael@0 64 /**
michael@0 65 * Parse error information.
michael@0 66 */
michael@0 67 UParseError parseError;
michael@0 68
michael@0 69 /**
michael@0 70 * Temporary symbol table used during parsing.
michael@0 71 */
michael@0 72 ParseData* parseData;
michael@0 73
michael@0 74 /**
michael@0 75 * Temporary vector of matcher variables. When parsing is complete, this
michael@0 76 * is copied into the array data.variables. As with data.variables,
michael@0 77 * element 0 corresponds to character data.variablesBase.
michael@0 78 */
michael@0 79 UVector variablesVector;
michael@0 80
michael@0 81 /**
michael@0 82 * Temporary table of variable names. When parsing is complete, this is
michael@0 83 * copied into data.variableNames.
michael@0 84 */
michael@0 85 Hashtable variableNames;
michael@0 86
michael@0 87 /**
michael@0 88 * String of standins for segments. Used during the parsing of a single
michael@0 89 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
michael@0 90 * to StringMatcher object segmentObjects.elementAt(0), etc.
michael@0 91 */
michael@0 92 UnicodeString segmentStandins;
michael@0 93
michael@0 94 /**
michael@0 95 * Vector of StringMatcher objects for segments. Used during the
michael@0 96 * parsing of a single rule.
michael@0 97 * segmentStandins.charAt(0) is the standin for "$1" and corresponds
michael@0 98 * to StringMatcher object segmentObjects.elementAt(0), etc.
michael@0 99 */
michael@0 100 UVector segmentObjects;
michael@0 101
michael@0 102 /**
michael@0 103 * The next available stand-in for variables. This starts at some point in
michael@0 104 * the private use area (discovered dynamically) and increments up toward
michael@0 105 * <code>variableLimit</code>. At any point during parsing, available
michael@0 106 * variables are <code>variableNext..variableLimit-1</code>.
michael@0 107 */
michael@0 108 UChar variableNext;
michael@0 109
michael@0 110 /**
michael@0 111 * The exclusive upper limit of available stand-ins for variables. This is
michael@0 112 * discovered dynamically. At any point during parsing, available variables
michael@0 113 * are <code>variableNext..variableLimit-1</code>.
michael@0 114 */
michael@0 115 UChar variableLimit;
michael@0 116
michael@0 117 /**
michael@0 118 * When we encounter an undefined variable, we do not immediately signal
michael@0 119 * an error, in case we are defining this variable, e.g., "$a = [a-z];".
michael@0 120 * Instead, we save the name of the undefined variable, and substitute
michael@0 121 * in the placeholder char variableLimit - 1, and decrement
michael@0 122 * variableLimit.
michael@0 123 */
michael@0 124 UnicodeString undefinedVariableName;
michael@0 125
michael@0 126 /**
michael@0 127 * The stand-in character for the 'dot' set, represented by '.' in
michael@0 128 * patterns. This is allocated the first time it is needed, and
michael@0 129 * reused thereafter.
michael@0 130 */
michael@0 131 UChar dotStandIn;
michael@0 132
michael@0 133 public:
michael@0 134
michael@0 135 /**
michael@0 136 * Constructor; statusReturn is the UErrorCode in/out parameter.
michael@0 137 */
michael@0 138 TransliteratorParser(UErrorCode &statusReturn);
michael@0 139
michael@0 140 /**
michael@0 141 * Destructor.
michael@0 142 */
michael@0 143 ~TransliteratorParser();
michael@0 144
michael@0 145 /**
michael@0 146 * Parse the given rules, in the given direction. After this call
michael@0 147 * returns, query the public data members (dataVector, idBlockVector,
michael@0 148 * compoundFilter) for results. Typically this method is called exactly
michael@0 149 * once after construction.
michael@0 150 *
michael@0 151 * NOTE(review): this comment used to say the caller owns the 'data' and
michael@0 152 * 'compoundFilter' members afterwards, but no member named 'data' is
michael@0 153 * declared in this class (presumably dataVector) -- TODO confirm. It also
michael@0 154 * described rules as newline-separated, contradicting @param rules below.
michael@0 155 * @param rules rules, separated by ';'
michael@0 156 * @param direction either FORWARD or REVERSE.
michael@0 157 * @param pe Struct to receive information on the position
michael@0 158 * of the error, if an error is encountered
michael@0 159 * @param ec Output param set to success/failure code.
michael@0 160 */
michael@0 161 void parse(const UnicodeString& rules,
michael@0 162 UTransDirection direction,
michael@0 163 UParseError& pe,
michael@0 164 UErrorCode& ec);
michael@0 165
michael@0 166 /**
michael@0 167 * Return the compound filter parsed by parse(). Caller owns result.
michael@0 168 * @return the compound filter parsed by parse().
michael@0 169 */
michael@0 170 UnicodeSet* orphanCompoundFilter();
michael@0 171
michael@0 172 private:
michael@0 173
michael@0 174 /**
michael@0 175 * Parse the given rule string in the given direction.
michael@0 176 * @param rules the rules to parse (input; passed by const reference).
michael@0 177 * @param direction either FORWARD or REVERSE.
michael@0 178 */
michael@0 179 void parseRules(const UnicodeString& rules,
michael@0 180 UTransDirection direction,
michael@0 181 UErrorCode& status);
michael@0 182
michael@0 183 /**
michael@0 184 * MAIN PARSER. Parse the next rule in the given rule string, starting
michael@0 185 * at pos. Return the index after the last character parsed. Do not
michael@0 186 * parse characters at or after limit.
michael@0 187 *
michael@0 188 * Important: The character at pos must be a non-whitespace character
michael@0 189 * that is not the comment character.
michael@0 190 *
michael@0 191 * This method handles quoting, escaping, and whitespace removal. It
michael@0 192 * parses the end-of-rule character. It recognizes context and cursor
michael@0 193 * indicators. Once it does a lexical breakdown of the rule at pos, it
michael@0 194 * creates a rule object and adds it to our rule list.
michael@0 195 * @param rule the rule string to parse (input; passed by const reference).
michael@0 196 * @param pos the starting position.
michael@0 197 * @param limit pointer past the last character of the rule.
michael@0 198 * @return the index after the last character parsed.
michael@0 199 */
michael@0 200 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
michael@0 201
michael@0 202 /**
michael@0 203 * Set the variable range to [start, end] (inclusive).
michael@0 204 * @param start the start value of the range.
michael@0 205 * @param end the end value of the range.
michael@0 206 */
michael@0 207 void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
michael@0 208
michael@0 209 /**
michael@0 210 * Assert that the given character is NOT within the variable range.
michael@0 211 * If it is, return FALSE. This is necessary to ensure that the
michael@0 212 * variable range does not overlap characters used in a rule.
michael@0 213 * @param ch the given character.
michael@0 214 * @return True, if the given character is NOT within the variable range.
michael@0 215 */
michael@0 216 UBool checkVariableRange(UChar32 ch) const;
michael@0 217
michael@0 218 /**
michael@0 219 * Set the maximum backup to 'backup', in response to a pragma
michael@0 220 * statement.
michael@0 221 * @param backup the new value to be set.
michael@0 222 */
michael@0 223 void pragmaMaximumBackup(int32_t backup);
michael@0 224
michael@0 225 /**
michael@0 226 * Begin normalizing all rules using the given mode, in response
michael@0 227 * to a pragma statement.
michael@0 228 * @param mode the given mode.
michael@0 229 */
michael@0 230 void pragmaNormalizeRules(UNormalizationMode mode);
michael@0 231
michael@0 232 /**
michael@0 233 * Return true if the given rule looks like a pragma.
michael@0 234 * @param pos offset to the first non-whitespace character
michael@0 235 * of the rule.
michael@0 236 * @param limit pointer past the last character of the rule.
michael@0 237 * @return true if the given rule looks like a pragma.
michael@0 238 */
michael@0 239 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
michael@0 240
michael@0 241 /**
michael@0 242 * Parse a pragma. This method assumes resemblesPragma() has
michael@0 243 * already returned true.
michael@0 244 * @param pos offset to the first non-whitespace character
michael@0 245 * of the rule.
michael@0 246 * @param limit pointer past the last character of the rule.
michael@0 247 * @return the position index after the final ';' of the pragma,
michael@0 248 * or -1 on failure.
michael@0 249 */
michael@0 250 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
michael@0 251
michael@0 252 /**
michael@0 253 * Called by main parser upon syntax error. Search the rule string
michael@0 254 * for the probable end of the rule. Of course, if the error is that
michael@0 255 * the end of rule marker is missing, then the rule end will not be found.
michael@0 256 * In any case the rule start will be correctly reported.
michael@0 257 * @param parseErrorCode error code.
michael@0 258 * @param msg error description (this parameter is unnamed in the declaration).
michael@0 259 * @param start position of first character of current rule.
michael@0 260 * @return start position of first character of current rule.
michael@0 261 */
michael@0 262 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
michael@0 263 UErrorCode& status);
michael@0 264
michael@0 265 /**
michael@0 266 * Parse a UnicodeSet out, store it, and return the stand-in character
michael@0 267 * used to represent it.
michael@0 268 *
michael@0 269 * @param rule the rule for UnicodeSet.
michael@0 270 * @param pos the position in pattern at which to start parsing.
michael@0 271 * @return the stand-in character used to represent it.
michael@0 272 */
michael@0 273 UChar parseSet(const UnicodeString& rule,
michael@0 274 ParsePosition& pos,
michael@0 275 UErrorCode& status);
michael@0 276
michael@0 277 /**
michael@0 278 * Generate and return a stand-in for a new UnicodeFunctor. Store
michael@0 279 * the matcher (adopt it).
michael@0 280 * @param adopted the UnicodeFunctor to be adopted.
michael@0 281 * @return a stand-in for a new UnicodeFunctor.
michael@0 282 */
michael@0 283 UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
michael@0 284
michael@0 285 /**
michael@0 286 * Return the standin for segment seg (1-based).
michael@0 287 * @param seg the given segment.
michael@0 288 * @return the standIn character for the given segment.
michael@0 289 */
michael@0 290 UChar getSegmentStandin(int32_t seg, UErrorCode& status);
michael@0 291
michael@0 292 /**
michael@0 293 * Set the object for segment seg (1-based).
michael@0 294 * @param seg the given segment.
michael@0 295 * @param adopted the StringMatcher to be adopted.
michael@0 296 */
michael@0 297 void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
michael@0 298
michael@0 299 /**
michael@0 300 * Return the stand-in for the dot set. It is allocated the first
michael@0 301 * time and reused thereafter.
michael@0 302 * @return the stand-in for the dot set.
michael@0 303 */
michael@0 304 UChar getDotStandIn(UErrorCode& status);
michael@0 305
michael@0 306 /**
michael@0 307 * Append the value of the given variable name to the given
michael@0 308 * UnicodeString.
michael@0 309 * @param name the variable name to be appended.
michael@0 310 * @param buf the given UnicodeString to append to.
michael@0 311 */
michael@0 312 void appendVariableDef(const UnicodeString& name,
michael@0 313 UnicodeString& buf,
michael@0 314 UErrorCode& status);
michael@0 315
michael@0 316 /**
michael@0 317 * Glue method to get around access restrictions in C++.
michael@0 318 */
michael@0 319 /*static Transliterator* createBasicInstance(const UnicodeString& id,
michael@0 320 const UnicodeString* canonID);*/
michael@0 321
michael@0 322 friend class RuleHalf;
michael@0 323
michael@0 324 // Disallowed methods; no impl.
michael@0 325 /**
michael@0 326 * Copy constructor (disallowed; declared without an implementation).
michael@0 327 */
michael@0 328 TransliteratorParser(const TransliteratorParser&);
michael@0 329
michael@0 330 /**
michael@0 331 * Assignment operator (disallowed; declared without an implementation).
michael@0 332 */
michael@0 333 TransliteratorParser& operator=(const TransliteratorParser&);
michael@0 334 };
michael@0 335
michael@0 336 U_NAMESPACE_END
michael@0 337
michael@0 338 #endif /* #ifdef __cplusplus */
michael@0 339
michael@0 340 /**
michael@0 341 * Strip/convert the following from the transliterator rules:
michael@0 342 * comments
michael@0 343 * newlines
michael@0 344 * white space at the beginning and end of a line
michael@0 345 * \u escape notation (unescaped)
michael@0 346 *
michael@0 347 * The target must be equal in size to the source.
michael@0 348 * @internal
michael@0 349 */
michael@0 350 U_CAPI int32_t
michael@0 351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
michael@0 352
michael@0 353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0 354
michael@0 355 #endif

mercurial