Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 1999-2011, International Business Machines Corporation |
michael@0 | 4 | * and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * Date Name Description |
michael@0 | 7 | * 11/17/99 aliu Creation. |
michael@0 | 8 | ********************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | #ifndef RBT_PARS_H |
michael@0 | 11 | #define RBT_PARS_H |
michael@0 | 12 | |
michael@0 | 13 | #include "unicode/utypes.h" |
michael@0 | 14 | |
michael@0 | 15 | #if !UCONFIG_NO_TRANSLITERATION |
michael@0 | 16 | #ifdef __cplusplus |
michael@0 | 17 | |
michael@0 | 18 | #include "unicode/uobject.h" |
michael@0 | 19 | #include "unicode/parseerr.h" |
michael@0 | 20 | #include "unicode/unorm.h" |
michael@0 | 21 | #include "rbt.h" |
michael@0 | 22 | #include "hash.h" |
michael@0 | 23 | #include "uvector.h" |
michael@0 | 24 | |
michael@0 | 25 | U_NAMESPACE_BEGIN |
michael@0 | 26 | |
michael@0 | 27 | class TransliterationRuleData; |
michael@0 | 28 | class UnicodeFunctor; |
michael@0 | 29 | class ParseData; |
michael@0 | 30 | class RuleHalf; |
michael@0 | 31 | class ParsePosition; |
michael@0 | 32 | class StringMatcher; |
michael@0 | 33 | |
michael@0 | 34 | class TransliteratorParser : public UMemory { |
michael@0 | 35 | /* Parses transliterator rules; after parse() returns, results are read from the public data members. */ |
michael@0 | 36 | public: |
michael@0 | 37 | |
michael@0 | 38 | /** |
michael@0 | 39 | * PUBLIC data member. A Vector of TransliterationRuleData objects, one for |
michael@0 | 40 | * each discrete group of rules in the rule set. |
michael@0 | 41 | */ |
michael@0 | 42 | UVector dataVector; |
michael@0 | 43 | |
michael@0 | 44 | /** |
michael@0 | 45 | * PUBLIC data member. |
michael@0 | 46 | * A Vector of UnicodeStrings containing all of the ID blocks in the rule set |
michael@0 | 47 | */ |
michael@0 | 48 | UVector idBlockVector; |
michael@0 | 49 | |
michael@0 | 50 | /** |
michael@0 | 51 | * PUBLIC data member containing the parsed compound filter, if any. |
michael@0 | 52 | */ |
michael@0 | 53 | UnicodeSet* compoundFilter; |
michael@0 | 54 | |
michael@0 | 55 | private: |
michael@0 | 56 | |
michael@0 | 57 | /** |
michael@0 | 58 | * The current data object for which we are parsing rules. |
michael@0 | 59 | */ |
michael@0 | 60 | TransliterationRuleData* curData; |
michael@0 | 61 | /* Rule direction; presumably a copy of parse()'s 'direction' argument -- TODO confirm in the implementation. */ |
michael@0 | 62 | UTransDirection direction; |
michael@0 | 63 | |
michael@0 | 64 | /** |
michael@0 | 65 | * Parse error information. |
michael@0 | 66 | */ |
michael@0 | 67 | UParseError parseError; |
michael@0 | 68 | |
michael@0 | 69 | /** |
michael@0 | 70 | * Temporary symbol table used during parsing. |
michael@0 | 71 | */ |
michael@0 | 72 | ParseData* parseData; |
michael@0 | 73 | |
michael@0 | 74 | /** |
michael@0 | 75 | * Temporary vector of matcher variables. When parsing is complete, this |
michael@0 | 76 | * is copied into the array data.variables. As with data.variables, |
michael@0 | 77 | * element 0 corresponds to character data.variablesBase. |
michael@0 | 78 | */ |
michael@0 | 79 | UVector variablesVector; |
michael@0 | 80 | |
michael@0 | 81 | /** |
michael@0 | 82 | * Temporary table of variable names. When parsing is complete, this is |
michael@0 | 83 | * copied into data.variableNames. |
michael@0 | 84 | */ |
michael@0 | 85 | Hashtable variableNames; |
michael@0 | 86 | |
michael@0 | 87 | /** |
michael@0 | 88 | * String of standins for segments. Used during the parsing of a single |
michael@0 | 89 | * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds |
michael@0 | 90 | * to StringMatcher object segmentObjects.elementAt(0), etc. |
michael@0 | 91 | */ |
michael@0 | 92 | UnicodeString segmentStandins; |
michael@0 | 93 | |
michael@0 | 94 | /** |
michael@0 | 95 | * Vector of StringMatcher objects for segments. Used during the |
michael@0 | 96 | * parsing of a single rule. |
michael@0 | 97 | * segmentStandins.charAt(0) is the standin for "$1" and corresponds |
michael@0 | 98 | * to StringMatcher object segmentObjects.elementAt(0), etc. |
michael@0 | 99 | */ |
michael@0 | 100 | UVector segmentObjects; |
michael@0 | 101 | |
michael@0 | 102 | /** |
michael@0 | 103 | * The next available stand-in for variables. This starts at some point in |
michael@0 | 104 | * the private use area (discovered dynamically) and increments up toward |
michael@0 | 105 | * <code>variableLimit</code>. At any point during parsing, available |
michael@0 | 106 | * variables are <code>variableNext..variableLimit-1</code>. |
michael@0 | 107 | */ |
michael@0 | 108 | UChar variableNext; |
michael@0 | 109 | |
michael@0 | 110 | /** |
michael@0 | 111 | * One past the last available stand-in for variables. This is discovered |
michael@0 | 112 | * dynamically. At any point during parsing, available variables are |
michael@0 | 113 | * <code>variableNext..variableLimit-1</code>. |
michael@0 | 114 | */ |
michael@0 | 115 | UChar variableLimit; |
michael@0 | 116 | |
michael@0 | 117 | /** |
michael@0 | 118 | * When we encounter an undefined variable, we do not immediately signal |
michael@0 | 119 | * an error, in case we are defining this variable, e.g., "$a = [a-z];". |
michael@0 | 120 | * Instead, we save the name of the undefined variable, and substitute |
michael@0 | 121 | * in the placeholder char variableLimit - 1, and decrement |
michael@0 | 122 | * variableLimit. |
michael@0 | 123 | */ |
michael@0 | 124 | UnicodeString undefinedVariableName; |
michael@0 | 125 | |
michael@0 | 126 | /** |
michael@0 | 127 | * The stand-in character for the 'dot' set, represented by '.' in |
michael@0 | 128 | * patterns. This is allocated the first time it is needed, and |
michael@0 | 129 | * reused thereafter. |
michael@0 | 130 | */ |
michael@0 | 131 | UChar dotStandIn; |
michael@0 | 132 | |
michael@0 | 133 | public: |
michael@0 | 134 | |
michael@0 | 135 | /** |
michael@0 | 136 | * Constructor. |
michael@0 | 137 | */ |
michael@0 | 138 | TransliteratorParser(UErrorCode &statusReturn); |
michael@0 | 139 | |
michael@0 | 140 | /** |
michael@0 | 141 | * Destructor. |
michael@0 | 142 | */ |
michael@0 | 143 | ~TransliteratorParser(); |
michael@0 | 144 | |
michael@0 | 145 | /** |
michael@0 | 146 | * Parse the given string as a sequence of rules, separated by newline |
michael@0 | 147 | * characters ('\n'), and cause this object to implement those rules. Any |
michael@0 | 148 | * previous rules are discarded. Typically this method is called exactly |
michael@0 | 149 | * once after construction. |
michael@0 | 150 | * |
michael@0 | 151 | * Parse the given rules, in the given direction. After this call |
michael@0 | 152 | * returns, query the public data members for results. The caller |
michael@0 | 153 | * owns the 'data' and 'compoundFilter' data members after this |
michael@0 | 154 | * call returns. |
michael@0 | 155 | * @param rules rules, separated by ';' |
michael@0 | 156 | * @param direction either FORWARD or REVERSE. |
michael@0 | 157 | * @param pe Struct to receive information on position |
michael@0 | 158 | * of error if an error is encountered |
michael@0 | 159 | * @param ec Output param set to success/failure code. |
michael@0 | 160 | */ |
michael@0 | 161 | void parse(const UnicodeString& rules, |
michael@0 | 162 | UTransDirection direction, |
michael@0 | 163 | UParseError& pe, |
michael@0 | 164 | UErrorCode& ec); |
michael@0 | 165 | |
michael@0 | 166 | /** |
michael@0 | 167 | * Return the compound filter parsed by parse(). Caller owns result. |
michael@0 | 168 | * @return the compound filter parsed by parse(). |
michael@0 | 169 | */ |
michael@0 | 170 | UnicodeSet* orphanCompoundFilter(); |
michael@0 | 171 | |
michael@0 | 172 | private: |
michael@0 | 173 | |
michael@0 | 174 | /** |
michael@0 | 175 | * Parse the given rules in the given direction; returns nothing (void). |
michael@0 | 176 | * @param rules the rules to parse (const input, not an output param). |
michael@0 | 177 | * @param direction either FORWARD or REVERSE. |
michael@0 | 178 | */ |
michael@0 | 179 | void parseRules(const UnicodeString& rules, |
michael@0 | 180 | UTransDirection direction, |
michael@0 | 181 | UErrorCode& status); |
michael@0 | 182 | |
michael@0 | 183 | /** |
michael@0 | 184 | * MAIN PARSER. Parse the next rule in the given rule string, starting |
michael@0 | 185 | * at pos. Return the index after the last character parsed. Do not |
michael@0 | 186 | * parse characters at or after limit. |
michael@0 | 187 | * |
michael@0 | 188 | * Important: The character at pos must be a non-whitespace character |
michael@0 | 189 | * that is not the comment character. |
michael@0 | 190 | * |
michael@0 | 191 | * This method handles quoting, escaping, and whitespace removal. It |
michael@0 | 192 | * parses the end-of-rule character. It recognizes context and cursor |
michael@0 | 193 | * indicators. Once it does a lexical breakdown of the rule at pos, it |
michael@0 | 194 | * creates a rule object and adds it to our rule list. |
michael@0 | 195 | * @param rule the rule string to parse (const input). |
michael@0 | 196 | * @param pos the starting position. |
michael@0 | 197 | * @param limit index just past the last character of the rule. |
michael@0 | 198 | * @return the index after the last character parsed. |
michael@0 | 199 | */ |
michael@0 | 200 | int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); |
michael@0 | 201 | |
michael@0 | 202 | /** |
michael@0 | 203 | * Set the variable range to [start, end] (inclusive). |
michael@0 | 204 | * @param start the start value of the range. |
michael@0 | 205 | * @param end the end value of the range. |
michael@0 | 206 | */ |
michael@0 | 207 | void setVariableRange(int32_t start, int32_t end, UErrorCode& status); |
michael@0 | 208 | |
michael@0 | 209 | /** |
michael@0 | 210 | * Assert that the given character is NOT within the variable range. |
michael@0 | 211 | * If it is, return FALSE. This is necessary to ensure that the |
michael@0 | 212 | * variable range does not overlap characters used in a rule. |
michael@0 | 213 | * @param ch the given character. |
michael@0 | 214 | * @return True, if the given character is NOT within the variable range. |
michael@0 | 215 | */ |
michael@0 | 216 | UBool checkVariableRange(UChar32 ch) const; |
michael@0 | 217 | |
michael@0 | 218 | /** |
michael@0 | 219 | * Set the maximum backup to 'backup', in response to a pragma |
michael@0 | 220 | * statement. |
michael@0 | 221 | * @param backup the new value to be set. |
michael@0 | 222 | */ |
michael@0 | 223 | void pragmaMaximumBackup(int32_t backup); |
michael@0 | 224 | |
michael@0 | 225 | /** |
michael@0 | 226 | * Begin normalizing all rules using the given mode, in response |
michael@0 | 227 | * to a pragma statement. |
michael@0 | 228 | * @param mode the given mode. |
michael@0 | 229 | */ |
michael@0 | 230 | void pragmaNormalizeRules(UNormalizationMode mode); |
michael@0 | 231 | |
michael@0 | 232 | /** |
michael@0 | 233 | * Return true if the given rule looks like a pragma. |
michael@0 | 234 | * @param pos offset to the first non-whitespace character |
michael@0 | 235 | * of the rule. |
michael@0 | 236 | * @param limit index just past the last character of the rule. |
michael@0 | 237 | * @return true if the given rule looks like a pragma. |
michael@0 | 238 | */ |
michael@0 | 239 | static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); |
michael@0 | 240 | |
michael@0 | 241 | /** |
michael@0 | 242 | * Parse a pragma. This method assumes resemblesPragma() has |
michael@0 | 243 | * already returned true. |
michael@0 | 244 | * @param pos offset to the first non-whitespace character |
michael@0 | 245 | * of the rule. |
michael@0 | 246 | * @param limit index just past the last character of the rule. |
michael@0 | 247 | * @return the position index after the final ';' of the pragma, |
michael@0 | 248 | * or -1 on failure. |
michael@0 | 249 | */ |
michael@0 | 250 | int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); |
michael@0 | 251 | |
michael@0 | 252 | /** |
michael@0 | 253 | * Called by main parser upon syntax error. Search the rule string |
michael@0 | 254 | * for the probable end of the rule. Of course, if the error is that |
michael@0 | 255 | * the end of rule marker is missing, then the rule end will not be found. |
michael@0 | 256 | * In any case the rule start will be correctly reported. |
michael@0 | 257 | * @param parseErrorCode error code. |
michael@0 | 258 | * @param msg error description. |
michael@0 | 259 | * @param start position of first character of current rule. |
michael@0 | 260 | * @return start position of first character of current rule. |
michael@0 | 261 | */ |
michael@0 | 262 | int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, |
michael@0 | 263 | UErrorCode& status); |
michael@0 | 264 | |
michael@0 | 265 | /** |
michael@0 | 266 | * Parse a UnicodeSet out, store it, and return the stand-in character |
michael@0 | 267 | * used to represent it. |
michael@0 | 268 | * |
michael@0 | 269 | * @param rule the rule for UnicodeSet. |
michael@0 | 270 | * @param pos the position in pattern at which to start parsing. |
michael@0 | 271 | * @return the stand-in character used to represent it. |
michael@0 | 272 | */ |
michael@0 | 273 | UChar parseSet(const UnicodeString& rule, |
michael@0 | 274 | ParsePosition& pos, |
michael@0 | 275 | UErrorCode& status); |
michael@0 | 276 | |
michael@0 | 277 | /** |
michael@0 | 278 | * Generate and return a stand-in for a new UnicodeFunctor. Store |
michael@0 | 279 | * the matcher (adopt it). |
michael@0 | 280 | * @param adopted the UnicodeFunctor to be adopted. |
michael@0 | 281 | * @return a stand-in for a new UnicodeFunctor. |
michael@0 | 282 | */ |
michael@0 | 283 | UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); |
michael@0 | 284 | |
michael@0 | 285 | /** |
michael@0 | 286 | * Return the standin for segment seg (1-based). |
michael@0 | 287 | * @param seg the given segment. |
michael@0 | 288 | * @return the standIn character for the given segment. |
michael@0 | 289 | */ |
michael@0 | 290 | UChar getSegmentStandin(int32_t seg, UErrorCode& status); |
michael@0 | 291 | |
michael@0 | 292 | /** |
michael@0 | 293 | * Set the object for segment seg (1-based). |
michael@0 | 294 | * @param seg the given segment. |
michael@0 | 295 | * @param adopted the StringMatcher to be adopted. |
michael@0 | 296 | */ |
michael@0 | 297 | void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); |
michael@0 | 298 | |
michael@0 | 299 | /** |
michael@0 | 300 | * Return the stand-in for the dot set. It is allocated the first |
michael@0 | 301 | * time and reused thereafter. |
michael@0 | 302 | * @return the stand-in for the dot set. |
michael@0 | 303 | */ |
michael@0 | 304 | UChar getDotStandIn(UErrorCode& status); |
michael@0 | 305 | |
michael@0 | 306 | /** |
michael@0 | 307 | * Append the value of the given variable name to the given |
michael@0 | 308 | * UnicodeString. |
michael@0 | 309 | * @param name the variable name to be appended. |
michael@0 | 310 | * @param buf the given UnicodeString to append to. |
michael@0 | 311 | */ |
michael@0 | 312 | void appendVariableDef(const UnicodeString& name, |
michael@0 | 313 | UnicodeString& buf, |
michael@0 | 314 | UErrorCode& status); |
michael@0 | 315 | |
michael@0 | 316 | /** |
michael@0 | 317 | * Glue method to get around access restrictions in C++. |
michael@0 | 318 | */ |
michael@0 | 319 | /*static Transliterator* createBasicInstance(const UnicodeString& id, |
michael@0 | 320 | const UnicodeString* canonID);*/ |
michael@0 | 321 | |
michael@0 | 322 | friend class RuleHalf; |
michael@0 | 323 | |
michael@0 | 324 | // Disallowed methods; no impl. |
michael@0 | 325 | /** |
michael@0 | 326 | * Copy constructor |
michael@0 | 327 | */ |
michael@0 | 328 | TransliteratorParser(const TransliteratorParser&); |
michael@0 | 329 | |
michael@0 | 330 | /** |
michael@0 | 331 | * Assignment operator |
michael@0 | 332 | */ |
michael@0 | 333 | TransliteratorParser& operator=(const TransliteratorParser&); |
michael@0 | 334 | }; |
michael@0 | 335 | |
michael@0 | 336 | U_NAMESPACE_END |
michael@0 | 337 | |
michael@0 | 338 | #endif /* #ifdef __cplusplus */ |
michael@0 | 339 | |
michael@0 | 340 | /** |
michael@0 | 341 | * Strip/convert the following from the transliterator rules: |
michael@0 | 342 | * comments |
michael@0 | 343 | * newlines |
michael@0 | 344 | * white space at the beginning and end of a line |
michael@0 | 345 | * \u escape notation (unescaped) |
michael@0 | 346 | * |
michael@0 | 347 | * The target buffer must be equal in size to the source. |
michael@0 | 348 | * @internal |
michael@0 | 349 | */ |
michael@0 | 350 | U_CAPI int32_t |
michael@0 | 351 | utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); |
michael@0 | 352 | |
michael@0 | 353 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
michael@0 | 354 | |
michael@0 | 355 | #endif |