1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/rbt_pars.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,355 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 1999-2011, International Business Machines Corporation 1.7 +* and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 11/17/99 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 +#ifndef RBT_PARS_H 1.14 +#define RBT_PARS_H 1.15 + 1.16 +#include "unicode/utypes.h" 1.17 + 1.18 +#if !UCONFIG_NO_TRANSLITERATION 1.19 +#ifdef __cplusplus 1.20 + 1.21 +#include "unicode/uobject.h" 1.22 +#include "unicode/parseerr.h" 1.23 +#include "unicode/unorm.h" 1.24 +#include "rbt.h" 1.25 +#include "hash.h" 1.26 +#include "uvector.h" 1.27 + 1.28 +U_NAMESPACE_BEGIN 1.29 + 1.30 +class TransliterationRuleData; 1.31 +class UnicodeFunctor; 1.32 +class ParseData; 1.33 +class RuleHalf; 1.34 +class ParsePosition; 1.35 +class StringMatcher; 1.36 +/** Parser for transliterator rule sets: call parse(), then read the results from the public data members. */ 1.37 +class TransliteratorParser : public UMemory { 1.38 + 1.39 + public: 1.40 + 1.41 + /** 1.42 + * PUBLIC data member. A Vector of TransliterationRuleData objects, one for each discrete group 1.43 + * of rules in the rule set. 1.44 + */ 1.45 + UVector dataVector; 1.46 + 1.47 + /** 1.48 + * PUBLIC data member. 1.49 + * A Vector of UnicodeStrings containing all of the ID blocks in the rule set. 1.50 + */ 1.51 + UVector idBlockVector; 1.52 + 1.53 + /** 1.54 + * PUBLIC data member containing the parsed compound filter, if any. 1.55 + */ 1.56 + UnicodeSet* compoundFilter; 1.57 + 1.58 + private: 1.59 + 1.60 + /** 1.61 + * The current data object for which we are parsing rules. 1.62 + */ 1.63 + TransliterationRuleData* curData; 1.64 +/** Transliteration direction; presumably the direction passed to parse() -- TODO confirm. */ 1.65 + UTransDirection direction; 1.66 + 1.67 + /** 1.68 + * Parse error information. 
1.69 + */ 1.70 + UParseError parseError; 1.71 + 1.72 + /** 1.73 + * Temporary symbol table used during parsing. 1.74 + */ 1.75 + ParseData* parseData; 1.76 + 1.77 + /** 1.78 + * Temporary vector of matcher variables. When parsing is complete, this 1.79 + * is copied into the array data.variables. As with data.variables, 1.80 + * element 0 corresponds to character data.variablesBase. 1.81 + */ 1.82 + UVector variablesVector; 1.83 + 1.84 + /** 1.85 + * Temporary table of variable names. When parsing is complete, this is 1.86 + * copied into data.variableNames. 1.87 + */ 1.88 + Hashtable variableNames; 1.89 + 1.90 + /** 1.91 + * String of standins for segments. Used during the parsing of a single 1.92 + * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds 1.93 + * to StringMatcher object segmentObjects.elementAt(0), etc. 1.94 + */ 1.95 + UnicodeString segmentStandins; 1.96 + 1.97 + /** 1.98 + * Vector of StringMatcher objects for segments. Used during the 1.99 + * parsing of a single rule. 1.100 + * segmentStandins.charAt(0) is the standin for "$1" and corresponds 1.101 + * to StringMatcher object segmentObjects.elementAt(0), etc. 1.102 + */ 1.103 + UVector segmentObjects; 1.104 + 1.105 + /** 1.106 + * The next available stand-in for variables. This starts at some point in 1.107 + * the private use area (discovered dynamically) and increments up toward 1.108 + * <code>variableLimit</code>. At any point during parsing, available 1.109 + * variables are <code>variableNext..variableLimit-1</code>. 1.110 + */ 1.111 + UChar variableNext; 1.112 + 1.113 + /** 1.114 + * The exclusive upper limit of the stand-ins for variables (one past the 1.115 + * last available one). This is discovered dynamically. At any point during parsing, available variables are 1.116 + * <code>variableNext..variableLimit-1</code>. 
1.117 + */ 1.118 + UChar variableLimit; 1.119 + 1.120 + /** 1.121 + * When we encounter an undefined variable, we do not immediately signal 1.122 + * an error, in case we are defining this variable, e.g., "$a = [a-z];". 1.123 + * Instead, we save the name of the undefined variable, and substitute 1.124 + * in the placeholder char variableLimit - 1, and decrement 1.125 + * variableLimit. 1.126 + */ 1.127 + UnicodeString undefinedVariableName; 1.128 + 1.129 + /** 1.130 + * The stand-in character for the 'dot' set, represented by '.' in 1.131 + * patterns. This is allocated the first time it is needed, and 1.132 + * reused thereafter. 1.133 + */ 1.134 + UChar dotStandIn; 1.135 + 1.136 +public: 1.137 + 1.138 + /** 1.139 + * Constructor. 1.140 + */ 1.141 + TransliteratorParser(UErrorCode &statusReturn); 1.142 + 1.143 + /** 1.144 + * Destructor. 1.145 + */ 1.146 + ~TransliteratorParser(); 1.147 + 1.148 + /** 1.149 + * Parse the given string as a sequence of rules, separated by newline 1.150 + * characters ('\n'), and cause this object to implement those rules. Any 1.151 + * previous rules are discarded. Typically this method is called exactly 1.152 + * once after construction. 1.153 + * 1.154 + * Parse the given rules, in the given direction. After this call 1.155 + * returns, query the public data members for results. The caller 1.156 + * owns the 'data' (presumably now 'dataVector' -- TODO confirm) and 'compoundFilter' data members after this 1.157 + * call returns. 1.158 + * @param rules rules, separated by ';' 1.159 + * @param direction either FORWARD or REVERSE. 1.160 + * @param pe Struct to receive information on position 1.161 + * of error if an error is encountered 1.162 + * @param ec Output param set to success/failure code. 1.163 + */ 1.164 + void parse(const UnicodeString& rules, 1.165 + UTransDirection direction, 1.166 + UParseError& pe, 1.167 + UErrorCode& ec); 1.168 + 1.169 + /** 1.170 + * Return the compound filter parsed by parse(). Caller owns result. 
1.171 + * @return the compound filter parsed by parse(). 1.172 + */ 1.173 + UnicodeSet* orphanCompoundFilter(); 1.174 + 1.175 +private: 1.176 + 1.177 + /** 1.178 + * Parse the given rules in the given direction (NOTE(review): presumably the worker behind parse() -- confirm). 1.179 + * @param rules the rule text to parse (input; passed by const reference). 1.180 + * @param direction either FORWARD or REVERSE. 1.181 + */ 1.182 + void parseRules(const UnicodeString& rules, 1.183 + UTransDirection direction, 1.184 + UErrorCode& status); 1.185 + 1.186 + /** 1.187 + * MAIN PARSER. Parse the next rule in the given rule string, starting 1.188 + * at pos. Return the index after the last character parsed. Do not 1.189 + * parse characters at or after limit. 1.190 + * 1.191 + * Important: The character at pos must be a non-whitespace character 1.192 + * that is not the comment character. 1.193 + * 1.194 + * This method handles quoting, escaping, and whitespace removal. It 1.195 + * parses the end-of-rule character. It recognizes context and cursor 1.196 + * indicators. Once it does a lexical breakdown of the rule at pos, it 1.197 + * creates a rule object and adds it to our rule list. 1.198 + * @param rule the rule string being parsed (input; passed by const reference). 1.199 + * @param pos the starting position. 1.200 + * @param limit offset just past the last character of the rule. 1.201 + * @return the index after the last character parsed. 1.202 + */ 1.203 + int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 1.204 + 1.205 + /** 1.206 + * Set the variable range to [start, end] (inclusive). 1.207 + * @param start the start value of the range. 1.208 + * @param end the end value of the range. 1.209 + */ 1.210 + void setVariableRange(int32_t start, int32_t end, UErrorCode& status); 1.211 + 1.212 + /** 1.213 + * Assert that the given character is NOT within the variable range. 1.214 + * If it is, return FALSE. This is necessary to ensure that the 1.215 + * variable range does not overlap characters used in a rule. 
1.216 + * @param ch the given character. 1.217 + * @return True, if the given character is NOT within the variable range. 1.218 + */ 1.219 + UBool checkVariableRange(UChar32 ch) const; 1.220 + 1.221 + /** 1.222 + * Set the maximum backup to 'backup', in response to a pragma 1.223 + * statement. 1.224 + * @param backup the new value to be set. 1.225 + */ 1.226 + void pragmaMaximumBackup(int32_t backup); 1.227 + 1.228 + /** 1.229 + * Begin normalizing all rules using the given mode, in response 1.230 + * to a pragma statement. 1.231 + * @param mode the given mode. 1.232 + */ 1.233 + void pragmaNormalizeRules(UNormalizationMode mode); 1.234 + 1.235 + /** 1.236 + * Return true if the given rule looks like a pragma. 1.237 + * @param pos offset to the first non-whitespace character 1.238 + * of the rule. 1.239 + * @param limit offset just past the last character of the rule. 1.240 + * @return true if the given rule looks like a pragma. 1.241 + */ 1.242 + static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); 1.243 + 1.244 + /** 1.245 + * Parse a pragma. This method assumes resemblesPragma() has 1.246 + * already returned true. 1.247 + * @param pos offset to the first non-whitespace character 1.248 + * of the rule. 1.249 + * @param limit offset just past the last character of the rule. 1.250 + * @return the position index after the final ';' of the pragma, 1.251 + * or -1 on failure. 1.252 + */ 1.253 + int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 1.254 + 1.255 + /** 1.256 + * Called by main parser upon syntax error. Search the rule string 1.257 + * for the probable end of the rule. Of course, if the error is that 1.258 + * the end of rule marker is missing, then the rule end will not be found. 1.259 + * In any case the rule start will be correctly reported. 1.260 + * @param parseErrorCode error code. 1.261 + * @param msg error description (this parameter is left unnamed in the declaration). 
1.262 + * @param start position of first character of current rule. 1.263 + * @return start position of first character of current rule. 1.264 + */ 1.265 + int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, 1.266 + UErrorCode& status); 1.267 + 1.268 + /** 1.269 + * Parse a UnicodeSet out, store it, and return the stand-in character 1.270 + * used to represent it. 1.271 + * 1.272 + * @param rule the rule for UnicodeSet. 1.273 + * @param pos the position in pattern at which to start parsing. 1.274 + * @return the stand-in character used to represent it. 1.275 + */ 1.276 + UChar parseSet(const UnicodeString& rule, 1.277 + ParsePosition& pos, 1.278 + UErrorCode& status); 1.279 + 1.280 + /** 1.281 + * Generate and return a stand-in for a new UnicodeFunctor. Store 1.282 + * the matcher (adopt it). 1.283 + * @param adopted the UnicodeFunctor to be adopted. 1.284 + * @return a stand-in for a new UnicodeFunctor. 1.285 + */ 1.286 + UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); 1.287 + 1.288 + /** 1.289 + * Return the standin for segment seg (1-based). 1.290 + * @param seg the given segment. 1.291 + * @return the standIn character for the given segment. 1.292 + */ 1.293 + UChar getSegmentStandin(int32_t seg, UErrorCode& status); 1.294 + 1.295 + /** 1.296 + * Set the object for segment seg (1-based). 1.297 + * @param seg the given segment. 1.298 + * @param adopted the StringMatcher to be adopted. 1.299 + */ 1.300 + void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); 1.301 + 1.302 + /** 1.303 + * Return the stand-in for the dot set. It is allocated the first 1.304 + * time and reused thereafter. 1.305 + * @return the stand-in for the dot set. 1.306 + */ 1.307 + UChar getDotStandIn(UErrorCode& status); 1.308 + 1.309 + /** 1.310 + * Append the value of the given variable name to the given 1.311 + * UnicodeString. 1.312 + * @param name the name of the variable whose value is appended. 
1.313 + * @param buf the given UnicodeString to append to. 1.314 + */ 1.315 + void appendVariableDef(const UnicodeString& name, 1.316 + UnicodeString& buf, 1.317 + UErrorCode& status); 1.318 + 1.319 + /** 1.320 + * Glue method to get around access restrictions in C++ (the declaration below is commented out). 1.321 + */ 1.322 + /*static Transliterator* createBasicInstance(const UnicodeString& id, 1.323 + const UnicodeString* canonID);*/ 1.324 + 1.325 + friend class RuleHalf; 1.326 + 1.327 + // Disallowed methods; no impl. 1.328 + /** 1.329 + * Copy constructor (declared private with no implementation, so copying is disallowed). 1.330 + */ 1.331 + TransliteratorParser(const TransliteratorParser&); 1.332 + 1.333 + /** 1.334 + * Assignment operator (declared private with no implementation, so assignment is disallowed). 1.335 + */ 1.336 + TransliteratorParser& operator=(const TransliteratorParser&); 1.337 +}; 1.338 + 1.339 +U_NAMESPACE_END 1.340 + 1.341 +#endif /* #ifdef __cplusplus */ 1.342 + 1.343 +/** 1.344 + * Strip/convert the following from the transliterator rules: 1.345 + * comments 1.346 + * newlines 1.347 + * white space at the beginning and end of a line 1.348 + * unescape \u notation 1.349 + * 1.350 + * The target buffer must be equal in size to the source. 1.351 + * @internal 1.352 + */ 1.353 +U_CAPI int32_t 1.354 +utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); 1.355 + 1.356 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.357 + 1.358 +#endif