intl/icu/source/i18n/rbt_pars.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/rbt_pars.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,355 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +* Copyright (C) 1999-2011, International Business Machines Corporation
     1.7 +* and others. All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   11/17/99    aliu        Creation.
    1.11 +**********************************************************************
    1.12 +*/
    1.13 +#ifndef RBT_PARS_H
    1.14 +#define RBT_PARS_H
    1.15 +
    1.16 +#include "unicode/utypes.h"
    1.17 +
    1.18 +#if !UCONFIG_NO_TRANSLITERATION
    1.19 +#ifdef __cplusplus
    1.20 +
    1.21 +#include "unicode/uobject.h"
    1.22 +#include "unicode/parseerr.h"
    1.23 +#include "unicode/unorm.h"
    1.24 +#include "rbt.h"
    1.25 +#include "hash.h"
    1.26 +#include "uvector.h"
    1.27 +
    1.28 +U_NAMESPACE_BEGIN
    1.29 +
    1.30 +class TransliterationRuleData;
    1.31 +class UnicodeFunctor;
    1.32 +class ParseData;
    1.33 +class RuleHalf;
    1.34 +class ParsePosition;
    1.35 +class StringMatcher;
    1.36 +
    1.37 +class TransliteratorParser : public UMemory {
    1.38 +
    1.39 + public:
    1.40 +
    1.41 +    /**
    1.42 +     * PUBLIC data member.  A Vector of TransliterationRuleData objects, one
    1.43 +     * for each discrete group of rules in the rule set
    1.44 +     */
    1.45 +    UVector dataVector;
    1.46 +
    1.47 +    /**
    1.48 +     * PUBLIC data member.
    1.49 +     * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
    1.50 +     */
    1.51 +    UVector idBlockVector;
    1.52 +
    1.53 +    /**
    1.54 +     * PUBLIC data member containing the parsed compound filter, if any.
    1.55 +     */
    1.56 +    UnicodeSet* compoundFilter;
    1.57 +
    1.58 + private:
    1.59 +
    1.60 +    /**
    1.61 +     * The current data object for which we are parsing rules
    1.62 +     */
    1.63 +    TransliterationRuleData* curData;
    1.64 +
    1.65 +    UTransDirection direction;
    1.66 +
    1.67 +    /**
    1.68 +     * Parse error information.
    1.69 +     */
    1.70 +    UParseError parseError;
    1.71 +
    1.72 +    /**
    1.73 +     * Temporary symbol table used during parsing.
    1.74 +     */
    1.75 +    ParseData* parseData;
    1.76 +
    1.77 +    /**
    1.78 +     * Temporary vector of matcher variables.  When parsing is complete, this
    1.79 +     * is copied into the array data.variables.  As with data.variables,
    1.80 +     * element 0 corresponds to character data.variablesBase.
    1.81 +     */
    1.82 +    UVector variablesVector;
    1.83 +
    1.84 +    /**
    1.85 +     * Temporary table of variable names.  When parsing is complete, this is
    1.86 +     * copied into data.variableNames.
    1.87 +     */
    1.88 +    Hashtable variableNames;    
    1.89 +    
    1.90 +    /**
    1.91 +     * String of standins for segments.  Used during the parsing of a single
    1.92 +     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
    1.93 +     * to StringMatcher object segmentObjects.elementAt(0), etc.
    1.94 +     */
    1.95 +    UnicodeString segmentStandins;
    1.96 +
    1.97 +    /**
    1.98 +     * Vector of StringMatcher objects for segments.  Used during the
    1.99 +     * parsing of a single rule.
   1.100 +     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
   1.101 +     * to StringMatcher object segmentObjects.elementAt(0), etc.
   1.102 +     */
   1.103 +    UVector segmentObjects;
   1.104 +
   1.105 +    /**
   1.106 +     * The next available stand-in for variables.  This starts at some point in
   1.107 +     * the private use area (discovered dynamically) and increments up toward
   1.108 +     * <code>variableLimit</code>.  At any point during parsing, available
   1.109 +     * variables are <code>variableNext..variableLimit-1</code>.
   1.110 +     */
   1.111 +    UChar variableNext;
   1.112 +
   1.113 +    /**
   1.114 +     * The limit (exclusive) of the available stand-ins for variables.  This is
   1.115 +     * discovered dynamically.  At any point during parsing, available variables
   1.116 +     * are <code>variableNext..variableLimit-1</code>.
   1.117 +     */
   1.118 +    UChar variableLimit;
   1.119 +
   1.120 +    /**
   1.121 +     * When we encounter an undefined variable, we do not immediately signal
   1.122 +     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
   1.123 +     * Instead, we save the name of the undefined variable, and substitute
   1.124 +     * in the placeholder char variableLimit - 1, and decrement
   1.125 +     * variableLimit.
   1.126 +     */
   1.127 +    UnicodeString undefinedVariableName;
   1.128 +
   1.129 +    /**
   1.130 +     * The stand-in character for the 'dot' set, represented by '.' in
   1.131 +     * patterns.  This is allocated the first time it is needed, and
   1.132 +     * reused thereafter.
   1.133 +     */
   1.134 +    UChar dotStandIn;
   1.135 +
   1.136 +public:
   1.137 +
   1.138 +    /**
   1.139 +     * Constructor.
   1.140 +     */
   1.141 +    TransliteratorParser(UErrorCode &statusReturn);
   1.142 +
   1.143 +    /**
   1.144 +     * Destructor.
   1.145 +     */
   1.146 +    ~TransliteratorParser();
   1.147 +
   1.148 +    /**
   1.149 +     * Parse the given string as a sequence of rules and cause this object to
   1.150 +     * implement those rules.  Any previous rules are discarded.  Typically
   1.151 +     * this method is called exactly once after construction.
   1.152 +     * NOTE(review): rule separator is ';' per @param rules (not '\n') -- confirm.
   1.153 +     *
   1.154 +     * Parse the given rules, in the given direction.  After this call
   1.155 +     * returns, query the public data members for results.  The caller
   1.156 +     * owns the contents of 'dataVector' and the 'compoundFilter' data
   1.157 +     * member after this call returns.
   1.158 +     * @param rules      rules, separated by ';'
   1.159 +     * @param direction  either FORWARD or REVERSE.
   1.160 +     * @param pe         Struct to receive information on position
   1.161 +     *                   of error if an error is encountered
   1.162 +     * @param ec         Output param set to success/failure code.
   1.163 +     */
   1.164 +    void parse(const UnicodeString& rules,
   1.165 +               UTransDirection direction,
   1.166 +               UParseError& pe,
   1.167 +               UErrorCode& ec);
   1.168 +
   1.169 +    /**
   1.170 +     * Return the compound filter parsed by parse().  Caller owns result.
   1.171 +     * @return the compound filter parsed by parse().
   1.172 +     */ 
   1.173 +    UnicodeSet* orphanCompoundFilter();
   1.174 +
   1.175 +private:
   1.176 +
   1.177 +    /**
   1.178 +     * Parse the given rule string in the given direction.  Returns nothing.
   1.179 +     * @param rules      the rules to parse (input; passed by const reference).
   1.180 +     * @param direction  either FORWARD or REVERSE.
   1.181 +     */
   1.182 +    void parseRules(const UnicodeString& rules,
   1.183 +                    UTransDirection direction,
   1.184 +                    UErrorCode& status);
   1.185 +
   1.186 +    /**
   1.187 +     * MAIN PARSER.  Parse the next rule in the given rule string, starting
   1.188 +     * at pos.  Return the index after the last character parsed.  Do not
   1.189 +     * parse characters at or after limit.
   1.190 +     *
   1.191 +     * Important:  The character at pos must be a non-whitespace character
   1.192 +     * that is not the comment character.
   1.193 +     *
   1.194 +     * This method handles quoting, escaping, and whitespace removal.  It
   1.195 +     * parses the end-of-rule character.  It recognizes context and cursor
   1.196 +     * indicators.  Once it does a lexical breakdown of the rule at pos, it
   1.197 +     * creates a rule object and adds it to our rule list.
   1.198 +     * @param rule       the rule string to parse (input).
   1.199 +     * @param pos        the starting position.
   1.200 +     * @param limit      index just past the last character of the rule.
   1.201 +     * @return           the index after the last character parsed.
   1.202 +     */
   1.203 +    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
   1.204 +
   1.205 +    /**
   1.206 +     * Set the variable range to [start, end] (inclusive).
   1.207 +     * @param start    the start value of the range.
   1.208 +     * @param end      the end value of the range.
   1.209 +     */
   1.210 +    void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
   1.211 +
   1.212 +    /**
   1.213 +     * Assert that the given character is NOT within the variable range.
   1.214 +     * If it is, return FALSE.  This is necessary to ensure that the
   1.215 +     * variable range does not overlap characters used in a rule.
   1.216 +     * @param ch     the given character.
   1.217 +     * @return       True, if the given character is NOT within the variable range.
   1.218 +     */
   1.219 +    UBool checkVariableRange(UChar32 ch) const;
   1.220 +
   1.221 +    /**
   1.222 +     * Set the maximum backup to 'backup', in response to a pragma
   1.223 +     * statement.
   1.224 +     * @param backup    the new value to be set.
   1.225 +     */
   1.226 +    void pragmaMaximumBackup(int32_t backup);
   1.227 +
   1.228 +    /**
   1.229 +     * Begin normalizing all rules using the given mode, in response
   1.230 +     * to a pragma statement.
   1.231 +     * @param mode    the given mode.
   1.232 +     */
   1.233 +    void pragmaNormalizeRules(UNormalizationMode mode);
   1.234 +
   1.235 +    /**
   1.236 +     * Return true if the given rule looks like a pragma.
   1.237 +     * @param rule  the rule string to examine.
   1.238 +     * @param pos   offset to the first non-whitespace character of the rule.
   1.239 +     * @param limit index just past the last character of the rule.
   1.240 +     * @return true if the given rule looks like a pragma.
   1.241 +     */
   1.242 +    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
   1.243 +
   1.244 +    /**
   1.245 +     * Parse a pragma.  This method assumes resemblesPragma() has
   1.246 +     * already returned true.
   1.247 +     * @param rule  the rule string containing the pragma.
   1.248 +     * @param pos   offset to the first non-whitespace character of the rule.
   1.249 +     * @param limit index just past the last character of the rule.
   1.250 +     * @return the position index after the final ';' of the pragma,
   1.251 +     * or -1 on failure.
   1.252 +     */
   1.253 +    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
   1.254 +
   1.255 +    /**
   1.256 +     * Called by main parser upon syntax error.  Search the rule string
   1.257 +     * for the probable end of the rule.  Of course, if the error is that
   1.258 +     * the end of rule marker is missing, then the rule end will not be found.
   1.259 +     * In any case the rule start will be correctly reported.
   1.260 +     * @param parseErrorCode error code.
   1.261 +     * @param msg error description (this parameter is unnamed in the declaration).
   1.262 +     * @param start position of first character of current rule.
   1.263 +     * @return start position of first character of current rule.
   1.264 +     */
   1.265 +    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
   1.266 +                        UErrorCode& status);
   1.267 +
   1.268 +    /**
   1.269 +     * Parse a UnicodeSet out, store it, and return the stand-in character
   1.270 +     * used to represent it.
   1.271 +     *
   1.272 +     * @param rule    the rule for UnicodeSet.
   1.273 +     * @param pos     the position in pattern at which to start parsing.
   1.274 +     * @return        the stand-in character used to represent it.
   1.275 +     */
   1.276 +    UChar parseSet(const UnicodeString& rule,
   1.277 +                   ParsePosition& pos,
   1.278 +                   UErrorCode& status);
   1.279 +
   1.280 +    /**
   1.281 +     * Generate and return a stand-in for a new UnicodeFunctor.  Store
   1.282 +     * the matcher (adopt it).
   1.283 +     * @param adopted the UnicodeFunctor to be adopted.
   1.284 +     * @return        a stand-in for a new UnicodeFunctor.
   1.285 +     */
   1.286 +    UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
   1.287 +
   1.288 +    /**
   1.289 +     * Return the standin for segment seg (1-based).
   1.290 +     * @param seg    the given segment.
   1.291 +     * @return       the standIn character for the given segment.
   1.292 +     */
   1.293 +    UChar getSegmentStandin(int32_t seg, UErrorCode& status);
   1.294 +
   1.295 +    /**
   1.296 +     * Set the object for segment seg (1-based).
   1.297 +     * @param seg      the given segment.
   1.298 +     * @param adopted  the StringMatcher to be adopted.
   1.299 +     */
   1.300 +    void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
   1.301 +
   1.302 +    /**
   1.303 +     * Return the stand-in for the dot set.  It is allocated the first
   1.304 +     * time and reused thereafter.
   1.305 +     * @return    the stand-in for the dot set.
   1.306 +     */
   1.307 +    UChar getDotStandIn(UErrorCode& status);
   1.308 +
   1.309 +    /**
   1.310 +     * Append the value of the given variable name to the given
   1.311 +     * UnicodeString.
   1.312 +     * @param name    the variable name to be appended.
   1.313 +     * @param buf     the given UnicodeString to append to.
   1.314 +     */
   1.315 +    void appendVariableDef(const UnicodeString& name,
   1.316 +                           UnicodeString& buf,
   1.317 +                           UErrorCode& status);
   1.318 +
   1.319 +    /**
   1.320 +     * Glue method to get around access restrictions in C++ (currently commented out).
   1.321 +     */
   1.322 +    /*static Transliterator* createBasicInstance(const UnicodeString& id,
   1.323 +                                               const UnicodeString* canonID);*/
   1.324 +
   1.325 +    friend class RuleHalf;
   1.326 +
   1.327 +    // Disallowed methods; no impl.
   1.328 +    /**
   1.329 +     * Copy constructor (private and unimplemented; copying is disallowed).
   1.330 +     */
   1.331 +    TransliteratorParser(const TransliteratorParser&);
   1.332 +    
   1.333 +    /**
   1.334 +     * Assignment operator (private and unimplemented; assignment is disallowed).
   1.335 +     */
   1.336 +    TransliteratorParser& operator=(const TransliteratorParser&);
   1.337 +};
   1.338 +
   1.339 +U_NAMESPACE_END
   1.340 +
   1.341 +#endif /* #ifdef __cplusplus */
   1.342 +
   1.343 +/**
   1.344 + * Strip/convert the following from the transliterator rules:
   1.345 + * comments
   1.346 + * newlines
   1.347 + * white space at the beginning and end of a line
   1.348 + * unescape \u notation
   1.349 + *
   1.350 + * The target must be equal in size to the source.
   1.351 + * @internal
   1.352 + */
   1.353 +U_CAPI int32_t
   1.354 +utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
   1.355 +
   1.356 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.357 +
   1.358 +#endif

mercurial