intl/icu/source/i18n/rbt_pars.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 **********************************************************************
     3 * Copyright (C) 1999-2011, International Business Machines Corporation
     4 * and others. All Rights Reserved.
     5 **********************************************************************
     6 *   Date        Name        Description
     7 *   11/17/99    aliu        Creation.
     8 **********************************************************************
     9 */
    10 #ifndef RBT_PARS_H
    11 #define RBT_PARS_H
    13 #include "unicode/utypes.h"
    15 #if !UCONFIG_NO_TRANSLITERATION
    16 #ifdef __cplusplus
    18 #include "unicode/uobject.h"
    19 #include "unicode/parseerr.h"
    20 #include "unicode/unorm.h"
    21 #include "rbt.h"
    22 #include "hash.h"
    23 #include "uvector.h"
    25 U_NAMESPACE_BEGIN
    27 class TransliterationRuleData;
    28 class UnicodeFunctor;
    29 class ParseData;
    30 class RuleHalf;
    31 class ParsePosition;
    32 class StringMatcher;
    34 class TransliteratorParser : public UMemory {
    36  public:
    38     /**
    39      * PUBLIC data member.  A Vector of TransliterationRuleData objects, one
    40      * for each discrete group of rules in the rule set.
    41      */
    42     UVector dataVector;
    44     /**
    45      * PUBLIC data member.
    46      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
    47      */
    48     UVector idBlockVector;
    50     /**
    51      * PUBLIC data member containing the parsed compound filter, if any.
    52      */
    53     UnicodeSet* compoundFilter;
    55  private:
    57     /**
    58      * The current data object for which we are parsing rules.
    59      */
    60     TransliterationRuleData* curData;
    62     UTransDirection direction;
    64     /**
    65      * Parse error information.
    66      */
    67     UParseError parseError;
    69     /**
    70      * Temporary symbol table used during parsing.
    71      */
    72     ParseData* parseData;
    74     /**
    75      * Temporary vector of matcher variables.  When parsing is complete, this
    76      * is copied into the array data.variables.  As with data.variables,
    77      * element 0 corresponds to character data.variablesBase.
    78      */
    79     UVector variablesVector;
    81     /**
    82      * Temporary table of variable names.  When parsing is complete, this is
    83      * copied into data.variableNames.
    84      */
    85     Hashtable variableNames;    
    87     /**
    88      * String of standins for segments.  Used during the parsing of a single
    89      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
    90      * to StringMatcher object segmentObjects.elementAt(0), etc.
    91      */
    92     UnicodeString segmentStandins;
    94     /**
    95      * Vector of StringMatcher objects for segments.  Used during the
    96      * parsing of a single rule.
    97      * segmentStandins.charAt(0) is the standin for "$1" and corresponds
    98      * to StringMatcher object segmentObjects.elementAt(0), etc.
    99      */
   100     UVector segmentObjects;
   102     /**
   103      * The next available stand-in for variables.  This starts at some point in
   104      * the private use area (discovered dynamically) and increments up toward
   105      * <code>variableLimit</code>.  At any point during parsing, available
   106      * variables are <code>variableNext..variableLimit-1</code>.
   107      */
   108     UChar variableNext;
   110     /**
   111      * The last available stand-in for variables.  This is discovered
   112      * dynamically.  At any point during parsing, available variables are
   113      * <code>variableNext..variableLimit-1</code>.
   114      */
   115     UChar variableLimit;
   117     /**
   118      * When we encounter an undefined variable, we do not immediately signal
   119      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
   120      * Instead, we save the name of the undefined variable, and substitute
   121      * in the placeholder char variableLimit - 1, and decrement
   122      * variableLimit.
   123      */
   124     UnicodeString undefinedVariableName;
   126     /**
   127      * The stand-in character for the 'dot' set, represented by '.' in
   128      * patterns.  This is allocated the first time it is needed, and
   129      * reused thereafter.
   130      */
   131     UChar dotStandIn;
   133 public:
   135     /**
   136      * Constructor.
   137      */
   138     TransliteratorParser(UErrorCode &statusReturn);
   140     /**
   141      * Destructor.
   142      */
   143     ~TransliteratorParser();
   145     /**
   146      * Parse the given string as a sequence of rules, separated by newline
   147      * characters ('\n'), and cause this object to implement those rules.  Any
   148      * previous rules are discarded.  Typically this method is called exactly
   149      * once after construction.
   150      *
   151      * Parse the given rules, in the given direction.  After this call
   152      * returns, query the public data members for results.  The caller
   153      * owns the 'data' and 'compoundFilter' data members after this
   154      * call returns.  NOTE(review): no member named 'data' is declared in this class; presumably 'dataVector' is meant -- confirm.
   155      * @param rules      rules, separated by ';'
   156      * @param direction  either FORWARD or REVERSE.
   157      * @param pe         Struct to receive information on the position
   158      *                   of the error, if an error is encountered.
   159      * @param ec         Output param set to success/failure code.
   160      */
   161     void parse(const UnicodeString& rules,
   162                UTransDirection direction,
   163                UParseError& pe,
   164                UErrorCode& ec);
   166     /**
   167      * Return the compound filter parsed by parse().  Caller owns result.
   168      * @return the compound filter parsed by parse().
   169      */ 
   170     UnicodeSet* orphanCompoundFilter();
   172 private:
   174     /**
   175      * Parse the given rules in the given direction.  NOTE(review): presumably the worker called by parse(); confirm in the implementation.
   176      * @param rules      the rules to parse (input; passed by const reference).
   177      * @param direction  either FORWARD or REVERSE.
   178      */
   179     void parseRules(const UnicodeString& rules,
   180                     UTransDirection direction,
   181                     UErrorCode& status);
   183     /**
   184      * MAIN PARSER.  Parse the next rule in the given rule string, starting
   185      * at pos.  Return the index after the last character parsed.  Do not
   186      * parse characters at or after limit.
   187      *
   188      * Important:  The character at pos must be a non-whitespace character
   189      * that is not the comment character.
   190      *
   191      * This method handles quoting, escaping, and whitespace removal.  It
   192      * parses the end-of-rule character.  It recognizes context and cursor
   193      * indicators.  Once it does a lexical breakdown of the rule at pos, it
   194      * creates a rule object and adds it to our rule list.
   195      * @param rule       the rule string being parsed (input; passed by const reference).
   196      * @param pos        the starting position.
   197      * @param limit      pointer past the last character of the rule.
   198      * @return           the index after the last character parsed.
   199      */
   200     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
   202     /**
   203      * Set the variable range to [start, end] (inclusive).
   204      * @param start    the start value of the range.
   205      * @param end      the end value of the range.
   206      */
   207     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
   209     /**
   210      * Assert that the given character is NOT within the variable range.
   211      * If it is, return FALSE.  This is necessary to ensure that the
   212      * variable range does not overlap characters used in a rule.
   213      * @param ch     the given character.
   214      * @return       True, if the given character is NOT within the variable range.
   215      */
   216     UBool checkVariableRange(UChar32 ch) const;
   218     /**
   219      * Set the maximum backup to 'backup', in response to a pragma
   220      * statement.
   221      * @param backup    the new value to be set.
   222      */
   223     void pragmaMaximumBackup(int32_t backup);
   225     /**
   226      * Begin normalizing all rules using the given mode, in response
   227      * to a pragma statement.
   228      * @param mode    the given mode.
   229      */
   230     void pragmaNormalizeRules(UNormalizationMode mode);
   232     /**
   233      * Return true if the given rule looks like a pragma.
   234      * @param pos offset to the first non-whitespace character
   235      * of the rule.
   236      * @param limit pointer past the last character of the rule.
   237      * @return true if the given rule looks like a pragma.
   238      */
   239     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
   241     /**
   242      * Parse a pragma.  This method assumes resemblesPragma() has
   243      * already returned true.
   244      * @param pos offset to the first non-whitespace character
   245      * of the rule.
   246      * @param limit pointer past the last character of the rule.
   247      * @return the position index after the final ';' of the pragma,
   248      * or -1 on failure.
   249      */
   250     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
   252     /**
   253      * Called by main parser upon syntax error.  Search the rule string
   254      * for the probable end of the rule.  Of course, if the error is that
   255      * the end of rule marker is missing, then the rule end will not be found.
   256      * In any case the rule start will be correctly reported.
   257      * @param parseErrorCode error code.
   258      * @param msg error description (the parameter is unnamed in this declaration).
   259      * @param start position of first character of current rule.
   260      * @return start position of first character of current rule.
   261      */
   262     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
   263                         UErrorCode& status);
   265     /**
   266      * Parse a UnicodeSet out, store it, and return the stand-in character
   267      * used to represent it.
   268      *
   269      * @param rule    the rule for UnicodeSet.
   270      * @param pos     the position in pattern at which to start parsing.
   271      * @return        the stand-in character used to represent it.
   272      */
   273     UChar parseSet(const UnicodeString& rule,
   274                    ParsePosition& pos,
   275                    UErrorCode& status);
   277     /**
   278      * Generate and return a stand-in for a new UnicodeFunctor.  Store
   279      * the matcher (adopt it).
   280      * @param adopted the UnicodeFunctor to be adopted.
   281      * @return        a stand-in for a new UnicodeFunctor.
   282      */
   283     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
   285     /**
   286      * Return the standin for segment seg (1-based).
   287      * @param seg    the given segment.
   288      * @return       the standIn character for the given segment.
   289      */
   290     UChar getSegmentStandin(int32_t seg, UErrorCode& status);
   292     /**
   293      * Set the object for segment seg (1-based).
   294      * @param seg      the given segment.
   295      * @param adopted  the StringMatcher to be adopted.
   296      */
   297     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
   299     /**
   300      * Return the stand-in for the dot set.  It is allocated the first
   301      * time and reused thereafter.
   302      * @return    the stand-in for the dot set.
   303      */
   304     UChar getDotStandIn(UErrorCode& status);
   306     /**
   307      * Append the value of the given variable name to the given
   308      * UnicodeString.
   309      * @param name    the variable name to be appended.
   310      * @param buf     the given UnicodeString to append to.
   311      */
   312     void appendVariableDef(const UnicodeString& name,
   313                            UnicodeString& buf,
   314                            UErrorCode& status);
   316     /**
   317      * Glue method to get around access restrictions in C++.
   318      */
   319     /*static Transliterator* createBasicInstance(const UnicodeString& id,
   320                                                const UnicodeString* canonID);*/
   322     friend class RuleHalf;
   324     // Disallowed methods; no impl.
   325     /**
   326      * Copy constructor
   327      */
   328     TransliteratorParser(const TransliteratorParser&);
   330     /**
   331      * Assignment operator
   332      */
   333     TransliteratorParser& operator=(const TransliteratorParser&);
   334 };
   336 U_NAMESPACE_END
   338 #endif /* #ifdef __cplusplus */
   340 /**
   341  * Strip or convert the following in the transliterator rules:
   342  * comments
   343  * newlines
   344  * white space at the beginning and end of a line
   345  * \u notation (unescaped)
   346  *
   347  * The target buffer must be equal in size to the source.
   348  * @internal
   349  */
   350 U_CAPI int32_t
   351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
   353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   355 #endif

mercurial