The Tor Browser: intl/icu/source/i18n/rbt

     1 /*

     2 **********************************************************************

     3 * Copyright (C) 1999-2011, International Business Machines Corporation

     4 * and others. All Rights Reserved.

     5 **********************************************************************

     6 *   Date        Name        Description

     7 *   11/17/99    aliu        Creation.

     8 **********************************************************************

     9 */

    10 #ifndef RBT_PARS_H

    11 #define RBT_PARS_H

    13 #include "unicode/utypes.h"

    15 #if !UCONFIG_NO_TRANSLITERATION

    16 #ifdef __cplusplus

    18 #include "unicode/uobject.h"

    19 #include "unicode/parseerr.h"

    20 #include "unicode/unorm.h"

    21 #include "rbt.h"

    22 #include "hash.h"

    23 #include "uvector.h"

    25 U_NAMESPACE_BEGIN

    27 class TransliterationRuleData; // forward declarations: used below only via pointer, reference, or friend

    28 class UnicodeFunctor;

    29 class ParseData;

    30 class RuleHalf;

    31 class ParsePosition;

    32 class StringMatcher;

    34 class TransliteratorParser : public UMemory {

    36  public:

    38     /**

    39      * PUBLIC data member.  A Vector of TransliterationRuleData objects, one for

    40      * each discrete group of rules in the rule set.

    41      */

    42     UVector dataVector;

    44     /**

    45      * PUBLIC data member.

    46      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set

    47      */

    48     UVector idBlockVector;

    50     /**

    51      * PUBLIC data member containing the parsed compound filter, if any.

    52      */

    53     UnicodeSet* compoundFilter;

    55  private:

    57     /**

    58      * The current data object for which we are parsing rules

    59      */

    60     TransliterationRuleData* curData;

    62     UTransDirection direction; // NOTE(review): presumably the direction passed to parse() -- confirm

    64     /**

    65      * Parse error information.

    66      */

    67     UParseError parseError;

    69     /**

    70      * Temporary symbol table used during parsing.

    71      */

    72     ParseData* parseData;

    74     /**

    75      * Temporary vector of matcher variables.  When parsing is complete, this

    76      * is copied into the array data.variables.  As with data.variables,

    77      * element 0 corresponds to character data.variablesBase.

    78      */

    79     UVector variablesVector;

    81     /**

    82      * Temporary table of variable names.  When parsing is complete, this is

    83      * copied into data.variableNames.

    84      */

    85     Hashtable variableNames;

    87     /**

    88      * String of standins for segments.  Used during the parsing of a single

    89      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds

    90      * to StringMatcher object segmentObjects.elementAt(0), etc.

    91      */

    92     UnicodeString segmentStandins;

    94     /**

    95      * Vector of StringMatcher objects for segments.  Used during the

    96      * parsing of a single rule.

    97      * segmentStandins.charAt(0) is the standin for "$1" and corresponds

    98      * to StringMatcher object segmentObjects.elementAt(0), etc.

    99      */

   100     UVector segmentObjects;

   102     /**

   103      * The next available stand-in for variables.  This starts at some point in

   104      * the private use area (discovered dynamically) and increments up toward

   105      * <code>variableLimit</code>.  At any point during parsing, available

   106      * variables are <code>variableNext..variableLimit-1</code>.

   107      */

   108     UChar variableNext;

   110     /**

   111      * The limit (exclusive) of the stand-ins for variables.  This is discovered

   112      * dynamically.  At any point during parsing, available variables are

   113      * <code>variableNext..variableLimit-1</code>.

   114      */

   115     UChar variableLimit;

   117     /**

   118      * When we encounter an undefined variable, we do not immediately signal

   119      * an error, in case we are defining this variable, e.g., "$a = [a-z];".

   120      * Instead, we save the name of the undefined variable, and substitute

   121      * in the placeholder char variableLimit - 1, and decrement

   122      * variableLimit.

   123      */

   124     UnicodeString undefinedVariableName;

   126     /**

   127      * The stand-in character for the 'dot' set, represented by '.' in

   128      * patterns.  This is allocated the first time it is needed, and

   129      * reused thereafter.

   130      */

   131     UChar dotStandIn;

   133 public:

   135     /**

   136      * Constructor.

   137      */

   138     TransliteratorParser(UErrorCode &statusReturn);

   140     /**

   141      * Destructor.

   142      */

   143     ~TransliteratorParser();

   145     /**

   146      * Parse the given string as a sequence of rules, separated by newline

   147      * characters ('\n'), and cause this object to implement those rules.  Any

   148      * previous rules are discarded.  Typically this method is called exactly

   149      * once after construction.

   150      * NOTE(review): the text above says newline-separated, but @param rules says ';' -- confirm.

   151      * Parse the given rules, in the given direction.  After this call

   152      * returns, query the public data members for results.  The caller

   153      * owns the 'data' and 'compoundFilter' data members after this

   154      * call returns.

   155      * @param rules      rules, separated by ';'

   156      * @param direction  either FORWARD or REVERSE.

   157      * @param pe         Struct to receive information on position

   158      *                   of error if an error is encountered

   159      * @param ec         Output param set to success/failure code.

   160      */

   161     void parse(const UnicodeString& rules,

   162                UTransDirection direction,

   163                UParseError& pe,

   164                UErrorCode& ec);

   166     /**

   167      * Return the compound filter parsed by parse().  Caller owns result.

   168      * @return the compound filter parsed by parse().

   169      */

   170     UnicodeSet* orphanCompoundFilter();

   172 private:

   174     /**

   175      * NOTE(review): presumably parses 'rules' in the given direction; confirm against the implementation.

   176      * @param rules      the rules (input: passed by const reference).

   177      * @param direction  either FORWARD or REVERSE.

   178      */

   179     void parseRules(const UnicodeString& rules,

   180                     UTransDirection direction,

   181                     UErrorCode& status);

   183     /**

   184      * MAIN PARSER.  Parse the next rule in the given rule string, starting

   185      * at pos.  Return the index after the last character parsed.  Do not

   186      * parse characters at or after limit.

   187      *

   188      * Important:  The character at pos must be a non-whitespace character

   189      * that is not the comment character.

   190      *

   191      * This method handles quoting, escaping, and whitespace removal.  It

   192      * parses the end-of-rule character.  It recognizes context and cursor

   193      * indicators.  Once it does a lexical breakdown of the rule at pos, it

   194      * creates a rule object and adds it to our rule list.

   195      * @param rule       the rule string (input: passed by const reference).

   196      * @param pos        the starting position.

   197      * @param limit      index past the last character of the rule.

   198      * @return           the index after the last character parsed.

   199      */

   200     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

   202     /**

   203      * Set the variable range to [start, end] (inclusive).

   204      * @param start    the start value of the range.

   205      * @param end      the end value of the range.

   206      */

   207     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

   209     /**

   210      * Assert that the given character is NOT within the variable range.

   211      * If it is, return FALSE.  This is necessary to ensure that the

   212      * variable range does not overlap characters used in a rule.

   213      * @param ch     the given character.

   214      * @return       True, if the given character is NOT within the variable range.

   215      */

   216     UBool checkVariableRange(UChar32 ch) const;

   218     /**

   219      * Set the maximum backup to 'backup', in response to a pragma

   220      * statement.

   221      * @param backup    the new value to be set.

   222      */

   223     void pragmaMaximumBackup(int32_t backup);

   225     /**

   226      * Begin normalizing all rules using the given mode, in response

   227      * to a pragma statement.

   228      * @param mode    the given mode.

   229      */

   230     void pragmaNormalizeRules(UNormalizationMode mode);

   232     /**

   233      * Return true if the given rule looks like a pragma.

   234      * @param pos offset to the first non-whitespace character

   235      * of the rule.

   236      * @param limit index past the last character of the rule.

   237      * @return true if the given rule looks like a pragma.

   238      */

   239     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

   241     /**

   242      * Parse a pragma.  This method assumes resemblesPragma() has

   243      * already returned true.

   244      * @param pos offset to the first non-whitespace character

   245      * of the rule.

   246      * @param limit index past the last character of the rule.

   247      * @return the position index after the final ';' of the pragma,

   248      * or -1 on failure.

   249      */

   250     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

   252     /**

   253      * Called by main parser upon syntax error.  Search the rule string

   254      * for the probable end of the rule.  Of course, if the error is that

   255      * the end of rule marker is missing, then the rule end will not be found.

   256      * In any case the rule start will be correctly reported.

   257      * @param parseErrorCode error code.

   258      * @param msg error description (this parameter is unnamed in the declaration below).

   259      * @param start position of first character of current rule.

   260      * @return start position of first character of current rule.

   261      */

   262     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,

   263                         UErrorCode& status);

   265     /**

   266      * Parse a UnicodeSet out, store it, and return the stand-in character

   267      * used to represent it.

   268      *

   269      * @param rule    the rule for UnicodeSet.

   270      * @param pos     the position in pattern at which to start parsing.

   271      * @return        the stand-in character used to represent it.

   272      */

   273     UChar parseSet(const UnicodeString& rule,

   274                    ParsePosition& pos,

   275                    UErrorCode& status);

   277     /**

   278      * Generate and return a stand-in for a new UnicodeFunctor.  Store

   279      * the matcher (adopt it).

   280      * @param adopted the UnicodeFunctor to be adopted.

   281      * @return        a stand-in for a new UnicodeFunctor.

   282      */

   283     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

   285     /**

   286      * Return the standin for segment seg (1-based).

   287      * @param seg    the given segment.

   288      * @return       the standIn character for the given segment.

   289      */

   290     UChar getSegmentStandin(int32_t seg, UErrorCode& status);

   292     /**

   293      * Set the object for segment seg (1-based).

   294      * @param seg      the given segment.

   295      * @param adopted  the StringMatcher to be adopted.

   296      */

   297     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

   299     /**

   300      * Return the stand-in for the dot set.  It is allocated the first

   301      * time and reused thereafter.

   302      * @return    the stand-in for the dot set.

   303      */

   304     UChar getDotStandIn(UErrorCode& status);

   306     /**

   307      * Append the value of the given variable name to the given

   308      * UnicodeString.

   309      * @param name    the variable name to be appended.

   310      * @param buf     the given UnicodeString to append to.

   311      */

   312     void appendVariableDef(const UnicodeString& name,

   313                            UnicodeString& buf,

   314                            UErrorCode& status);

   316     /**

   317      * Glue method to get around access restrictions in C++.  (The declaration below is commented out.)

   318      */

   319     /*static Transliterator* createBasicInstance(const UnicodeString& id,

   320                                                const UnicodeString* canonID);*/

   322     friend class RuleHalf;

   324     // Disallowed methods; no impl.

   325     /**

   326      * Copy constructor

   327      */

   328     TransliteratorParser(const TransliteratorParser&);

   330     /**

   331      * Assignment operator

   332      */

   333     TransliteratorParser& operator=(const TransliteratorParser&);

   334 };

   336 U_NAMESPACE_END

   338 #endif /* #ifdef __cplusplus */

   340 /**

   341  * Strip/convert the following from the transliterator rules:

   342  * comments

   343  * newlines

   344  * white space at the beginning and end of a line

   345  * unescape \u notation

   346  *

   347  * The target must be the same size as the source.

   348  * @internal

   349  */

   350 U_CAPI int32_t

   351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

   353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

   355 #endif

The Tor Browser / file revision

intl/icu/source/i18n/rbt_pars.h@fc2d59ddac77

intl/icu/source/i18n/rbt_pars.h