michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 1999-2011, International Business Machines Corporation michael@0: * and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 11/17/99 aliu Creation. michael@0: ********************************************************************** michael@0: */ michael@0: #ifndef RBT_PARS_H michael@0: #define RBT_PARS_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: #ifdef __cplusplus michael@0: michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/parseerr.h" michael@0: #include "unicode/unorm.h" michael@0: #include "rbt.h" michael@0: #include "hash.h" michael@0: #include "uvector.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: class TransliterationRuleData; michael@0: class UnicodeFunctor; michael@0: class ParseData; michael@0: class RuleHalf; michael@0: class ParsePosition; michael@0: class StringMatcher; michael@0: michael@0: class TransliteratorParser : public UMemory { michael@0: michael@0: public: michael@0: michael@0: /** michael@0: * A Vector of TransliterationRuleData objects, one for each discrete group michael@0: * of rules in the rule set michael@0: */ michael@0: UVector dataVector; michael@0: michael@0: /** michael@0: * PUBLIC data member. michael@0: * A Vector of UnicodeStrings containing all of the ID blocks in the rule set michael@0: */ michael@0: UVector idBlockVector; michael@0: michael@0: /** michael@0: * PUBLIC data member containing the parsed compound filter, if any. michael@0: */ michael@0: UnicodeSet* compoundFilter; michael@0: michael@0: private: michael@0: michael@0: /** michael@0: * The current data object for which we are parsing rules michael@0: */ michael@0: TransliterationRuleData* curData; michael@0: michael@0: UTransDirection direction; michael@0: michael@0: /** michael@0: * Parse error information. michael@0: */ michael@0: UParseError parseError; michael@0: michael@0: /** michael@0: * Temporary symbol table used during parsing. michael@0: */ michael@0: ParseData* parseData; michael@0: michael@0: /** michael@0: * Temporary vector of matcher variables. When parsing is complete, this michael@0: * is copied into the array data.variables. As with data.variables, michael@0: * element 0 corresponds to character data.variablesBase. michael@0: */ michael@0: UVector variablesVector; michael@0: michael@0: /** michael@0: * Temporary table of variable names. When parsing is complete, this is michael@0: * copied into data.variableNames. michael@0: */ michael@0: Hashtable variableNames; michael@0: michael@0: /** michael@0: * String of standins for segments. Used during the parsing of a single michael@0: * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds michael@0: * to StringMatcher object segmentObjects.elementAt(0), etc. michael@0: */ michael@0: UnicodeString segmentStandins; michael@0: michael@0: /** michael@0: * Vector of StringMatcher objects for segments. Used during the michael@0: * parsing of a single rule. michael@0: * segmentStandins.charAt(0) is the standin for "$1" and corresponds michael@0: * to StringMatcher object segmentObjects.elementAt(0), etc. michael@0: */ michael@0: UVector segmentObjects; michael@0: michael@0: /** michael@0: * The next available stand-in for variables. This starts at some point in michael@0: * the private use area (discovered dynamically) and increments up toward michael@0: * variableLimit. At any point during parsing, available michael@0: * variables are variableNext..variableLimit-1. michael@0: */ michael@0: UChar variableNext; michael@0: michael@0: /** michael@0: * The last available stand-in for variables. This is discovered michael@0: * dynamically. At any point during parsing, available variables are michael@0: * variableNext..variableLimit-1. michael@0: */ michael@0: UChar variableLimit; michael@0: michael@0: /** michael@0: * When we encounter an undefined variable, we do not immediately signal michael@0: * an error, in case we are defining this variable, e.g., "$a = [a-z];". michael@0: * Instead, we save the name of the undefined variable, and substitute michael@0: * in the placeholder char variableLimit - 1, and decrement michael@0: * variableLimit. michael@0: */ michael@0: UnicodeString undefinedVariableName; michael@0: michael@0: /** michael@0: * The stand-in character for the 'dot' set, represented by '.' in michael@0: * patterns. This is allocated the first time it is needed, and michael@0: * reused thereafter. michael@0: */ michael@0: UChar dotStandIn; michael@0: michael@0: public: michael@0: michael@0: /** michael@0: * Constructor. michael@0: */ michael@0: TransliteratorParser(UErrorCode &statusReturn); michael@0: michael@0: /** michael@0: * Destructor. michael@0: */ michael@0: ~TransliteratorParser(); michael@0: michael@0: /** michael@0: * Parse the given string as a sequence of rules, separated by newline michael@0: * characters ('\n'), and cause this object to implement those rules. Any michael@0: * previous rules are discarded. Typically this method is called exactly michael@0: * once after construction. michael@0: * michael@0: * Parse the given rules, in the given direction. After this call michael@0: * returns, query the public data members for results. The caller michael@0: * owns the 'data' and 'compoundFilter' data members after this michael@0: * call returns. michael@0: * @param rules rules, separated by ';' michael@0: * @param direction either FORWARD or REVERSE. michael@0: * @param pe Struct to recieve information on position michael@0: * of error if an error is encountered michael@0: * @param ec Output param set to success/failure code. michael@0: */ michael@0: void parse(const UnicodeString& rules, michael@0: UTransDirection direction, michael@0: UParseError& pe, michael@0: UErrorCode& ec); michael@0: michael@0: /** michael@0: * Return the compound filter parsed by parse(). Caller owns result. michael@0: * @return the compound filter parsed by parse(). michael@0: */ michael@0: UnicodeSet* orphanCompoundFilter(); michael@0: michael@0: private: michael@0: michael@0: /** michael@0: * Return a representation of this transliterator as source rules. michael@0: * @param rules Output param to receive the rules. michael@0: * @param direction either FORWARD or REVERSE. michael@0: */ michael@0: void parseRules(const UnicodeString& rules, michael@0: UTransDirection direction, michael@0: UErrorCode& status); michael@0: michael@0: /** michael@0: * MAIN PARSER. Parse the next rule in the given rule string, starting michael@0: * at pos. Return the index after the last character parsed. Do not michael@0: * parse characters at or after limit. michael@0: * michael@0: * Important: The character at pos must be a non-whitespace character michael@0: * that is not the comment character. michael@0: * michael@0: * This method handles quoting, escaping, and whitespace removal. It michael@0: * parses the end-of-rule character. It recognizes context and cursor michael@0: * indicators. Once it does a lexical breakdown of the rule at pos, it michael@0: * creates a rule object and adds it to our rule list. michael@0: * @param rules Output param to receive the rules. michael@0: * @param pos the starting position. michael@0: * @param limit pointer past the last character of the rule. michael@0: * @return the index after the last character parsed. michael@0: */ michael@0: int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); michael@0: michael@0: /** michael@0: * Set the variable range to [start, end] (inclusive). michael@0: * @param start the start value of the range. michael@0: * @param end the end value of the range. michael@0: */ michael@0: void setVariableRange(int32_t start, int32_t end, UErrorCode& status); michael@0: michael@0: /** michael@0: * Assert that the given character is NOT within the variable range. michael@0: * If it is, return FALSE. This is neccesary to ensure that the michael@0: * variable range does not overlap characters used in a rule. michael@0: * @param ch the given character. michael@0: * @return True, if the given character is NOT within the variable range. michael@0: */ michael@0: UBool checkVariableRange(UChar32 ch) const; michael@0: michael@0: /** michael@0: * Set the maximum backup to 'backup', in response to a pragma michael@0: * statement. michael@0: * @param backup the new value to be set. michael@0: */ michael@0: void pragmaMaximumBackup(int32_t backup); michael@0: michael@0: /** michael@0: * Begin normalizing all rules using the given mode, in response michael@0: * to a pragma statement. michael@0: * @param mode the given mode. michael@0: */ michael@0: void pragmaNormalizeRules(UNormalizationMode mode); michael@0: michael@0: /** michael@0: * Return true if the given rule looks like a pragma. michael@0: * @param pos offset to the first non-whitespace character michael@0: * of the rule. michael@0: * @param limit pointer past the last character of the rule. michael@0: * @return true if the given rule looks like a pragma. michael@0: */ michael@0: static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); michael@0: michael@0: /** michael@0: * Parse a pragma. This method assumes resemblesPragma() has michael@0: * already returned true. michael@0: * @param pos offset to the first non-whitespace character michael@0: * of the rule. michael@0: * @param limit pointer past the last character of the rule. michael@0: * @return the position index after the final ';' of the pragma, michael@0: * or -1 on failure. michael@0: */ michael@0: int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); michael@0: michael@0: /** michael@0: * Called by main parser upon syntax error. Search the rule string michael@0: * for the probable end of the rule. Of course, if the error is that michael@0: * the end of rule marker is missing, then the rule end will not be found. michael@0: * In any case the rule start will be correctly reported. michael@0: * @param parseErrorCode error code. michael@0: * @param msg error description. michael@0: * @param start position of first character of current rule. michael@0: * @return start position of first character of current rule. michael@0: */ michael@0: int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, michael@0: UErrorCode& status); michael@0: michael@0: /** michael@0: * Parse a UnicodeSet out, store it, and return the stand-in character michael@0: * used to represent it. michael@0: * michael@0: * @param rule the rule for UnicodeSet. michael@0: * @param pos the position in pattern at which to start parsing. michael@0: * @return the stand-in character used to represent it. michael@0: */ michael@0: UChar parseSet(const UnicodeString& rule, michael@0: ParsePosition& pos, michael@0: UErrorCode& status); michael@0: michael@0: /** michael@0: * Generate and return a stand-in for a new UnicodeFunctor. Store michael@0: * the matcher (adopt it). michael@0: * @param adopted the UnicodeFunctor to be adopted. michael@0: * @return a stand-in for a new UnicodeFunctor. michael@0: */ michael@0: UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); michael@0: michael@0: /** michael@0: * Return the standin for segment seg (1-based). michael@0: * @param seg the given segment. michael@0: * @return the standIn character for the given segment. michael@0: */ michael@0: UChar getSegmentStandin(int32_t seg, UErrorCode& status); michael@0: michael@0: /** michael@0: * Set the object for segment seg (1-based). michael@0: * @param seg the given segment. michael@0: * @param adopted the StringMatcher to be adopted. michael@0: */ michael@0: void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); michael@0: michael@0: /** michael@0: * Return the stand-in for the dot set. It is allocated the first michael@0: * time and reused thereafter. michael@0: * @return the stand-in for the dot set. michael@0: */ michael@0: UChar getDotStandIn(UErrorCode& status); michael@0: michael@0: /** michael@0: * Append the value of the given variable name to the given michael@0: * UnicodeString. michael@0: * @param name the variable name to be appended. michael@0: * @param buf the given UnicodeString to append to. michael@0: */ michael@0: void appendVariableDef(const UnicodeString& name, michael@0: UnicodeString& buf, michael@0: UErrorCode& status); michael@0: michael@0: /** michael@0: * Glue method to get around access restrictions in C++. michael@0: */ michael@0: /*static Transliterator* createBasicInstance(const UnicodeString& id, michael@0: const UnicodeString* canonID);*/ michael@0: michael@0: friend class RuleHalf; michael@0: michael@0: // Disallowed methods; no impl. michael@0: /** michael@0: * Copy constructor michael@0: */ michael@0: TransliteratorParser(const TransliteratorParser&); michael@0: michael@0: /** michael@0: * Assignment operator michael@0: */ michael@0: TransliteratorParser& operator=(const TransliteratorParser&); michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #ifdef __cplusplus */ michael@0: michael@0: /** michael@0: * Strip/convert the following from the transliterator rules: michael@0: * comments michael@0: * newlines michael@0: * white space at the beginning and end of a line michael@0: * unescape \u notation michael@0: * michael@0: * The target must be equal in size as the source. michael@0: * @internal michael@0: */ michael@0: U_CAPI int32_t michael@0: utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */ michael@0: michael@0: #endif