intl/icu/source/i18n/rbt_pars.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/rbt_pars.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1738 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (C) 1999-2011, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + *   Date        Name        Description
    1.10 + *   11/17/99    aliu        Creation.
    1.11 + **********************************************************************
    1.12 + */
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "unicode/uobject.h"
    1.19 +#include "unicode/parseerr.h"
    1.20 +#include "unicode/parsepos.h"
    1.21 +#include "unicode/putil.h"
    1.22 +#include "unicode/uchar.h"
    1.23 +#include "unicode/ustring.h"
    1.24 +#include "unicode/uniset.h"
    1.25 +#include "unicode/utf16.h"
    1.26 +#include "cstring.h"
    1.27 +#include "funcrepl.h"
    1.28 +#include "hash.h"
    1.29 +#include "quant.h"
    1.30 +#include "rbt.h"
    1.31 +#include "rbt_data.h"
    1.32 +#include "rbt_pars.h"
    1.33 +#include "rbt_rule.h"
    1.34 +#include "strmatch.h"
    1.35 +#include "strrepl.h"
    1.36 +#include "unicode/symtable.h"
    1.37 +#include "tridpars.h"
    1.38 +#include "uvector.h"
    1.39 +#include "hash.h"
    1.40 +#include "patternprops.h"
    1.41 +#include "util.h"
    1.42 +#include "cmemory.h"
    1.43 +#include "uprops.h"
    1.44 +#include "putilimp.h"
    1.45 +
    1.46 +// Rule operators.  Every constant below is a single UChar code unit.
    1.47 +#define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/
    1.48 +#define FORWARD_RULE_OP ((UChar)0x003E) /*>*/
    1.49 +#define REVERSE_RULE_OP ((UChar)0x003C) /*<*/
    1.50 +#define FWDREV_RULE_OP  ((UChar)0x007E) /*~*/ // internal rep of <> op
    1.51 +
    1.52 +// Other special characters
    1.53 +#define QUOTE             ((UChar)0x0027) /*'*/
    1.54 +#define ESCAPE            ((UChar)0x005C) /*\*/
    1.55 +#define END_OF_RULE       ((UChar)0x003B) /*;*/
    1.56 +#define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/
    1.57 +// Segment, context, cursor, anchor and quantifier markers
    1.58 +#define SEGMENT_OPEN       ((UChar)0x0028) /*(*/
    1.59 +#define SEGMENT_CLOSE      ((UChar)0x0029) /*)*/
    1.60 +#define CONTEXT_ANTE       ((UChar)0x007B) /*{*/
    1.61 +#define CONTEXT_POST       ((UChar)0x007D) /*}*/
    1.62 +#define CURSOR_POS         ((UChar)0x007C) /*|*/
    1.63 +#define CURSOR_OFFSET      ((UChar)0x0040) /*@*/
    1.64 +#define ANCHOR_START       ((UChar)0x005E) /*^*/
    1.65 +#define KLEENE_STAR        ((UChar)0x002A) /***/ // '*', zero or more
    1.66 +#define ONE_OR_MORE        ((UChar)0x002B) /*+*/
    1.67 +#define ZERO_OR_ONE        ((UChar)0x003F) /*?*/
    1.68 +
    1.69 +#define DOT                ((UChar)46)     /*.*/ // 46 == 0x002E
    1.70 +
    1.71 +static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]" with \r and \n spelled as backslash escapes; NUL-terminated
    1.72 +    91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,
    1.73 +    108, 58, 93, 92, 114, 92, 110, 36, 93, 0
    1.74 +};
    1.75 +
    1.76 +// A function is denoted &Source-Target/Variant(text)
    1.77 +#define FUNCTION           ((UChar)38)     /*&*/ // 38 == 0x0026
    1.78 +
    1.79 +// Aliases for some of the syntax characters. These are provided so
    1.80 +// transliteration rules can be expressed in XML without clashing with
    1.81 +// XML syntax characters '<', '>', and '&'.
    1.82 +#define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow; alias for '<'
    1.83 +#define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow; alias for '>'
    1.84 +#define ALT_FWDREV_RULE_OP  ((UChar)0x2194) // Left Right Arrow; alias for "<>"
    1.85 +#define ALT_FUNCTION        ((UChar)0x2206) // Increment (~Greek Capital Delta); alias for '&'
    1.86 +
    1.87 +// Special characters disallowed at the top level (NUL-terminated, as are the two lists below)
    1.88 +static const UChar ILLEGAL_TOP[] = {41,0}; // ")"
    1.89 +
    1.90 +// Special characters disallowed within a segment
    1.91 +static const UChar ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}|@"
    1.92 +
    1.93 +// Special characters disallowed within a function argument
    1.94 +static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@"
    1.95 +
    1.96 +// By definition, the ANCHOR_END special character is a
    1.97 +// trailing SymbolTable.SYMBOL_REF character.
    1.98 +// private static final char ANCHOR_END       = '$';
    1.99 +
   1.100 +static const UChar gOPERATORS[] = { // "=><" plus the arrow aliases U+2192, U+2190, U+2194
   1.101 +    VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
   1.102 +    ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
   1.103 +    0
   1.104 +};
   1.105 +
   1.106 +static const UChar HALF_ENDERS[] = { // "=><", the arrow aliases U+2192, U+2190, U+2194, and ";"
   1.107 +    VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
   1.108 +    ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
   1.109 +    END_OF_RULE,
   1.110 +    0 // NUL terminator; parseSection() searches this table with u_strchr()
   1.111 +};
   1.112 +
   1.113 +// These are also used in Transliterator::toRules()
   1.114 +static const int32_t ID_TOKEN_LEN = 2;
   1.115 +static const UChar   ID_TOKEN[]   = { 0x3A, 0x3A }; // ':', ':' (no NUL terminator; length is ID_TOKEN_LEN)
   1.116 +
   1.117 +/*
   1.118 +commented out until we do real ::BEGIN/::END functionality
   1.119 +static const int32_t BEGIN_TOKEN_LEN = 5;
   1.120 +static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'
   1.121 +
   1.122 +static const int32_t END_TOKEN_LEN = 3;
   1.123 +static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
   1.124 +*/
   1.125 +
   1.126 +U_NAMESPACE_BEGIN
   1.127 +
   1.128 +//----------------------------------------------------------------------
   1.129 +// BEGIN ParseData
   1.130 +//----------------------------------------------------------------------
   1.131 +
   1.132 +/**
   1.133 + * This class implements the SymbolTable interface.  It is used
   1.134 + * during parsing to give UnicodeSet access to variables that
   1.135 + * have been defined so far.  Note that it uses variablesVector,
   1.136 + * _not_ data.setVariables.
   1.137 + */
   1.138 +class ParseData : public UMemory, public SymbolTable {
   1.139 +public:
   1.140 +    const TransliterationRuleData* data; // alias; supplies variablesBase
   1.141 +    // Stand-in functors defined so far, indexed by (ch - data->variablesBase)
   1.142 +    const UVector* variablesVector; // alias
   1.143 +    // Table queried by lookup() with a variable name
   1.144 +    const Hashtable* variableNames; // alias
   1.145 +
   1.146 +    ParseData(const TransliterationRuleData* data = 0,
   1.147 +              const UVector* variablesVector = 0,
   1.148 +              const Hashtable* variableNames = 0);
   1.149 +
   1.150 +    virtual ~ParseData();
   1.151 +    // SymbolTable API: fetch the entry stored under a variable name
   1.152 +    virtual const UnicodeString* lookup(const UnicodeString& s) const;
   1.153 +    // SymbolTable API: map a stand-in character to its entry in variablesVector
   1.154 +    virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
   1.155 +    // SymbolTable API: parse a symbol reference name starting at pos
   1.156 +    virtual UnicodeString parseReference(const UnicodeString& text,
   1.157 +                                         ParsePosition& pos, int32_t limit) const;
   1.158 +    /**
   1.159 +     * Return true if the given character is a matcher standin or a plain
   1.160 +     * character (non standin).
   1.161 +     */
   1.162 +    UBool isMatcher(UChar32 ch);
   1.163 +
   1.164 +    /**
   1.165 +     * Return true if the given character is a replacer standin or a plain
   1.166 +     * character (non standin).
   1.167 +     */
   1.168 +    UBool isReplacer(UChar32 ch);
   1.169 +
   1.170 +private:
   1.171 +    ParseData(const ParseData &other); // forbid copying of this class
   1.172 +    ParseData &operator=(const ParseData &other); // forbid copying of this class
   1.173 +};
   1.174 +
   1.175 +ParseData::ParseData(const TransliterationRuleData* ruleData,
   1.176 +                     const UVector* standIns,
   1.177 +                     const Hashtable* names)
   1.178 +    : data(ruleData), variablesVector(standIns), variableNames(names) {} // pointers are stored as-is
   1.179 +
   1.180 +ParseData::~ParseData() {} // empty: none of the aliased pointers is deleted here
   1.181 +
   1.182 +/** Implement SymbolTable API: fetch the entry stored under name. */
   1.183 +const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
   1.184 +    // variableNames is dereferenced unconditionally, so it must be set.
   1.185 +    void* entry = variableNames->get(name);
   1.186 +    return static_cast<const UnicodeString*>(entry);
   1.187 +}
   1.188 +
   1.189 +/**
   1.190 + * Implement SymbolTable API.  Map a stand-in character to the functor
   1.191 + * stored for it in variablesVector; return NULL if ch is out of range.
   1.192 + */
   1.193 +const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const {
   1.194 +    // Note that we cannot use data.lookupSet() because the
   1.195 +    // set array has not been constructed yet.
   1.196 +    // Index of ch relative to the first stand-in; a single range
   1.197 +    // check guards the vector access.
   1.198 +    const int32_t i = ch - data->variablesBase;
   1.199 +    if (i < 0 || i >= variablesVector->size()) {
   1.200 +        return NULL;
   1.201 +    }
   1.202 +    return static_cast<const UnicodeFunctor*>(variablesVector->elementAt(i));
   1.203 +}
   1.204 +
   1.205 +/**
   1.206 + * Implement SymbolTable API.  Scan a symbol reference name that begins
   1.207 + * at pos and ends before limit.  On success pos is moved past the name
   1.208 + * and the name is returned; otherwise pos is untouched and the result is empty.
   1.209 + */
   1.210 +UnicodeString ParseData::parseReference(const UnicodeString& text,
   1.211 +                                        ParsePosition& pos, int32_t limit) const {
   1.212 +    const int32_t begin = pos.getIndex();
   1.213 +    int32_t end = begin;
   1.214 +    for (; end < limit; ++end) {
   1.215 +        const UChar unit = text.charAt(end);
   1.216 +        // Every unit must be an ID part; the first must also be an ID start.
   1.217 +        if (!u_isIDPart(unit) || (end == begin && !u_isIDStart(unit))) {
   1.218 +            break;
   1.219 +        }
   1.220 +    }
   1.221 +    UnicodeString name;
   1.222 +    if (end > begin) { // at least one valid name character
   1.223 +        pos.setIndex(end);
   1.224 +        text.extractBetween(begin, end, name);
   1.225 +    }
   1.226 +    return name; // an empty string indicates failure
   1.227 +}
   1.228 +
   1.229 +UBool ParseData::isMatcher(UChar32 ch) {
   1.230 +    // Characters outside the stand-in range are plain and count as
   1.231 +    // matchers.  data.lookup() is unusable: the set array is not built yet.
   1.232 +    const int32_t idx = ch - data->variablesBase;
   1.233 +    if (idx < 0 || idx >= variablesVector->size()) {
   1.234 +        return TRUE;
   1.235 +    }
   1.236 +    UnicodeFunctor* functor = (UnicodeFunctor*) variablesVector->elementAt(idx);
   1.237 +    return functor != NULL && functor->toMatcher() != NULL;
   1.238 +}
   1.239 +
   1.240 +/**
   1.241 + * Return true if the given character is a replacer standin or a plain
   1.242 + * character (non standin).
   1.243 + */
   1.244 +UBool ParseData::isReplacer(UChar32 ch) {
   1.245 +    // Note that we cannot use data.lookup() because the
   1.246 +    // set array has not been constructed yet.
   1.247 +    int32_t i = ch - data->variablesBase; // fixed-width index, as in isMatcher()
   1.248 +    if (i >= 0 && i < variablesVector->size()) {
   1.249 +        UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i);
   1.250 +        return f != NULL && f->toReplacer() != NULL;
   1.251 +    }
   1.252 +    return TRUE; // outside the stand-in range: a plain character
   1.253 +}
   1.254 +
   1.255 +//----------------------------------------------------------------------
   1.256 +// BEGIN RuleHalf
   1.257 +//----------------------------------------------------------------------
   1.258 +
   1.259 +/**
   1.260 + * A class representing one side of a rule.  This class knows how to
   1.261 + * parse half of a rule.  It is tightly coupled to the method
   1.262 + * RuleBasedTransliterator.Parser.parseRule().
   1.263 + */
   1.264 +class RuleHalf : public UMemory {
   1.265 +
   1.266 +public:
   1.267 +    // Buffer that parse() clears and then fills via parseSection()
   1.268 +    UnicodeString text;
   1.269 +
   1.270 +    int32_t cursor; // position of cursor in text
   1.271 +    int32_t ante;   // position of ante context marker '{' in text
   1.272 +    int32_t post;   // position of post context marker '}' in text
   1.273 +
   1.274 +    // Record the offset to the cursor either to the left or to the
   1.275 +    // right of the key.  This is indicated by characters on the output
   1.276 +    // side that allow the cursor to be positioned arbitrarily within
   1.277 +    // the matching text.  For example, abc{def} > | @@@ xyz; changes
   1.278 +    // def to xyz and moves the cursor to before abc.  Offset characters
   1.279 +    // must be at the start or end, and they cannot move the cursor past
   1.280 +    // the ante- or postcontext text.  Placeholders are only valid in
   1.281 +    // output text.  The length of the ante and post context is
   1.282 +    // determined at runtime, because of supplementals and quantifiers.
   1.283 +    int32_t cursorOffset; // only nonzero on output side
   1.284 +
   1.285 +    // Position of first CURSOR_OFFSET on _right_.  This will be -1
   1.286 +    // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
   1.287 +    int32_t cursorOffsetPos;
   1.288 +    // Set by parseSection(): '^' at the start / a '$' treated as an end anchor
   1.289 +    UBool anchorStart;
   1.290 +    UBool anchorEnd;
   1.291 +
   1.292 +    /**
   1.293 +     * The segment number from 1..n of the next '(' we see
   1.294 +     * during parsing; 1-based.
   1.295 +     */
   1.296 +    int32_t nextSegmentNumber;
   1.297 +    // Parser used for stand-in generation and syntax error reporting
   1.298 +    TransliteratorParser& parser;
   1.299 +
   1.300 +    //--------------------------------------------------
   1.301 +    // Methods
   1.302 +
   1.303 +    RuleHalf(TransliteratorParser& parser);
   1.304 +    ~RuleHalf();
   1.305 +    // Parse one side of a rule into text; returns the index after the terminator
   1.306 +    int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
   1.307 +    // Parse a top-level half or one segment into buf; recursive for nested segments
   1.308 +    int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
   1.309 +                         UnicodeString& buf,
   1.310 +                         const UnicodeString& illegal,
   1.311 +                         UBool isSegment,
   1.312 +                         UErrorCode& status);
   1.313 +
   1.314 +    /**
   1.315 +     * Remove context.
   1.316 +     */
   1.317 +    void removeContext();
   1.318 +
   1.319 +    /**
   1.320 +     * Return true if this half looks like valid output, that is, does not
   1.321 +     * contain quantifiers or other special input-only elements.
   1.322 +     */
   1.323 +    UBool isValidOutput(TransliteratorParser& parser);
   1.324 +
   1.325 +    /**
   1.326 +     * Return true if this half looks like valid input, that is, does not
   1.327 +     * contain functions or other special output-only elements.
   1.328 +     */
   1.329 +    UBool isValidInput(TransliteratorParser& parser);
   1.330 +    // Forward a syntax error to the parser and return whatever it returns
   1.331 +    int syntaxError(UErrorCode code,
   1.332 +                    const UnicodeString& rule,
   1.333 +                    int32_t start,
   1.334 +                    UErrorCode& status) {
   1.335 +        return parser.syntaxError(code, rule, start, status);
   1.336 +    }
   1.337 +
   1.338 +private:
   1.339 +    // Disallowed methods; no impl.
   1.340 +    RuleHalf(const RuleHalf&);
   1.341 +    RuleHalf& operator=(const RuleHalf&);
   1.342 +};
   1.343 +
   1.344 +// Start with no cursor or context markers, no cursor offset, no
   1.345 +// anchors, and segment numbering beginning at 1.
   1.346 +RuleHalf::RuleHalf(TransliteratorParser& p) :
   1.347 +    cursor(-1),
   1.348 +    ante(-1),
   1.349 +    post(-1),
   1.350 +    cursorOffset(0),
   1.351 +    cursorOffsetPos(0),
   1.352 +    anchorStart(FALSE), anchorEnd(FALSE),
   1.353 +    nextSegmentNumber(1),
   1.354 +    parser(p) {}
   1.355 +
   1.356 +RuleHalf::~RuleHalf() { // empty body; parser is a reference and is not deleted
   1.357 +}
   1.358 +
   1.359 +/**
   1.360 + * Parse one side of a rule into text, stopping at the limit, the
   1.361 + * END_OF_RULE character, or an operator.
   1.362 + * @return the index after the terminating character, or limit if the
   1.363 + * limit was reached; on a misplaced cursor offset, the syntaxError() result
   1.364 + */
   1.365 +int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
   1.366 +    const int32_t ruleStart = pos;
   1.367 +    const UnicodeString topLevelIllegal(TRUE, ILLEGAL_TOP, -1);
   1.368 +    text.truncate(0);
   1.369 +    const int32_t next = parseSection(rule, pos, limit, text, topLevelIllegal, FALSE, status);
   1.370 +
   1.371 +    // A positive cursor offset is accepted only when cursor equals
   1.372 +    // cursorOffsetPos.
   1.373 +    const UBool misplaced = cursorOffset > 0 && cursor != cursorOffsetPos;
   1.374 +    return misplaced ? syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, ruleStart, status) : next;
   1.375 +}
   1.376 + 
   1.377 +/**
   1.378 + * Parse a section of one side of a rule, stopping at either
   1.379 + * the limit, the END_OF_RULE character, an operator, or a
   1.380 + * segment close character.  This method parses both a
   1.381 + * top-level rule half and a segment within such a rule half.
   1.382 + * It calls itself recursively to parse segments and nested
   1.383 + * segments.
   1.384 + * @param buf buffer into which to accumulate the rule pattern
   1.385 + * characters, either literal characters from the rule or
   1.386 + * standins for UnicodeMatcher objects including segments.
   1.387 + * @param illegal the set of special characters that is illegal during
   1.388 + * this parse.
   1.389 + * @param isSegment if true, then we've already seen a '(' and
   1.390 + * pos on entry points right after it.  Accumulate everything
   1.391 + * up to the closing ')', put it in a segment matcher object,
   1.392 + * generate a standin for it, and add the standin to buf.  As
   1.393 + * a side effect, update the segments vector with a reference
   1.394 + * to the segment matcher.  This works recursively for nested
   1.395 + * segments.  If isSegment is false, just accumulate
   1.396 + * characters into buf.
   1.397 + * @return the index after the terminating character, or
   1.398 + * if limit was reached, limit
   1.399 + */
   1.400 +int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
   1.401 +                               UnicodeString& buf,
   1.402 +                               const UnicodeString& illegal,
   1.403 +                               UBool isSegment, UErrorCode& status) {
   1.404 +    int32_t start = pos;
   1.405 +    ParsePosition pp;
   1.406 +    UnicodeString scratch;
   1.407 +    UBool done = FALSE;
   1.408 +    int32_t quoteStart = -1; // Most recent 'single quoted string'
   1.409 +    int32_t quoteLimit = -1;
   1.410 +    int32_t varStart = -1; // Most recent $variableReference
   1.411 +    int32_t varLimit = -1;
   1.412 +    int32_t bufStart = buf.length();
   1.413 +    
   1.414 +    while (pos < limit && !done) {
   1.415 +        // Since all syntax characters are in the BMP, fetching
   1.416 +        // 16-bit code units suffices here.
   1.417 +        UChar c = rule.charAt(pos++);
   1.418 +        if (PatternProps::isWhiteSpace(c)) {
   1.419 +            // Ignore whitespace.  Note that this is not Unicode
   1.420 +            // spaces, but Java spaces -- a subset, representing
   1.421 +            // whitespace likely to be seen in code.
   1.422 +            continue;
   1.423 +        }
   1.424 +        if (u_strchr(HALF_ENDERS, c) != NULL) {
   1.425 +            if (isSegment) {
   1.426 +                // Unclosed segment
   1.427 +                return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status);
   1.428 +            }
   1.429 +            break;
   1.430 +        }
   1.431 +        if (anchorEnd) {
   1.432 +            // Text after a presumed end anchor is a syntax err
   1.433 +            return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status);
   1.434 +        }
   1.435 +        if (UnicodeSet::resemblesPattern(rule, pos-1)) {
   1.436 +            pp.setIndex(pos-1); // Backup to opening '['
   1.437 +            buf.append(parser.parseSet(rule, pp, status));
   1.438 +            if (U_FAILURE(status)) {
   1.439 +                return syntaxError(U_MALFORMED_SET, rule, start, status);
   1.440 +            }
   1.441 +            pos = pp.getIndex();                    
   1.442 +            continue;
   1.443 +        }
   1.444 +        // Handle escapes
   1.445 +        if (c == ESCAPE) {
   1.446 +            if (pos == limit) {
   1.447 +                return syntaxError(U_TRAILING_BACKSLASH, rule, start, status);
   1.448 +            }
   1.449 +            UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\'
   1.450 +            if (escaped == (UChar32) -1) {
   1.451 +                return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status);
   1.452 +            }
   1.453 +            if (!parser.checkVariableRange(escaped)) {
   1.454 +                return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
   1.455 +            }
   1.456 +            buf.append(escaped);
   1.457 +            continue;
   1.458 +        }
   1.459 +        // Handle quoted matter
   1.460 +        if (c == QUOTE) {
   1.461 +            int32_t iq = rule.indexOf(QUOTE, pos);
   1.462 +            if (iq == pos) {
   1.463 +                buf.append(c); // Parse [''] outside quotes as [']
   1.464 +                ++pos;
   1.465 +            } else {
   1.466 +                /* This loop picks up a run of quoted text of the
   1.467 +                 * form 'aaaa' each time through.  If this run
   1.468 +                 * hasn't really ended ('aaaa''bbbb') then it keeps
   1.469 +                 * looping, each time adding on a new run.  When it
   1.470 +                 * reaches the final quote it breaks.
   1.471 +                 */
   1.472 +                quoteStart = buf.length();
   1.473 +                for (;;) {
   1.474 +                    if (iq < 0) {
   1.475 +                        return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status);
   1.476 +                    }
   1.477 +                    scratch.truncate(0);
   1.478 +                    rule.extractBetween(pos, iq, scratch);
   1.479 +                    buf.append(scratch);
   1.480 +                    pos = iq+1;
   1.481 +                    if (pos < limit && rule.charAt(pos) == QUOTE) {
   1.482 +                        // Parse [''] inside quotes as [']
   1.483 +                        iq = rule.indexOf(QUOTE, pos+1);
   1.484 +                        // Continue looping
   1.485 +                    } else {
   1.486 +                        break;
   1.487 +                    }
   1.488 +                }
   1.489 +                quoteLimit = buf.length();
   1.490 +
   1.491 +                for (iq=quoteStart; iq<quoteLimit; ++iq) {
   1.492 +                    if (!parser.checkVariableRange(buf.charAt(iq))) {
   1.493 +                        return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
   1.494 +                    }
   1.495 +                }
   1.496 +            }
   1.497 +            continue;
   1.498 +        }
   1.499 +
   1.500 +        if (!parser.checkVariableRange(c)) {
   1.501 +            return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
   1.502 +        }
   1.503 +
   1.504 +        if (illegal.indexOf(c) >= 0) {
   1.505 +            syntaxError(U_ILLEGAL_CHARACTER, rule, start, status);
   1.506 +        }
   1.507 +
   1.508 +        switch (c) {
   1.509 +                    
   1.510 +        //------------------------------------------------------
   1.511 +        // Elements allowed within and out of segments
   1.512 +        //------------------------------------------------------
   1.513 +        case ANCHOR_START:
   1.514 +            if (buf.length() == 0 && !anchorStart) {
   1.515 +                anchorStart = TRUE;
   1.516 +            } else {
   1.517 +              return syntaxError(U_MISPLACED_ANCHOR_START,
   1.518 +                                 rule, start, status);
   1.519 +            }
   1.520 +          break;
   1.521 +        case SEGMENT_OPEN:
   1.522 +            {
   1.523 +                // bufSegStart is the offset in buf to the first
   1.524 +                // character of the segment we are parsing.
   1.525 +                int32_t bufSegStart = buf.length();
   1.526 +                
   1.527 +                // Record segment number now, since nextSegmentNumber
   1.528 +                // will be incremented during the call to parseSection
   1.529 +                // if there are nested segments.
   1.530 +                int32_t segmentNumber = nextSegmentNumber++; // 1-based
   1.531 +                
   1.532 +                // Parse the segment
   1.533 +                pos = parseSection(rule, pos, limit, buf, UnicodeString(TRUE, ILLEGAL_SEG, -1), TRUE, status);
   1.534 +                
   1.535 +                // After parsing a segment, the relevant characters are
   1.536 +                // in buf, starting at offset bufSegStart.  Extract them
   1.537 +                // into a string matcher, and replace them with a
   1.538 +                // standin for that matcher.
   1.539 +                StringMatcher* m =
   1.540 +                    new StringMatcher(buf, bufSegStart, buf.length(),
   1.541 +                                      segmentNumber, *parser.curData);
   1.542 +                if (m == NULL) {
   1.543 +                    return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
   1.544 +                }
   1.545 +                
   1.546 +                // Record and associate object and segment number
   1.547 +                parser.setSegmentObject(segmentNumber, m, status);
   1.548 +                buf.truncate(bufSegStart);
   1.549 +                buf.append(parser.getSegmentStandin(segmentNumber, status));
   1.550 +            }
   1.551 +            break;
   1.552 +        case FUNCTION:
   1.553 +        case ALT_FUNCTION:
   1.554 +            {
   1.555 +                int32_t iref = pos;
   1.556 +                TransliteratorIDParser::SingleID* single =
   1.557 +                    TransliteratorIDParser::parseFilterID(rule, iref);
   1.558 +                // The next character MUST be a segment open
   1.559 +                if (single == NULL ||
   1.560 +                    !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) {
   1.561 +                    return syntaxError(U_INVALID_FUNCTION, rule, start, status);
   1.562 +                }
   1.563 +                
   1.564 +                Transliterator *t = single->createInstance();
   1.565 +                delete single;
   1.566 +                if (t == NULL) {
   1.567 +                    return syntaxError(U_INVALID_FUNCTION, rule, start, status);
   1.568 +                }
   1.569 +                
   1.570 +                // bufSegStart is the offset in buf to the first
   1.571 +                // character of the segment we are parsing.
   1.572 +                int32_t bufSegStart = buf.length();
   1.573 +                
   1.574 +                // Parse the segment
   1.575 +                pos = parseSection(rule, iref, limit, buf, UnicodeString(TRUE, ILLEGAL_FUNC, -1), TRUE, status);
   1.576 +                
   1.577 +                // After parsing a segment, the relevant characters are
   1.578 +                // in buf, starting at offset bufSegStart.
   1.579 +                UnicodeString output;
   1.580 +                buf.extractBetween(bufSegStart, buf.length(), output);
   1.581 +                FunctionReplacer *r =
   1.582 +                    new FunctionReplacer(t, new StringReplacer(output, parser.curData));
   1.583 +                if (r == NULL) {
   1.584 +                    return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
   1.585 +                }
   1.586 +                
   1.587 +                // Replace the buffer contents with a stand-in
   1.588 +                buf.truncate(bufSegStart);
   1.589 +                buf.append(parser.generateStandInFor(r, status));
   1.590 +            }
   1.591 +            break;
   1.592 +        case SymbolTable::SYMBOL_REF:
   1.593 +            // Handle variable references and segment references "$1" .. "$9"
   1.594 +            {
   1.595 +                // A variable reference must be followed immediately
   1.596 +                // by a Unicode identifier start and zero or more
   1.597 +                // Unicode identifier part characters, or by a digit
   1.598 +                // 1..9 if it is a segment reference.
   1.599 +                if (pos == limit) {
   1.600 +                    // A variable ref character at the end acts as
   1.601 +                    // an anchor to the context limit, as in perl.
   1.602 +                    anchorEnd = TRUE;
   1.603 +                    break;
   1.604 +                }
   1.605 +                // Parse "$1" "$2" .. "$9" .. (no upper limit)
   1.606 +                c = rule.charAt(pos);
   1.607 +                int32_t r = u_digit(c, 10);
   1.608 +                if (r >= 1 && r <= 9) {
   1.609 +                    r = ICU_Utility::parseNumber(rule, pos, 10);
   1.610 +                    if (r < 0) {
   1.611 +                        return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE,
   1.612 +                                           rule, start, status);
   1.613 +                    }
   1.614 +                    buf.append(parser.getSegmentStandin(r, status));
   1.615 +                } else {
   1.616 +                    pp.setIndex(pos);
   1.617 +                    UnicodeString name = parser.parseData->
   1.618 +                                    parseReference(rule, pp, limit);
   1.619 +                    if (name.length() == 0) {
   1.620 +                        // This means the '$' was not followed by a
   1.621 +                        // valid name.  Try to interpret it as an
   1.622 +                        // end anchor then.  If this also doesn't work
   1.623 +                        // (if we see a following character) then signal
   1.624 +                        // an error.
   1.625 +                        anchorEnd = TRUE;
   1.626 +                        break;
   1.627 +                    }
   1.628 +                    pos = pp.getIndex();
   1.629 +                    // If this is a variable definition statement,
   1.630 +                    // then the LHS variable will be undefined.  In
   1.631 +                    // that case appendVariableDef() will append the
   1.632 +                    // special placeholder char variableLimit-1.
   1.633 +                    varStart = buf.length();
   1.634 +                    parser.appendVariableDef(name, buf, status);
   1.635 +                    varLimit = buf.length();
   1.636 +                }
   1.637 +            }
   1.638 +            break;
   1.639 +        case DOT:
   1.640 +            buf.append(parser.getDotStandIn(status));
   1.641 +            break;
   1.642 +        case KLEENE_STAR:
   1.643 +        case ONE_OR_MORE:
   1.644 +        case ZERO_OR_ONE:
   1.645 +            // Quantifiers.  We handle single characters, quoted strings,
   1.646 +            // variable references, and segments.
   1.647 +            //  a+      matches  aaa
   1.648 +            //  'foo'+  matches  foofoofoo
   1.649 +            //  $v+     matches  xyxyxy if $v == xy
   1.650 +            //  (seg)+  matches  segsegseg
   1.651 +            {
   1.652 +                if (isSegment && buf.length() == bufStart) {
   1.653 +                    // The */+ immediately follows '('
   1.654 +                    return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status);
   1.655 +                }
   1.656 +
   1.657 +                int32_t qstart, qlimit;
   1.658 +                // The */+ follows an isolated character or quote
   1.659 +                // or variable reference
   1.660 +                if (buf.length() == quoteLimit) {
   1.661 +                    // The */+ follows a 'quoted string'
   1.662 +                    qstart = quoteStart;
   1.663 +                    qlimit = quoteLimit;
   1.664 +                } else if (buf.length() == varLimit) {
   1.665 +                    // The */+ follows a $variableReference
   1.666 +                    qstart = varStart;
   1.667 +                    qlimit = varLimit;
   1.668 +                } else {
   1.669 +                    // The */+ follows a single character, possibly
   1.670 +                    // a segment standin
   1.671 +                    qstart = buf.length() - 1;
   1.672 +                    qlimit = qstart + 1;
   1.673 +                }
   1.674 +
   1.675 +                UnicodeFunctor *m =
   1.676 +                    new StringMatcher(buf, qstart, qlimit, 0, *parser.curData);
   1.677 +                if (m == NULL) {
   1.678 +                    return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
   1.679 +                }
   1.680 +                int32_t min = 0;
   1.681 +                int32_t max = Quantifier::MAX;
   1.682 +                switch (c) {
   1.683 +                case ONE_OR_MORE:
   1.684 +                    min = 1;
   1.685 +                    break;
   1.686 +                case ZERO_OR_ONE:
   1.687 +                    min = 0;
   1.688 +                    max = 1;
   1.689 +                    break;
   1.690 +                // case KLEENE_STAR:
   1.691 +                //    do nothing -- min, max already set
   1.692 +                }
   1.693 +                m = new Quantifier(m, min, max);
   1.694 +                if (m == NULL) {
   1.695 +                    return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
   1.696 +                }
   1.697 +                buf.truncate(qstart);
   1.698 +                buf.append(parser.generateStandInFor(m, status));
   1.699 +            }
   1.700 +            break;
   1.701 +
   1.702 +        //------------------------------------------------------
   1.703 +        // Elements allowed ONLY WITHIN segments
   1.704 +        //------------------------------------------------------
   1.705 +        case SEGMENT_CLOSE:
   1.706 +            // assert(isSegment);
   1.707 +            // We're done parsing a segment.
   1.708 +            done = TRUE;
   1.709 +            break;
   1.710 +
   1.711 +        //------------------------------------------------------
   1.712 +        // Elements allowed ONLY OUTSIDE segments
   1.713 +        //------------------------------------------------------
   1.714 +        case CONTEXT_ANTE:
   1.715 +            if (ante >= 0) {
   1.716 +                return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status);
   1.717 +            }
   1.718 +            ante = buf.length();
   1.719 +            break;
   1.720 +        case CONTEXT_POST:
   1.721 +            if (post >= 0) {
   1.722 +                return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status);
   1.723 +            }
   1.724 +            post = buf.length();
   1.725 +            break;
   1.726 +        case CURSOR_POS:
   1.727 +            if (cursor >= 0) {
   1.728 +                return syntaxError(U_MULTIPLE_CURSORS, rule, start, status);
   1.729 +            }
   1.730 +            cursor = buf.length();
   1.731 +            break;
   1.732 +        case CURSOR_OFFSET:
   1.733 +            if (cursorOffset < 0) {
   1.734 +                if (buf.length() > 0) {
   1.735 +                    return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
   1.736 +                }
   1.737 +                --cursorOffset;
   1.738 +            } else if (cursorOffset > 0) {
   1.739 +                if (buf.length() != cursorOffsetPos || cursor >= 0) {
   1.740 +                    return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
   1.741 +                }
   1.742 +                ++cursorOffset;
   1.743 +            } else {
   1.744 +                if (cursor == 0 && buf.length() == 0) {
   1.745 +                    cursorOffset = -1;
   1.746 +                } else if (cursor < 0) {
   1.747 +                    cursorOffsetPos = buf.length();
   1.748 +                    cursorOffset = 1;
   1.749 +                } else {
   1.750 +                    return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
   1.751 +                }
   1.752 +            }
   1.753 +            break;
   1.754 +
   1.755 +
   1.756 +        //------------------------------------------------------
   1.757 +        // Non-special characters
   1.758 +        //------------------------------------------------------
   1.759 +        default:
   1.760 +            // Disallow unquoted characters other than [0-9A-Za-z]
   1.761 +            // in the printable ASCII range.  These characters are
   1.762 +            // reserved for possible future use.
   1.763 +            if (c >= 0x0021 && c <= 0x007E &&
   1.764 +                !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
   1.765 +                  (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
   1.766 +                  (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) {
   1.767 +                return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);
   1.768 +            }
   1.769 +            buf.append(c);
   1.770 +            break;
   1.771 +        }
   1.772 +    }
   1.773 +
   1.774 +    return pos;
   1.775 +}
   1.776 +
   1.777 +/**
   1.778 + * Remove context: trim text down to the span between the ante and post markers (when set), then clear both markers and both anchor flags.
   1.779 + */
   1.780 +void RuleHalf::removeContext() {
   1.781 +    // Same effect as: text = text.substring(ante < 0 ? 0 : ante,
   1.782 +    //                                       post < 0 ? text.length() : post);
   1.783 +    if (post >= 0) { // drop the tail first: 'ante' and 'post' both index the untrimmed text
   1.784 +        text.remove(post);
   1.785 +    }
   1.786 +    if (ante >= 0) {
   1.787 +        text.removeBetween(0, ante);
   1.788 +    }
   1.789 +    ante = post = -1; // -1 means "no marker" (both tests above use >= 0)
   1.790 +    anchorStart = anchorEnd = FALSE;
   1.791 +}
   1.792 +
   1.793 +/**
   1.794 + * Return true if this half looks like valid output, that is, does not
   1.795 + * contain quantifiers or other special input-only elements.  Every code point of text must pass parseData->isReplacer(); empty text is accepted.
   1.796 + */
   1.797 +UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) {
   1.798 +    for (int32_t i=0; i<text.length(); ) { // i is advanced inside the body, one code point at a time
   1.799 +        UChar32 c = text.char32At(i);
   1.800 +        i += U16_LENGTH(c); // step by the code point's UTF-16 length so surrogate pairs are not split
   1.801 +        if (!transParser.parseData->isReplacer(c)) {
   1.802 +            return FALSE;
   1.803 +        }
   1.804 +    }
   1.805 +    return TRUE;
   1.806 +}
   1.807 +
   1.808 +/**
   1.809 + * Return true if this half looks like valid input, that is, does not
   1.810 + * contain functions or other special output-only elements.  Every code point of text must pass parseData->isMatcher(); empty text is accepted.
   1.811 + */
   1.812 +UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {
   1.813 +    for (int32_t i=0; i<text.length(); ) { // i is advanced inside the body, one code point at a time
   1.814 +        UChar32 c = text.char32At(i);
   1.815 +        i += U16_LENGTH(c); // step by the code point's UTF-16 length so surrogate pairs are not split
   1.816 +        if (!transParser.parseData->isMatcher(c)) {
   1.817 +            return FALSE;
   1.818 +        }
   1.819 +    }
   1.820 +    return TRUE;
   1.821 +}
   1.822 +
   1.823 +//----------------------------------------------------------------------
   1.824 +// PUBLIC API
   1.825 +//----------------------------------------------------------------------
   1.826 +
   1.827 +/**
   1.828 + * Constructor.  The four UVector members are built with statusReturn; curData, compoundFilter and parseData start out NULL and are assigned later by parseRules().
   1.829 + */
   1.830 +TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) :
   1.831 +dataVector(statusReturn),
   1.832 +idBlockVector(statusReturn),
   1.833 +variablesVector(statusReturn),
   1.834 +segmentObjects(statusReturn)
   1.835 +{
   1.836 +    idBlockVector.setDeleter(uprv_deleteUObject); // idBlockVector deletes its elements itself
   1.837 +    curData = NULL;
   1.838 +    compoundFilter = NULL;
   1.839 +    parseData = NULL;
   1.840 +    variableNames.setValueDeleter(uprv_deleteUObject); // variableNames deletes its values itself
   1.841 +}
   1.842 +
   1.843 +/**
   1.844 + * Destructor.  The constructor installs no deleter on dataVector or variablesVector, so their remaining elements are deleted by hand here.
   1.845 + */
   1.846 +TransliteratorParser::~TransliteratorParser() {
   1.847 +    while (!dataVector.isEmpty()) // orphan one element at a time and delete it with its concrete type
   1.848 +        delete (TransliterationRuleData*)(dataVector.orphanElementAt(0));
   1.849 +    delete compoundFilter; // NULL if never parsed or already handed out by orphanCompoundFilter()
   1.850 +    delete parseData;
   1.851 +    while (!variablesVector.isEmpty()) // empty after a successful parseRules(), which clears it without deleting
   1.852 +        delete (UnicodeFunctor*)variablesVector.orphanElementAt(0);
   1.853 +}
   1.854 +
   1.855 +void
   1.856 +TransliteratorParser::parse(const UnicodeString& rules,
   1.857 +                            UTransDirection transDirection,
   1.858 +                            UParseError& pe,
   1.859 +                            UErrorCode& ec) { // public wrapper around parseRules(); errors are reported through ec and pe
   1.860 +    if (U_SUCCESS(ec)) { // if ec already holds a failure, nothing is parsed and pe is left untouched
   1.861 +        parseRules(rules, transDirection, ec);
   1.862 +        pe = parseError; // copied whether or not parseRules() succeeded
   1.863 +    }
   1.864 +}
   1.865 +
   1.866 +/**
   1.867 + * Return the compound filter parsed by parse().  Caller owns result.  May return NULL; the parser's own pointer is cleared, so a second call returns NULL.
   1.868 + */ 
   1.869 +UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
   1.870 +    UnicodeSet* f = compoundFilter;
   1.871 +    compoundFilter = NULL; // give up ownership so the destructor does not delete it
   1.872 +    return f;
   1.873 +}
   1.874 +
   1.875 +//----------------------------------------------------------------------
   1.876 +// Private implementation
   1.877 +//----------------------------------------------------------------------
   1.878 +
   1.879 +/**
   1.880 + * Parse the given string as a sequence of rules, separated by newline
   1.881 + * characters ('\n'), and cause this object to implement those rules.  Any
   1.882 + * previous rules are discarded.  Typically this method is called exactly
   1.883 + * once, during construction.
   1.884 + * @param status set to a failure code (with details in parseError) if
   1.885 + * there is a syntax error in the rules or an allocation fails
   1.886 + */
   1.887 +void TransliteratorParser::parseRules(const UnicodeString& rule,
   1.888 +                                      UTransDirection theDirection,
   1.889 +                                      UErrorCode& status)
   1.890 +{
   1.891 +    // Clear error struct
   1.892 +    uprv_memset(&parseError, 0, sizeof(parseError));
   1.893 +    parseError.line = parseError.offset = -1;
   1.894 +
   1.895 +    UBool parsingIDs = TRUE;
   1.896 +    int32_t ruleCount = 0;
   1.897 +    
   1.898 +    while (!dataVector.isEmpty()) {
   1.899 +        delete (TransliterationRuleData*)(dataVector.orphanElementAt(0));
   1.900 +    }
   1.901 +    if (U_FAILURE(status)) {
   1.902 +        return;
   1.903 +    }
   1.904 +
   1.905 +    idBlockVector.removeAllElements();
   1.906 +    curData = NULL;
   1.907 +    direction = theDirection;
   1.908 +    ruleCount = 0;
   1.909 +
   1.910 +    delete compoundFilter;
   1.911 +    compoundFilter = NULL;
   1.912 +
   1.913 +    while (!variablesVector.isEmpty()) {
   1.914 +        delete (UnicodeFunctor*)variablesVector.orphanElementAt(0);
   1.915 +    }
   1.916 +    variableNames.removeAll();
   1.917 +    parseData = new ParseData(0, &variablesVector, &variableNames); // NOTE(review): a ParseData left by an earlier call is not deleted here -- confirm this runs once per parser
   1.918 +    if (parseData == NULL) {
   1.919 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.920 +        return;
   1.921 +    }
   1.922 +
   1.923 +    dotStandIn = (UChar) -1;
   1.924 +
   1.925 +    UnicodeString *tempstr = NULL; // used for memory allocation error checking
   1.926 +    UnicodeString str; // scratch
   1.927 +    UnicodeString idBlockResult;
   1.928 +    int32_t pos = 0;
   1.929 +    int32_t limit = rule.length();
   1.930 +
   1.931 +    // The compound filter offset is an index into idBlockResult.
   1.932 +    // If it is 0, then the compound filter occurred at the start,
   1.933 +    // and it is the offset to the _start_ of the compound filter
   1.934 +    // pattern.  Otherwise it is the offset to the _limit_ of the
   1.935 +    // compound filter pattern within idBlockResult.
   1.936 +    compoundFilter = NULL;
   1.937 +    int32_t compoundFilterOffset = -1;
   1.938 +
   1.939 +    while (pos < limit && U_SUCCESS(status)) {
   1.940 +        UChar c = rule.charAt(pos++);
   1.941 +        if (PatternProps::isWhiteSpace(c)) {
   1.942 +            // Ignore leading whitespace.
   1.943 +            continue;
   1.944 +        }
   1.945 +        // Skip lines starting with the comment character
   1.946 +        if (c == RULE_COMMENT_CHAR) {
   1.947 +            pos = rule.indexOf((UChar)0x000A /*\n*/, pos) + 1;
   1.948 +            if (pos == 0) {
   1.949 +                break; // No "\n" found; rest of rule is a comment
   1.950 +            }
   1.951 +            continue; // Either fall out or restart with next line
   1.952 +        }
   1.953 +
   1.954 +        // skip empty rules
   1.955 +        if (c == END_OF_RULE)
   1.956 +            continue;
   1.957 +
   1.958 +        // keep track of how many rules we've seen
   1.959 +        ++ruleCount;
   1.960 +        
   1.961 +        // We've found the start of a rule or ID.  c is its first
   1.962 +        // character, and pos points past c.
   1.963 +        --pos;
   1.964 +        // Look for an ID token.  Must have at least ID_TOKEN_LEN + 1
   1.965 +        // chars left.
   1.966 +        if ((pos + ID_TOKEN_LEN + 1) <= limit &&
   1.967 +                rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
   1.968 +            pos += ID_TOKEN_LEN;
   1.969 +            c = rule.charAt(pos);
   1.970 +            while (PatternProps::isWhiteSpace(c) && pos < limit) {
   1.971 +                ++pos;
   1.972 +                c = rule.charAt(pos);
   1.973 +            }
   1.974 +
   1.975 +            int32_t p = pos;
   1.976 +            
   1.977 +            if (!parsingIDs) {
   1.978 +                if (curData != NULL) {
   1.979 +                    if (direction == UTRANS_FORWARD)
   1.980 +                        dataVector.addElement(curData, status);
   1.981 +                    else
   1.982 +                        dataVector.insertElementAt(curData, 0, status);
   1.983 +                    curData = NULL;
   1.984 +                }
   1.985 +                parsingIDs = TRUE;
   1.986 +            }
   1.987 +
   1.988 +            TransliteratorIDParser::SingleID* id =
   1.989 +                TransliteratorIDParser::parseSingleID(rule, p, direction, status);
   1.990 +            if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) {
   1.991 +                // Successful ::ID parse.
   1.992 +
   1.993 +                if (direction == UTRANS_FORWARD) {
   1.994 +                    idBlockResult.append(id->canonID).append(END_OF_RULE);
   1.995 +                } else {
   1.996 +                    idBlockResult.insert(0, END_OF_RULE);
   1.997 +                    idBlockResult.insert(0, id->canonID);
   1.998 +                }
   1.999 +
  1.1000 +            } else {
  1.1001 +                // Couldn't parse an ID.  Try to parse a global filter
  1.1002 +                int32_t withParens = -1;
  1.1003 +                UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL);
  1.1004 +                if (f != NULL) {
  1.1005 +                    if (ICU_Utility::parseChar(rule, p, END_OF_RULE)
  1.1006 +                        && (direction == UTRANS_FORWARD) == (withParens == 0))
  1.1007 +                    {
  1.1008 +                        if (compoundFilter != NULL) {
  1.1009 +                            // Multiple compound filters
  1.1010 +                            syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status);
  1.1011 +                            delete f;
  1.1012 +                        } else {
  1.1013 +                            compoundFilter = f;
  1.1014 +                            compoundFilterOffset = ruleCount;
  1.1015 +                        }
  1.1016 +                    } else {
  1.1017 +                        delete f;
  1.1018 +                    }
  1.1019 +                } else {
  1.1020 +                    // Invalid ::id
  1.1021 +                    // Can be parsed as neither an ID nor a global filter
  1.1022 +                    syntaxError(U_INVALID_ID, rule, pos, status);
  1.1023 +                }
  1.1024 +            }
  1.1025 +            delete id;
  1.1026 +            pos = p;
  1.1027 +        } else {
  1.1028 +            if (parsingIDs) {
  1.1029 +                tempstr = new UnicodeString(idBlockResult);
  1.1030 +                // NULL pointer check
  1.1031 +                if (tempstr == NULL) {
  1.1032 +                    status = U_MEMORY_ALLOCATION_ERROR;
  1.1033 +                    return;
  1.1034 +                }
  1.1035 +                if (direction == UTRANS_FORWARD)
  1.1036 +                    idBlockVector.addElement(tempstr, status);
  1.1037 +                else
  1.1038 +                    idBlockVector.insertElementAt(tempstr, 0, status);
  1.1039 +                idBlockResult.remove();
  1.1040 +                parsingIDs = FALSE;
  1.1041 +                curData = new TransliterationRuleData(status);
  1.1042 +                // NULL pointer check
  1.1043 +                if (curData == NULL) {
  1.1044 +                    status = U_MEMORY_ALLOCATION_ERROR;
  1.1045 +                    return;
  1.1046 +                }
  1.1047 +                parseData->data = curData;
  1.1048 +
  1.1049 +                // By default, rules use part of the private use area
  1.1050 +                // E000..F8FF for variables and other stand-ins.  Currently
  1.1051 +                // the range F000..F8FF is typically sufficient.  The 'use
  1.1052 +                // variable range' pragma allows rule sets to modify this.
  1.1053 +                setVariableRange(0xF000, 0xF8FF, status);
  1.1054 +            }
  1.1055 +
  1.1056 +            if (resemblesPragma(rule, pos, limit)) {
  1.1057 +                int32_t ppp = parsePragma(rule, pos, limit, status);
  1.1058 +                if (ppp < 0) {
  1.1059 +                    syntaxError(U_MALFORMED_PRAGMA, rule, pos, status);
  1.1060 +                }
  1.1061 +                pos = ppp; // may be negative here; the failed status then ends the loop
  1.1062 +            // Parse a rule
  1.1063 +            } else {
  1.1064 +                pos = parseRule(rule, pos, limit, status);
  1.1065 +            }
  1.1066 +        }
  1.1067 +    }
  1.1068 +
  1.1069 +    if (parsingIDs && idBlockResult.length() > 0) {
  1.1070 +        tempstr = new UnicodeString(idBlockResult);
  1.1071 +        // NULL pointer check
  1.1072 +        if (tempstr == NULL) {
  1.1073 +            status = U_MEMORY_ALLOCATION_ERROR;
  1.1074 +            return;
  1.1075 +        }
  1.1076 +        if (direction == UTRANS_FORWARD)
  1.1077 +            idBlockVector.addElement(tempstr, status);
  1.1078 +        else
  1.1079 +            idBlockVector.insertElementAt(tempstr, 0, status);
  1.1080 +    }
  1.1081 +    else if (!parsingIDs && curData != NULL) {
  1.1082 +        if (direction == UTRANS_FORWARD)
  1.1083 +            dataVector.addElement(curData, status);
  1.1084 +        else
  1.1085 +            dataVector.insertElementAt(curData, 0, status);
  1.1086 +    }
  1.1087 +    
  1.1088 +    if (U_SUCCESS(status)) {
  1.1089 +        // Convert the set vector to an array
  1.1090 +        int32_t i, dataVectorSize = dataVector.size();
  1.1091 +        for (i = 0; i < dataVectorSize; i++) {
  1.1092 +            TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i);
  1.1093 +            data->variablesLength = variablesVector.size();
  1.1094 +            if (data->variablesLength == 0) {
  1.1095 +                data->variables = 0;
  1.1096 +            } else {
  1.1097 +                data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*));
  1.1098 +                // NULL pointer check
  1.1099 +                if (data->variables == NULL) {
  1.1100 +                    status = U_MEMORY_ALLOCATION_ERROR;
  1.1101 +                    return;
  1.1102 +                }
  1.1103 +                data->variablesAreOwned = (i == 0); // only the first data object is flagged as owner of the shared functors
  1.1104 +            }
  1.1105 +
  1.1106 +            for (int32_t j = 0; j < data->variablesLength; j++) {
  1.1107 +                data->variables[j] = // elements are UnicodeFunctor*, not necessarily UnicodeSet*, so cast to the base type
  1.1108 +                    static_cast<UnicodeFunctor*>(variablesVector.elementAt(j));
  1.1109 +            }
  1.1110 +            
  1.1111 +            data->variableNames.removeAll();
  1.1112 +            int32_t pos = -1;
  1.1113 +            const UHashElement* he = variableNames.nextElement(pos);
  1.1114 +            while (he != NULL) {
  1.1115 +                UnicodeString* tempus = (UnicodeString*)(((UnicodeString*)(he->value.pointer))->clone());
  1.1116 +                if (tempus == NULL) {
  1.1117 +                    status = U_MEMORY_ALLOCATION_ERROR;
  1.1118 +                    return;
  1.1119 +                }
  1.1120 +                data->variableNames.put(*((UnicodeString*)(he->key.pointer)),
  1.1121 +                    tempus, status);
  1.1122 +                he = variableNames.nextElement(pos);
  1.1123 +            }
  1.1124 +        }
  1.1125 +        variablesVector.removeAllElements();   // keeps them from getting deleted when we succeed
  1.1126 +
  1.1127 +        // Index the rules
  1.1128 +        if (compoundFilter != NULL) {
  1.1129 +            if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) ||
  1.1130 +                (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) {
  1.1131 +                status = U_MISPLACED_COMPOUND_FILTER;
  1.1132 +            }
  1.1133 +        }        
  1.1134 +
  1.1135 +        for (i = 0; i < dataVectorSize; i++) {
  1.1136 +            TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i);
  1.1137 +            data->ruleSet.freeze(parseError, status);
  1.1138 +        }
  1.1139 +        if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementAt(0))->isEmpty()) {
  1.1140 +            idBlockVector.removeElementAt(0);
  1.1141 +        }
  1.1142 +    }
  1.1143 +}
  1.1144 +
  1.1145 +/**
  1.1146 + * Set the variable range to [start, end] (inclusive).  If start > end or the range is not within 0..0xFFFF, status is set to U_MALFORMED_PRAGMA and nothing else changes.
  1.1147 + */
  1.1148 +void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) {
  1.1149 +    if (start > end || start < 0 || end > 0xFFFF) {
  1.1150 +        status = U_MALFORMED_PRAGMA;
  1.1151 +        return;
  1.1152 +    }
  1.1153 +    
  1.1154 +    curData->variablesBase = (UChar) start; // NOTE(review): curData is dereferenced unchecked -- confirm every caller has set it
  1.1155 +    if (dataVector.size() == 0) { // the allocation cursor and limit are only reset while no data object has been stored yet
  1.1156 +        variableNext = (UChar) start;
  1.1157 +        variableLimit = (UChar) (end + 1); // NOTE(review): end == 0xFFFF passes the check above; confirm the cast cannot wrap to 0
  1.1158 +    }
  1.1159 +}
  1.1160 +
  1.1161 +/**
  1.1162 + * Assert that the given character is NOT within the variable range.
  1.1163 + * If it is, return FALSE.  This is necessary to ensure that the
  1.1164 + * variable range does not overlap characters used in a rule.  The range tested is [curData->variablesBase, variableLimit).
  1.1165 + */
  1.1166 +UBool TransliteratorParser::checkVariableRange(UChar32 ch) const {
  1.1167 +    return !(ch >= curData->variablesBase && ch < variableLimit);
  1.1168 +}
  1.1169 +
  1.1170 +/**
  1.1171 + * Set the maximum backup to 'backup', in response to a pragma
  1.1172 + * statement.  Currently a stub: the argument is ignored and no state changes.
  1.1173 + */
  1.1174 +void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) {
  1.1175 +    //TODO Finish
  1.1176 +}
  1.1177 +
  1.1178 +/**
  1.1179 + * Begin normalizing all rules using the given mode, in response
  1.1180 + * to a pragma statement.  Currently a stub: the argument is ignored and no state changes.
  1.1181 + */
  1.1182 +void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) {
  1.1183 +    //TODO Finish
  1.1184 +}
  1.1185 +
  1.1186 +static const UChar PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use " -- prefix tested by resemblesPragma()
  1.1187 +
  1.1188 +static const UChar PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;" -- presumably each '#' yields one integer in parsePattern's output array; confirm in ICU_Utility::parsePattern
  1.1189 +
  1.1190 +static const UChar PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;" -- pattern for the 'use maximum backup' pragma
  1.1191 +
  1.1192 +static const UChar PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;" -- pattern for the 'use nfd rules' pragma
  1.1193 +
  1.1194 +static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;" -- pattern for the 'use nfc rules' pragma
  1.1195 +
  1.1196 +/**
  1.1197 + * Return true if the given rule looks like a pragma.
  1.1198 + * @param pos offset to the first non-whitespace character
  1.1199 + * of the rule.
  1.1200 + * @param limit offset just past the last character of the rule.
  1.1201 + */
  1.1202 +UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) {
  1.1203 +    // Must start with /use\s/i; the 4 characters of PRAGMA_USE are the pattern, and a result >= 0 counts as a match
  1.1204 +    return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_USE, 4), NULL) >= 0;
  1.1205 +}
  1.1206 +
  1.1207 +/**
  1.1208 + * Parse a pragma.  This method assumes resemblesPragma() has
  1.1209 + * already returned true.  Each known pragma pattern is tried in turn.
  1.1210 + * @param pos offset to the first non-whitespace character
  1.1211 + * of the rule.
  1.1212 + * @param limit offset just past the last character of the rule.
  1.1213 + * @return the position index after the final ';' of the pragma,
  1.1214 + * or -1 on failure.  status may also be set by setVariableRange().
  1.1215 + */
  1.1216 +int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
  1.1217 +    int32_t array[2]; // receives the integers parsed from a pattern
  1.1218 +    
  1.1219 +    // resemblesPragma() has already returned true, so we
  1.1220 +    // know that pos points to /use\s/i; we can skip 4 characters
  1.1221 +    // immediately
  1.1222 +    pos += 4;
  1.1223 +    
  1.1224 +    // Here are the pragmas we recognize:
  1.1225 +    // use variable range 0xE000 0xEFFF;
  1.1226 +    // use maximum backup 16;
  1.1227 +    // use nfd rules;
  1.1228 +    // use nfc rules;
  1.1229 +    int32_t p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_VARIABLE_RANGE, -1), array);
  1.1230 +    if (p >= 0) {
  1.1231 +        setVariableRange(array[0], array[1], status); // a bad range sets status; p is still returned
  1.1232 +        return p;
  1.1233 +    }
  1.1234 +    
  1.1235 +    p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_MAXIMUM_BACKUP, -1), array);
  1.1236 +    if (p >= 0) {
  1.1237 +        pragmaMaximumBackup(array[0]);
  1.1238 +        return p;
  1.1239 +    }
  1.1240 +    
  1.1241 +    p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFD_RULES, -1), NULL);
  1.1242 +    if (p >= 0) {
  1.1243 +        pragmaNormalizeRules(UNORM_NFD);
  1.1244 +        return p;
  1.1245 +    }
  1.1246 +    
  1.1247 +    p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFC_RULES, -1), NULL);
  1.1248 +    if (p >= 0) {
  1.1249 +        pragmaNormalizeRules(UNORM_NFC);
  1.1250 +        return p;
  1.1251 +    }
  1.1252 +    
  1.1253 +    // Syntax error: unable to parse pragma
  1.1254 +    return -1;
  1.1255 +}
  1.1256 +
  1.1257 +/**
  1.1258 + * MAIN PARSER.  Parse the next rule in the given rule string, starting
  1.1259 + * at pos.  Return the index after the last character parsed.  Do not
  1.1260 + * parse characters at or after limit.
  1.1261 + *
  1.1262 + * Important:  The character at pos must be a non-whitespace character
  1.1263 + * that is not the comment character.
  1.1264 + *
  1.1265 + * This method handles quoting, escaping, and whitespace removal.  It
  1.1266 + * parses the end-of-rule character.  It recognizes context and cursor
  1.1267 + * indicators.  Once it does a lexical breakdown of the rule at pos, it
  1.1268 + * creates a rule object and adds it to our rule list.
  1.1269 + */
  1.1270 +int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
  1.1271 +    // Locate the left side, operator, and right side
  1.1272 +    int32_t start = pos;
  1.1273 +    UChar op = 0;
  1.1274 +    int32_t i;
  1.1275 +
  1.1276 +    // Set up segments data
  1.1277 +    segmentStandins.truncate(0);
  1.1278 +    segmentObjects.removeAllElements();
  1.1279 +
  1.1280 +    // Use pointers to automatics to make swapping possible.
  1.1281 +    RuleHalf _left(*this), _right(*this);
  1.1282 +    RuleHalf* left = &_left;
  1.1283 +    RuleHalf* right = &_right;
  1.1284 +
  1.1285 +    undefinedVariableName.remove();
  1.1286 +    pos = left->parse(rule, pos, limit, status);
  1.1287 +    if (U_FAILURE(status)) {
  1.1288 +        return start;
  1.1289 +    }
  1.1290 +
  1.1291 +    if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
  1.1292 +        return syntaxError(U_MISSING_OPERATOR, rule, start, status);
  1.1293 +    }
  1.1294 +    ++pos;
  1.1295 +
  1.1296 +    // Found an operator char.  Check for forward-reverse operator.
  1.1297 +    if (op == REVERSE_RULE_OP &&
  1.1298 +        (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
  1.1299 +        ++pos;
  1.1300 +        op = FWDREV_RULE_OP;
  1.1301 +    }
  1.1302 +
  1.1303 +    // Translate alternate op characters.
  1.1304 +    switch (op) {
  1.1305 +    case ALT_FORWARD_RULE_OP:
  1.1306 +        op = FORWARD_RULE_OP;
  1.1307 +        break;
  1.1308 +    case ALT_REVERSE_RULE_OP:
  1.1309 +        op = REVERSE_RULE_OP;
  1.1310 +        break;
  1.1311 +    case ALT_FWDREV_RULE_OP:
  1.1312 +        op = FWDREV_RULE_OP;
  1.1313 +        break;
  1.1314 +    }
  1.1315 +
  1.1316 +    pos = right->parse(rule, pos, limit, status);
  1.1317 +    if (U_FAILURE(status)) {
  1.1318 +        return start;
  1.1319 +    }
  1.1320 +
  1.1321 +    if (pos < limit) {
  1.1322 +        if (rule.charAt(--pos) == END_OF_RULE) {
  1.1323 +            ++pos;
  1.1324 +        } else {
  1.1325 +            // RuleHalf parser must have terminated at an operator
  1.1326 +            return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);
  1.1327 +        }
  1.1328 +    }
  1.1329 +
  1.1330 +    if (op == VARIABLE_DEF_OP) {
  1.1331 +        // LHS is the name.  RHS is a single character, either a literal
  1.1332 +        // or a set (already parsed).  If RHS is longer than one
  1.1333 +        // character, it is either a multi-character string, or multiple
  1.1334 +        // sets, or a mixture of chars and sets -- syntax error.
  1.1335 +
  1.1336 +        // We expect to see a single undefined variable (the one being
  1.1337 +        // defined).
  1.1338 +        if (undefinedVariableName.length() == 0) {
  1.1339 +            // "Missing '$' or duplicate definition"
  1.1340 +            return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status);
  1.1341 +        }
  1.1342 +        if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) {
  1.1343 +            // "Malformed LHS"
  1.1344 +            return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status);
  1.1345 +        }
  1.1346 +        if (left->anchorStart || left->anchorEnd ||
  1.1347 +            right->anchorStart || right->anchorEnd) {
  1.1348 +            return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status);
  1.1349 +        } 
  1.1350 +        // We allow anything on the right, including an empty string.
  1.1351 +        UnicodeString* value = new UnicodeString(right->text);
  1.1352 +        // NULL pointer check
  1.1353 +        if (value == NULL) {
  1.1354 +            return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
  1.1355 +        }
  1.1356 +        variableNames.put(undefinedVariableName, value, status);
  1.1357 +        ++variableLimit;
  1.1358 +        return pos;
  1.1359 +    }
  1.1360 +
  1.1361 +    // If this is not a variable definition rule, we shouldn't have
  1.1362 +    // any undefined variable names.
  1.1363 +    if (undefinedVariableName.length() != 0) {
  1.1364 +        return syntaxError(// "Undefined variable $" + undefinedVariableName,
  1.1365 +                    U_UNDEFINED_VARIABLE,
  1.1366 +                    rule, start, status);
  1.1367 +    }
  1.1368 +
  1.1369 +    // Verify segments
  1.1370 +    if (segmentStandins.length() > segmentObjects.size()) {
  1.1371 +        syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status);
  1.1372 +    }
  1.1373 +    for (i=0; i<segmentStandins.length(); ++i) {
  1.1374 +        if (segmentStandins.charAt(i) == 0) {
  1.1375 +            syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen
  1.1376 +        }
  1.1377 +    }
  1.1378 +    for (i=0; i<segmentObjects.size(); ++i) {
  1.1379 +        if (segmentObjects.elementAt(i) == NULL) {
  1.1380 +            syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen
  1.1381 +        }
  1.1382 +    }
  1.1383 +    
  1.1384 +    // If the direction we want doesn't match the rule
  1.1385 +    // direction, do nothing.
  1.1386 +    if (op != FWDREV_RULE_OP &&
  1.1387 +        ((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) {
  1.1388 +        return pos;
  1.1389 +    }
  1.1390 +
  1.1391 +    // Transform the rule into a forward rule by swapping the
  1.1392 +    // sides if necessary.
  1.1393 +    if (direction == UTRANS_REVERSE) {
  1.1394 +        left = &_right;
  1.1395 +        right = &_left;
  1.1396 +    }
  1.1397 +
  1.1398 +    // Remove non-applicable elements in forward-reverse
  1.1399 +    // rules.  Bidirectional rules ignore elements that do not
  1.1400 +    // apply.
  1.1401 +    if (op == FWDREV_RULE_OP) {
  1.1402 +        right->removeContext();
  1.1403 +        left->cursor = -1;
  1.1404 +        left->cursorOffset = 0;
  1.1405 +    }
  1.1406 +
  1.1407 +    // Normalize context
  1.1408 +    if (left->ante < 0) {
  1.1409 +        left->ante = 0;
  1.1410 +    }
  1.1411 +    if (left->post < 0) {
  1.1412 +        left->post = left->text.length();
  1.1413 +    }
  1.1414 +
  1.1415 +    // Context is only allowed on the input side.  Cursors are only
  1.1416 +    // allowed on the output side.  Segment delimiters can only appear
  1.1417 +    // on the left, and references on the right.  Cursor offset
  1.1418 +    // cannot appear without an explicit cursor.  Cursor offset
  1.1419 +    // cannot place the cursor outside the limits of the context.
  1.1420 +    // Anchors are only allowed on the input side.
  1.1421 +    if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
  1.1422 +        (right->cursorOffset != 0 && right->cursor < 0) ||
  1.1423 +        // - The following two checks were used to ensure that the
  1.1424 +        // - the cursor offset stayed within the ante- or postcontext.
  1.1425 +        // - However, with the addition of quantifiers, we have to
  1.1426 +        // - allow arbitrary cursor offsets and do runtime checking.
  1.1427 +        //(right->cursorOffset > (left->text.length() - left->post)) ||
  1.1428 +        //(-right->cursorOffset > left->ante) ||
  1.1429 +        right->anchorStart || right->anchorEnd ||
  1.1430 +        !left->isValidInput(*this) || !right->isValidOutput(*this) ||
  1.1431 +        left->ante > left->post) {
  1.1432 +
  1.1433 +        return syntaxError(U_MALFORMED_RULE, rule, start, status);
  1.1434 +    }
  1.1435 +
  1.1436 +    // Flatten segment objects vector to an array
  1.1437 +    UnicodeFunctor** segmentsArray = NULL;
  1.1438 +    if (segmentObjects.size() > 0) {
  1.1439 +        segmentsArray = (UnicodeFunctor **)uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor *));
  1.1440 +        // Null pointer check
  1.1441 +        if (segmentsArray == NULL) {
  1.1442 +            return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
  1.1443 +        }
  1.1444 +        segmentObjects.toArray((void**) segmentsArray);
  1.1445 +    }
  1.1446 +    TransliterationRule* temptr = new TransliterationRule(
  1.1447 +            left->text, left->ante, left->post,
  1.1448 +            right->text, right->cursor, right->cursorOffset,
  1.1449 +            segmentsArray,
  1.1450 +            segmentObjects.size(),
  1.1451 +            left->anchorStart, left->anchorEnd,
  1.1452 +            curData,
  1.1453 +            status);
  1.1454 +    //Null pointer check
  1.1455 +    if (temptr == NULL) {
  1.1456 +        uprv_free(segmentsArray);
  1.1457 +        return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
  1.1458 +    }
  1.1459 +
  1.1460 +    curData->ruleSet.addRule(temptr, status);
  1.1461 +
  1.1462 +    return pos;
  1.1463 +}
  1.1464 +
  1.1465 +/**
  1.1466 + * Called by main parser upon syntax error.  Search the rule string
  1.1467 + * for the probable end of the rule.  Of course, if the error is that
  1.1468 + * the end of rule marker is missing, then the rule end will not be found.
  1.1469 + * In any case the rule start will be correctly reported.
  1.1470 + * @param msg error description
  1.1471 + * @param rule pattern string
  1.1472 + * @param start position of first character of current rule
  1.1473 + */
  1.1474 +int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,
  1.1475 +                                          const UnicodeString& rule,
  1.1476 +                                          int32_t pos,
  1.1477 +                                          UErrorCode& status)
  1.1478 +{
  1.1479 +    parseError.offset = pos;
  1.1480 +    parseError.line = 0 ; /* we are not using line numbers */
  1.1481 +    
  1.1482 +    // for pre-context
  1.1483 +    const int32_t LEN = U_PARSE_CONTEXT_LEN - 1;
  1.1484 +    int32_t start = uprv_max(pos - LEN, 0);
  1.1485 +    int32_t stop  = pos;
  1.1486 +    
  1.1487 +    rule.extract(start,stop-start,parseError.preContext);
  1.1488 +    //null terminate the buffer
  1.1489 +    parseError.preContext[stop-start] = 0;
  1.1490 +    
  1.1491 +    //for post-context
  1.1492 +    start = pos;
  1.1493 +    stop  = uprv_min(pos + LEN, rule.length());
  1.1494 +    
  1.1495 +    rule.extract(start,stop-start,parseError.postContext);
  1.1496 +    //null terminate the buffer
  1.1497 +    parseError.postContext[stop-start]= 0;
  1.1498 +
  1.1499 +    status = (UErrorCode)parseErrorCode;
  1.1500 +    return pos;
  1.1501 +
  1.1502 +}
  1.1503 +
  1.1504 +/**
  1.1505 + * Parse a UnicodeSet out, store it, and return the stand-in character
  1.1506 + * used to represent it.
  1.1507 + */
  1.1508 +UChar TransliteratorParser::parseSet(const UnicodeString& rule,
  1.1509 +                                          ParsePosition& pos,
  1.1510 +                                          UErrorCode& status) {
  1.1511 +    UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status);
  1.1512 +    // Null pointer check
  1.1513 +    if (set == NULL) {
  1.1514 +        status = U_MEMORY_ALLOCATION_ERROR;
  1.1515 +        return (UChar)0x0000; // Return empty character with error.
  1.1516 +    }
  1.1517 +    set->compact();
  1.1518 +    return generateStandInFor(set, status);
  1.1519 +}
  1.1520 +
  1.1521 +/**
  1.1522 + * Generate and return a stand-in for a new UnicodeFunctor.  Store
  1.1523 + * the matcher (adopt it).
  1.1524 + */
  1.1525 +UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) {
  1.1526 +    // assert(obj != null);
  1.1527 +    
  1.1528 +    // Look up previous stand-in, if any.  This is a short list
  1.1529 +    // (typical n is 0, 1, or 2); linear search is optimal.
  1.1530 +    for (int32_t i=0; i<variablesVector.size(); ++i) {
  1.1531 +        if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison
  1.1532 +            return (UChar) (curData->variablesBase + i);
  1.1533 +        }
  1.1534 +    }
  1.1535 +    
  1.1536 +    if (variableNext >= variableLimit) {
  1.1537 +        delete adopted;
  1.1538 +        status = U_VARIABLE_RANGE_EXHAUSTED;
  1.1539 +        return 0;
  1.1540 +    }
  1.1541 +    variablesVector.addElement(adopted, status);
  1.1542 +    return variableNext++;
  1.1543 +}
  1.1544 +
  1.1545 +/**
  1.1546 + * Return the standin for segment seg (1-based).
  1.1547 + */
  1.1548 +UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) {
  1.1549 +    // Special character used to indicate an empty spot
  1.1550 +    UChar empty = curData->variablesBase - 1;
  1.1551 +    while (segmentStandins.length() < seg) {
  1.1552 +        segmentStandins.append(empty);
  1.1553 +    }
  1.1554 +    UChar c = segmentStandins.charAt(seg-1);
  1.1555 +    if (c == empty) {
  1.1556 +        if (variableNext >= variableLimit) {
  1.1557 +            status = U_VARIABLE_RANGE_EXHAUSTED;
  1.1558 +            return 0;
  1.1559 +        }
  1.1560 +        c = variableNext++;
  1.1561 +        // Set a placeholder in the master variables vector that will be
  1.1562 +        // filled in later by setSegmentObject().  We know that we will get
  1.1563 +        // called first because setSegmentObject() will call us.
  1.1564 +        variablesVector.addElement((void*) NULL, status);
  1.1565 +        segmentStandins.setCharAt(seg-1, c);
  1.1566 +    }
  1.1567 +    return c;
  1.1568 +}
  1.1569 +
  1.1570 +/**
  1.1571 + * Set the object for segment seg (1-based).
  1.1572 + */
  1.1573 +void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) {
  1.1574 +    // Since we call parseSection() recursively, nested
  1.1575 +    // segments will result in segment i+1 getting parsed
  1.1576 +    // and stored before segment i; be careful with the
  1.1577 +    // vector handling here.
  1.1578 +    if (segmentObjects.size() < seg) {
  1.1579 +        segmentObjects.setSize(seg, status);
  1.1580 +    }
  1.1581 +    int32_t index = getSegmentStandin(seg, status) - curData->variablesBase;
  1.1582 +    if (segmentObjects.elementAt(seg-1) != NULL ||
  1.1583 +        variablesVector.elementAt(index) != NULL) {
  1.1584 +        // should never happen
  1.1585 +        status = U_INTERNAL_TRANSLITERATOR_ERROR;
  1.1586 +        return;
  1.1587 +    }
  1.1588 +    segmentObjects.setElementAt(adopted, seg-1);
  1.1589 +    variablesVector.setElementAt(adopted, index);
  1.1590 +}
  1.1591 +
  1.1592 +/**
  1.1593 + * Return the stand-in for the dot set.  It is allocated the first
  1.1594 + * time and reused thereafter.
  1.1595 + */
  1.1596 +UChar TransliteratorParser::getDotStandIn(UErrorCode& status) {
  1.1597 +    if (dotStandIn == (UChar) -1) {
  1.1598 +        UnicodeSet* tempus = new UnicodeSet(UnicodeString(TRUE, DOT_SET, -1), status);
  1.1599 +        // Null pointer check.
  1.1600 +        if (tempus == NULL) {
  1.1601 +            status = U_MEMORY_ALLOCATION_ERROR;
  1.1602 +            return (UChar)0x0000;
  1.1603 +        }
  1.1604 +        dotStandIn = generateStandInFor(tempus, status);
  1.1605 +    }
  1.1606 +    return dotStandIn;
  1.1607 +}
  1.1608 +
  1.1609 +/**
  1.1610 + * Append the value of the given variable name to the given
  1.1611 + * UnicodeString.
  1.1612 + */
  1.1613 +void TransliteratorParser::appendVariableDef(const UnicodeString& name,
  1.1614 +                                                  UnicodeString& buf,
  1.1615 +                                                  UErrorCode& status) {
  1.1616 +    const UnicodeString* s = (const UnicodeString*) variableNames.get(name);
  1.1617 +    if (s == NULL) {
  1.1618 +        // We allow one undefined variable so that variable definition
  1.1619 +        // statements work.  For the first undefined variable we return
  1.1620 +        // the special placeholder variableLimit-1, and save the variable
  1.1621 +        // name.
  1.1622 +        if (undefinedVariableName.length() == 0) {
  1.1623 +            undefinedVariableName = name;
  1.1624 +            if (variableNext >= variableLimit) {
  1.1625 +                // throw new RuntimeException("Private use variables exhausted");
  1.1626 +                status = U_ILLEGAL_ARGUMENT_ERROR;
  1.1627 +                return;
  1.1628 +            }
  1.1629 +            buf.append((UChar) --variableLimit);
  1.1630 +        } else {
  1.1631 +            //throw new IllegalArgumentException("Undefined variable $"
  1.1632 +            //                                   + name);
  1.1633 +            status = U_ILLEGAL_ARGUMENT_ERROR;
  1.1634 +            return;
  1.1635 +        }
  1.1636 +    } else {
  1.1637 +        buf.append(*s);
  1.1638 +    }
  1.1639 +}
  1.1640 +
  1.1641 +/**
  1.1642 + * Glue method to get around access restrictions in C++.
  1.1643 + */
  1.1644 +/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
  1.1645 +    return Transliterator::createBasicInstance(id, canonID);
  1.1646 +}*/
  1.1647 +
  1.1648 +U_NAMESPACE_END
  1.1649 +
  1.1650 +U_CAPI int32_t
  1.1651 +utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) {
  1.1652 +    U_NAMESPACE_USE
  1.1653 +
  1.1654 +    //const UChar *sourceStart = source;
  1.1655 +    const UChar *targetStart = target;
  1.1656 +    const UChar *sourceLimit = source+sourceLen;
  1.1657 +    UChar *targetLimit = target+sourceLen;
  1.1658 +    UChar32 c = 0;
  1.1659 +    UBool quoted = FALSE;
  1.1660 +    int32_t index;
  1.1661 +
  1.1662 +    uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR);
  1.1663 +
  1.1664 +    /* read the rules into the buffer */
  1.1665 +    while (source < sourceLimit)
  1.1666 +    {
  1.1667 +        index=0;
  1.1668 +        U16_NEXT_UNSAFE(source, index, c);
  1.1669 +        source+=index;
  1.1670 +        if(c == QUOTE) {
  1.1671 +            quoted = (UBool)!quoted;
  1.1672 +        }
  1.1673 +        else if (!quoted) {
  1.1674 +            if (c == RULE_COMMENT_CHAR) {
  1.1675 +                /* skip comments and all preceding spaces */
  1.1676 +                while (targetStart < target && *(target - 1) == 0x0020) {
  1.1677 +                    target--;
  1.1678 +                }
  1.1679 +                do {
  1.1680 +                    c = *(source++);
  1.1681 +                }
  1.1682 +                while (c != CR && c != LF);
  1.1683 +            }
  1.1684 +            else if (c == ESCAPE) {
  1.1685 +                UChar32   c2 = *source;
  1.1686 +                if (c2 == CR || c2 == LF) {
  1.1687 +                    /* A backslash at the end of a line. */
  1.1688 +                    /* Since we're stripping lines, ignore the backslash. */
  1.1689 +                    source++;
  1.1690 +                    continue;
  1.1691 +                }
  1.1692 +                if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn't unescaped. */
  1.1693 +                    int32_t escapeOffset = 0;
  1.1694 +                    UnicodeString escapedStr(source, 5);
  1.1695 +                    c2 = escapedStr.unescapeAt(escapeOffset);
  1.1696 +
  1.1697 +                    if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0)
  1.1698 +                    {
  1.1699 +                        *status = U_PARSE_ERROR;
  1.1700 +                        return 0;
  1.1701 +                    }
  1.1702 +                    if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
  1.1703 +                        /* It was escaped for a reason. Write what it was suppose to be. */
  1.1704 +                        source+=5;
  1.1705 +                        c = c2;
  1.1706 +                    }
  1.1707 +                }
  1.1708 +                else if (c2 == QUOTE) {
  1.1709 +                    /* \' seen. Make sure we don't do anything when we see it again. */
  1.1710 +                    quoted = (UBool)!quoted;
  1.1711 +                }
  1.1712 +            }
  1.1713 +        }
  1.1714 +        if (c == CR || c == LF)
  1.1715 +        {
  1.1716 +            /* ignore spaces carriage returns, and all leading spaces on the next line.
  1.1717 +            * and line feed unless in the form \uXXXX
  1.1718 +            */
  1.1719 +            quoted = FALSE;
  1.1720 +            while (source < sourceLimit) {
  1.1721 +                c = *(source);
  1.1722 +                if (c != CR && c != LF && c != 0x0020) {
  1.1723 +                    break;
  1.1724 +                }
  1.1725 +                source++;
  1.1726 +            }
  1.1727 +            continue;
  1.1728 +        }
  1.1729 +
  1.1730 +        /* Append UChar * after dissembling if c > 0xffff*/
  1.1731 +        index=0;
  1.1732 +        U16_APPEND_UNSAFE(target, index, c);
  1.1733 +        target+=index;
  1.1734 +    }
  1.1735 +    if (target < targetLimit) {
  1.1736 +        *target = 0;
  1.1737 +    }
  1.1738 +    return (int32_t)(target-targetStart);
  1.1739 +}
  1.1740 +
  1.1741 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial