intl/icu/source/i18n/rbt_pars.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 1999-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 11/17/99 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_TRANSLITERATION
michael@0 14
michael@0 15 #include "unicode/uobject.h"
michael@0 16 #include "unicode/parseerr.h"
michael@0 17 #include "unicode/parsepos.h"
michael@0 18 #include "unicode/putil.h"
michael@0 19 #include "unicode/uchar.h"
michael@0 20 #include "unicode/ustring.h"
michael@0 21 #include "unicode/uniset.h"
michael@0 22 #include "unicode/utf16.h"
michael@0 23 #include "cstring.h"
michael@0 24 #include "funcrepl.h"
michael@0 25 #include "hash.h"
michael@0 26 #include "quant.h"
michael@0 27 #include "rbt.h"
michael@0 28 #include "rbt_data.h"
michael@0 29 #include "rbt_pars.h"
michael@0 30 #include "rbt_rule.h"
michael@0 31 #include "strmatch.h"
michael@0 32 #include "strrepl.h"
michael@0 33 #include "unicode/symtable.h"
michael@0 34 #include "tridpars.h"
michael@0 35 #include "uvector.h"
michael@0 36 #include "hash.h"
michael@0 37 #include "patternprops.h"
michael@0 38 #include "util.h"
michael@0 39 #include "cmemory.h"
michael@0 40 #include "uprops.h"
michael@0 41 #include "putilimp.h"
michael@0 42
michael@0 43 // Operators: UTF-16 code unit values of the rule operator characters
michael@0 44 #define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/
michael@0 45 #define FORWARD_RULE_OP ((UChar)0x003E) /*>*/
michael@0 46 #define REVERSE_RULE_OP ((UChar)0x003C) /*<*/
michael@0 47 #define FWDREV_RULE_OP ((UChar)0x007E) /*~*/ // internal representation of the "<>" operator
michael@0 48
michael@0 49 // Other special characters: quoting, escaping, rule termination and comments
michael@0 50 #define QUOTE ((UChar)0x0027) /*'*/
michael@0 51 #define ESCAPE ((UChar)0x005C) /* backslash */
michael@0 52 #define END_OF_RULE ((UChar)0x003B) /*;*/
michael@0 53 #define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/
michael@0 54
// Segment, context, cursor, anchor and quantifier characters
michael@0 55 #define SEGMENT_OPEN ((UChar)0x0028) /*(*/
michael@0 56 #define SEGMENT_CLOSE ((UChar)0x0029) /*)*/
michael@0 57 #define CONTEXT_ANTE ((UChar)0x007B) /*{*/
michael@0 58 #define CONTEXT_POST ((UChar)0x007D) /*}*/
michael@0 59 #define CURSOR_POS ((UChar)0x007C) /*|*/
michael@0 60 #define CURSOR_OFFSET ((UChar)0x0040) /*@*/
michael@0 61 #define ANCHOR_START ((UChar)0x005E) /*^*/
michael@0 62 #define KLEENE_STAR ((UChar)0x002A) /* '*' */
michael@0 63 #define ONE_OR_MORE ((UChar)0x002B) /*+*/
michael@0 64 #define ZERO_OR_ONE ((UChar)0x003F) /*?*/
michael@0 65
michael@0 66 #define DOT ((UChar)46) /*.*/ // decimal 46 == 0x002E
michael@0 67
// NUL-terminated UTF-16 text of a UnicodeSet pattern, spelled out as decimal
// code units. The "\r" and "\n" are stored as two-unit backslash escapes
// (92,114 and 92,110), not as raw CR/LF characters.
// NOTE(review): presumably the set that the DOT operator stands for -- the
// use site is not in this part of the file; confirm.
michael@0 68 static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
michael@0 69 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,
michael@0 70 108, 58, 93, 92, 114, 92, 110, 36, 93, 0
michael@0 71 };
michael@0 72
michael@0 73 // A function is denoted &Source-Target/Variant(text)
michael@0 74 #define FUNCTION ((UChar)38) /*&*/ // decimal 38 == 0x0026
michael@0 75
michael@0 76 // Aliases for some of the syntax characters. These are provided so
michael@0 77 // transliteration rules can be expressed in XML without clashing with
michael@0 78 // XML syntax characters '<', '>', and '&'.
michael@0 79 #define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow, alias for '<'
michael@0 80 #define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow, alias for '>'
michael@0 81 #define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow, alias for "<>"
michael@0 82 #define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta), alias for '&'
michael@0 83
// Each ILLEGAL_* array is a NUL-terminated UTF-16 string given as decimal
// code units; it is wrapped in a UnicodeString and passed as the `illegal`
// argument of RuleHalf::parseSection().
michael@0 84 // Special characters disallowed at the top level
michael@0 85 static const UChar ILLEGAL_TOP[] = {41,0}; // ")"
michael@0 86
michael@0 87 // Special characters disallowed within a segment
michael@0 88 static const UChar ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}|@"
michael@0 89
michael@0 90 // Special characters disallowed within a function argument
michael@0 91 static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@"
michael@0 92
michael@0 93 // By definition, the ANCHOR_END special character is a
michael@0 94 // trailing SymbolTable.SYMBOL_REF character.
michael@0 95 // private static final char ANCHOR_END = '$';
michael@0 96
// NUL-terminated list of operator characters, including the arrow aliases.
// Note that FWDREV_RULE_OP ('~', the internal form of "<>") is not listed.
michael@0 97 static const UChar gOPERATORS[] = { // "=><" plus the aliases U+2192, U+2190, U+2194
michael@0 98 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
michael@0 99 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
michael@0 100 0
michael@0 101 };
michael@0 102
// NUL-terminated list of the characters that end one half of a rule: every
// entry of gOPERATORS plus END_OF_RULE. RuleHalf::parseSection() searches it
// with u_strchr() to decide when to stop.
michael@0 103 static const UChar HALF_ENDERS[] = { // "=><" plus the aliases U+2192, U+2190, U+2194, then ";"
michael@0 104 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
michael@0 105 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
michael@0 106 END_OF_RULE,
michael@0 107 0
michael@0 108 };
michael@0 109
michael@0 110 // These are also used in Transliterator::toRules()
// ID_TOKEN is NOT NUL-terminated; its length is carried by ID_TOKEN_LEN.
michael@0 111 static const int32_t ID_TOKEN_LEN = 2;
michael@0 112 static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
michael@0 113
michael@0 114 /*
michael@0 115 commented out until we do real ::BEGIN/::END functionality
michael@0 116 static const int32_t BEGIN_TOKEN_LEN = 5;
michael@0 117 static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'
michael@0 118
michael@0 119 static const int32_t END_TOKEN_LEN = 3;
michael@0 120 static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
michael@0 121 */
michael@0 122
michael@0 123 U_NAMESPACE_BEGIN
michael@0 124
michael@0 125 //----------------------------------------------------------------------
michael@0 126 // BEGIN ParseData
michael@0 127 //----------------------------------------------------------------------
michael@0 128
michael@0 129 /**
michael@0 130 * This class implements the SymbolTable interface. It is used
michael@0 131 * during parsing to give UnicodeSet access to variables that
michael@0 132 * have been defined so far. Note that it uses variablesVector,
michael@0 133 * _not_ data.setVariables.
michael@0 134 */
michael@0 135 class ParseData : public UMemory, public SymbolTable {
michael@0 136 public:
michael@0 137 const TransliterationRuleData* data; // alias
michael@0 138
michael@0 139 const UVector* variablesVector; // alias
michael@0 140
michael@0 141 const Hashtable* variableNames; // alias
michael@0 142
michael@0 143 ParseData(const TransliterationRuleData* data = 0,
michael@0 144 const UVector* variablesVector = 0,
michael@0 145 const Hashtable* variableNames = 0);
michael@0 146
michael@0 147 virtual ~ParseData();
michael@0 148
michael@0 149 virtual const UnicodeString* lookup(const UnicodeString& s) const;
michael@0 150
michael@0 151 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
michael@0 152
michael@0 153 virtual UnicodeString parseReference(const UnicodeString& text,
michael@0 154 ParsePosition& pos, int32_t limit) const;
michael@0 155 /**
michael@0 156 * Return true if the given character is a matcher standin or a plain
michael@0 157 * character (non standin).
michael@0 158 */
michael@0 159 UBool isMatcher(UChar32 ch);
michael@0 160
michael@0 161 /**
michael@0 162 * Return true if the given character is a replacer standin or a plain
michael@0 163 * character (non standin).
michael@0 164 */
michael@0 165 UBool isReplacer(UChar32 ch);
michael@0 166
michael@0 167 private:
michael@0 168 ParseData(const ParseData &other); // forbid copying of this class
michael@0 169 ParseData &operator=(const ParseData &other); // forbid copying of this class
michael@0 170 };
michael@0 171
michael@0 172 ParseData::ParseData(const TransliterationRuleData* d,
michael@0 173 const UVector* sets,
michael@0 174 const Hashtable* vNames) :
michael@0 175 data(d), variablesVector(sets), variableNames(vNames) {}
michael@0 176
michael@0 177 ParseData::~ParseData() {}
michael@0 178
michael@0 179 /**
michael@0 180 * Implement SymbolTable API.
michael@0 181 */
michael@0 182 const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
michael@0 183 return (const UnicodeString*) variableNames->get(name);
michael@0 184 }
michael@0 185
michael@0 186 /**
michael@0 187 * Implement SymbolTable API.
michael@0 188 */
michael@0 189 const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const {
michael@0 190 // Note that we cannot use data.lookupSet() because the
michael@0 191 // set array has not been constructed yet.
michael@0 192 const UnicodeFunctor* set = NULL;
michael@0 193 int32_t i = ch - data->variablesBase;
michael@0 194 if (i >= 0 && i < variablesVector->size()) {
michael@0 195 int32_t i = ch - data->variablesBase;
michael@0 196 set = (i < variablesVector->size()) ?
michael@0 197 (UnicodeFunctor*) variablesVector->elementAt(i) : 0;
michael@0 198 }
michael@0 199 return set;
michael@0 200 }
michael@0 201
michael@0 202 /**
michael@0 203 * Implement SymbolTable API. Parse out a symbol reference
michael@0 204 * name.
michael@0 205 */
michael@0 206 UnicodeString ParseData::parseReference(const UnicodeString& text,
michael@0 207 ParsePosition& pos, int32_t limit) const {
michael@0 208 int32_t start = pos.getIndex();
michael@0 209 int32_t i = start;
michael@0 210 UnicodeString result;
michael@0 211 while (i < limit) {
michael@0 212 UChar c = text.charAt(i);
michael@0 213 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
michael@0 214 break;
michael@0 215 }
michael@0 216 ++i;
michael@0 217 }
michael@0 218 if (i == start) { // No valid name chars
michael@0 219 return result; // Indicate failure with empty string
michael@0 220 }
michael@0 221 pos.setIndex(i);
michael@0 222 text.extractBetween(start, i, result);
michael@0 223 return result;
michael@0 224 }
michael@0 225
michael@0 226 UBool ParseData::isMatcher(UChar32 ch) {
michael@0 227 // Note that we cannot use data.lookup() because the
michael@0 228 // set array has not been constructed yet.
michael@0 229 int32_t i = ch - data->variablesBase;
michael@0 230 if (i >= 0 && i < variablesVector->size()) {
michael@0 231 UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i);
michael@0 232 return f != NULL && f->toMatcher() != NULL;
michael@0 233 }
michael@0 234 return TRUE;
michael@0 235 }
michael@0 236
michael@0 237 /**
michael@0 238 * Return true if the given character is a replacer standin or a plain
michael@0 239 * character (non standin).
michael@0 240 */
michael@0 241 UBool ParseData::isReplacer(UChar32 ch) {
michael@0 242 // Note that we cannot use data.lookup() because the
michael@0 243 // set array has not been constructed yet.
michael@0 244 int i = ch - data->variablesBase;
michael@0 245 if (i >= 0 && i < variablesVector->size()) {
michael@0 246 UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i);
michael@0 247 return f != NULL && f->toReplacer() != NULL;
michael@0 248 }
michael@0 249 return TRUE;
michael@0 250 }
michael@0 251
michael@0 252 //----------------------------------------------------------------------
michael@0 253 // BEGIN RuleHalf
michael@0 254 //----------------------------------------------------------------------
michael@0 255
michael@0 256 /**
michael@0 257 * A class representing one side of a rule. This class knows how to
michael@0 258 * parse half of a rule. It is tightly coupled to the method
michael@0 259 * RuleBasedTransliterator.Parser.parseRule().
michael@0 260 */
michael@0 261 class RuleHalf : public UMemory {
michael@0 262
michael@0 263 public:
michael@0 264
michael@0 265 UnicodeString text;
michael@0 266
michael@0 267 int32_t cursor; // position of cursor in text
michael@0 268 int32_t ante; // position of ante context marker '{' in text
michael@0 269 int32_t post; // position of post context marker '}' in text
michael@0 270
michael@0 271 // Record the offset to the cursor either to the left or to the
michael@0 272 // right of the key. This is indicated by characters on the output
michael@0 273 // side that allow the cursor to be positioned arbitrarily within
michael@0 274 // the matching text. For example, abc{def} > | @@@ xyz; changes
michael@0 275 // def to xyz and moves the cursor to before abc. Offset characters
michael@0 276 // must be at the start or end, and they cannot move the cursor past
michael@0 277 // the ante- or postcontext text. Placeholders are only valid in
michael@0 278 // output text. The length of the ante and post context is
michael@0 279 // determined at runtime, because of supplementals and quantifiers.
michael@0 280 int32_t cursorOffset; // only nonzero on output side
michael@0 281
michael@0 282 // Position of first CURSOR_OFFSET on _right_. This will be -1
michael@0 283 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
michael@0 284 int32_t cursorOffsetPos;
michael@0 285
michael@0 286 UBool anchorStart;
michael@0 287 UBool anchorEnd;
michael@0 288
michael@0 289 /**
michael@0 290 * The segment number from 1..n of the next '(' we see
michael@0 291 * during parsing; 1-based.
michael@0 292 */
michael@0 293 int32_t nextSegmentNumber;
michael@0 294
michael@0 295 TransliteratorParser& parser;
michael@0 296
michael@0 297 //--------------------------------------------------
michael@0 298 // Methods
michael@0 299
michael@0 300 RuleHalf(TransliteratorParser& parser);
michael@0 301 ~RuleHalf();
michael@0 302
michael@0 303 int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
michael@0 304
michael@0 305 int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
michael@0 306 UnicodeString& buf,
michael@0 307 const UnicodeString& illegal,
michael@0 308 UBool isSegment,
michael@0 309 UErrorCode& status);
michael@0 310
michael@0 311 /**
michael@0 312 * Remove context.
michael@0 313 */
michael@0 314 void removeContext();
michael@0 315
michael@0 316 /**
michael@0 317 * Return true if this half looks like valid output, that is, does not
michael@0 318 * contain quantifiers or other special input-only elements.
michael@0 319 */
michael@0 320 UBool isValidOutput(TransliteratorParser& parser);
michael@0 321
michael@0 322 /**
michael@0 323 * Return true if this half looks like valid input, that is, does not
michael@0 324 * contain functions or other special output-only elements.
michael@0 325 */
michael@0 326 UBool isValidInput(TransliteratorParser& parser);
michael@0 327
michael@0 328 int syntaxError(UErrorCode code,
michael@0 329 const UnicodeString& rule,
michael@0 330 int32_t start,
michael@0 331 UErrorCode& status) {
michael@0 332 return parser.syntaxError(code, rule, start, status);
michael@0 333 }
michael@0 334
michael@0 335 private:
michael@0 336 // Disallowed methods; no impl.
michael@0 337 RuleHalf(const RuleHalf&);
michael@0 338 RuleHalf& operator=(const RuleHalf&);
michael@0 339 };
michael@0 340
michael@0 341 RuleHalf::RuleHalf(TransliteratorParser& p) :
michael@0 342 parser(p)
michael@0 343 {
michael@0 344 cursor = -1;
michael@0 345 ante = -1;
michael@0 346 post = -1;
michael@0 347 cursorOffset = 0;
michael@0 348 cursorOffsetPos = 0;
michael@0 349 anchorStart = anchorEnd = FALSE;
michael@0 350 nextSegmentNumber = 1;
michael@0 351 }
michael@0 352
michael@0 353 RuleHalf::~RuleHalf() {
michael@0 354 }
michael@0 355
michael@0 356 /**
michael@0 357 * Parse one side of a rule, stopping at either the limit,
michael@0 358 * the END_OF_RULE character, or an operator.
michael@0 359 * @return the index after the terminating character, or
michael@0 360 * if limit was reached, limit
michael@0 361 */
michael@0 362 int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
michael@0 363 int32_t start = pos;
michael@0 364 text.truncate(0);
michael@0 365 pos = parseSection(rule, pos, limit, text, UnicodeString(TRUE, ILLEGAL_TOP, -1), FALSE, status);
michael@0 366
michael@0 367 if (cursorOffset > 0 && cursor != cursorOffsetPos) {
michael@0 368 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
michael@0 369 }
michael@0 370
michael@0 371 return pos;
michael@0 372 }
michael@0 373
michael@0 374 /**
michael@0 375 * Parse a section of one side of a rule, stopping at either
michael@0 376 * the limit, the END_OF_RULE character, an operator, or a
michael@0 377 * segment close character. This method parses both a
michael@0 378 * top-level rule half and a segment within such a rule half.
michael@0 379 * It calls itself recursively to parse segments and nested
michael@0 380 * segments.
michael@0 381 * @param buf buffer into which to accumulate the rule pattern
michael@0 382 * characters, either literal characters from the rule or
michael@0 383 * standins for UnicodeMatcher objects including segments.
michael@0 384 * @param illegal the set of special characters that is illegal during
michael@0 385 * this parse.
michael@0 386 * @param isSegment if true, then we've already seen a '(' and
michael@0 387 * pos on entry points right after it. Accumulate everything
michael@0 388 * up to the closing ')', put it in a segment matcher object,
michael@0 389 * generate a standin for it, and add the standin to buf. As
michael@0 390 * a side effect, update the segments vector with a reference
michael@0 391 * to the segment matcher. This works recursively for nested
michael@0 392 * segments. If isSegment is false, just accumulate
michael@0 393 * characters into buf.
michael@0 394 * @return the index after the terminating character, or
michael@0 395 * if limit was reached, limit
michael@0 396 */
michael@0 397 int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
michael@0 398 UnicodeString& buf,
michael@0 399 const UnicodeString& illegal,
michael@0 400 UBool isSegment, UErrorCode& status) {
michael@0 401 int32_t start = pos;
michael@0 402 ParsePosition pp;
michael@0 403 UnicodeString scratch;
michael@0 404 UBool done = FALSE;
michael@0 405 int32_t quoteStart = -1; // Most recent 'single quoted string'
michael@0 406 int32_t quoteLimit = -1;
michael@0 407 int32_t varStart = -1; // Most recent $variableReference
michael@0 408 int32_t varLimit = -1;
michael@0 409 int32_t bufStart = buf.length();
michael@0 410
michael@0 411 while (pos < limit && !done) {
michael@0 412 // Since all syntax characters are in the BMP, fetching
michael@0 413 // 16-bit code units suffices here.
michael@0 414 UChar c = rule.charAt(pos++);
michael@0 415 if (PatternProps::isWhiteSpace(c)) {
michael@0 416 // Ignore whitespace. Note that this is not Unicode
michael@0 417 // spaces, but Java spaces -- a subset, representing
michael@0 418 // whitespace likely to be seen in code.
michael@0 419 continue;
michael@0 420 }
michael@0 421 if (u_strchr(HALF_ENDERS, c) != NULL) {
michael@0 422 if (isSegment) {
michael@0 423 // Unclosed segment
michael@0 424 return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status);
michael@0 425 }
michael@0 426 break;
michael@0 427 }
michael@0 428 if (anchorEnd) {
michael@0 429 // Text after a presumed end anchor is a syntax err
michael@0 430 return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status);
michael@0 431 }
michael@0 432 if (UnicodeSet::resemblesPattern(rule, pos-1)) {
michael@0 433 pp.setIndex(pos-1); // Backup to opening '['
michael@0 434 buf.append(parser.parseSet(rule, pp, status));
michael@0 435 if (U_FAILURE(status)) {
michael@0 436 return syntaxError(U_MALFORMED_SET, rule, start, status);
michael@0 437 }
michael@0 438 pos = pp.getIndex();
michael@0 439 continue;
michael@0 440 }
michael@0 441 // Handle escapes
michael@0 442 if (c == ESCAPE) {
michael@0 443 if (pos == limit) {
michael@0 444 return syntaxError(U_TRAILING_BACKSLASH, rule, start, status);
michael@0 445 }
michael@0 446 UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\'
michael@0 447 if (escaped == (UChar32) -1) {
michael@0 448 return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status);
michael@0 449 }
michael@0 450 if (!parser.checkVariableRange(escaped)) {
michael@0 451 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
michael@0 452 }
michael@0 453 buf.append(escaped);
michael@0 454 continue;
michael@0 455 }
michael@0 456 // Handle quoted matter
michael@0 457 if (c == QUOTE) {
michael@0 458 int32_t iq = rule.indexOf(QUOTE, pos);
michael@0 459 if (iq == pos) {
michael@0 460 buf.append(c); // Parse [''] outside quotes as [']
michael@0 461 ++pos;
michael@0 462 } else {
michael@0 463 /* This loop picks up a run of quoted text of the
michael@0 464 * form 'aaaa' each time through. If this run
michael@0 465 * hasn't really ended ('aaaa''bbbb') then it keeps
michael@0 466 * looping, each time adding on a new run. When it
michael@0 467 * reaches the final quote it breaks.
michael@0 468 */
michael@0 469 quoteStart = buf.length();
michael@0 470 for (;;) {
michael@0 471 if (iq < 0) {
michael@0 472 return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status);
michael@0 473 }
michael@0 474 scratch.truncate(0);
michael@0 475 rule.extractBetween(pos, iq, scratch);
michael@0 476 buf.append(scratch);
michael@0 477 pos = iq+1;
michael@0 478 if (pos < limit && rule.charAt(pos) == QUOTE) {
michael@0 479 // Parse [''] inside quotes as [']
michael@0 480 iq = rule.indexOf(QUOTE, pos+1);
michael@0 481 // Continue looping
michael@0 482 } else {
michael@0 483 break;
michael@0 484 }
michael@0 485 }
michael@0 486 quoteLimit = buf.length();
michael@0 487
michael@0 488 for (iq=quoteStart; iq<quoteLimit; ++iq) {
michael@0 489 if (!parser.checkVariableRange(buf.charAt(iq))) {
michael@0 490 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
michael@0 491 }
michael@0 492 }
michael@0 493 }
michael@0 494 continue;
michael@0 495 }
michael@0 496
michael@0 497 if (!parser.checkVariableRange(c)) {
michael@0 498 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
michael@0 499 }
michael@0 500
michael@0 501 if (illegal.indexOf(c) >= 0) {
michael@0 502 syntaxError(U_ILLEGAL_CHARACTER, rule, start, status);
michael@0 503 }
michael@0 504
michael@0 505 switch (c) {
michael@0 506
michael@0 507 //------------------------------------------------------
michael@0 508 // Elements allowed within and out of segments
michael@0 509 //------------------------------------------------------
michael@0 510 case ANCHOR_START:
michael@0 511 if (buf.length() == 0 && !anchorStart) {
michael@0 512 anchorStart = TRUE;
michael@0 513 } else {
michael@0 514 return syntaxError(U_MISPLACED_ANCHOR_START,
michael@0 515 rule, start, status);
michael@0 516 }
michael@0 517 break;
michael@0 518 case SEGMENT_OPEN:
michael@0 519 {
michael@0 520 // bufSegStart is the offset in buf to the first
michael@0 521 // character of the segment we are parsing.
michael@0 522 int32_t bufSegStart = buf.length();
michael@0 523
michael@0 524 // Record segment number now, since nextSegmentNumber
michael@0 525 // will be incremented during the call to parseSection
michael@0 526 // if there are nested segments.
michael@0 527 int32_t segmentNumber = nextSegmentNumber++; // 1-based
michael@0 528
michael@0 529 // Parse the segment
michael@0 530 pos = parseSection(rule, pos, limit, buf, UnicodeString(TRUE, ILLEGAL_SEG, -1), TRUE, status);
michael@0 531
michael@0 532 // After parsing a segment, the relevant characters are
michael@0 533 // in buf, starting at offset bufSegStart. Extract them
michael@0 534 // into a string matcher, and replace them with a
michael@0 535 // standin for that matcher.
michael@0 536 StringMatcher* m =
michael@0 537 new StringMatcher(buf, bufSegStart, buf.length(),
michael@0 538 segmentNumber, *parser.curData);
michael@0 539 if (m == NULL) {
michael@0 540 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
michael@0 541 }
michael@0 542
michael@0 543 // Record and associate object and segment number
michael@0 544 parser.setSegmentObject(segmentNumber, m, status);
michael@0 545 buf.truncate(bufSegStart);
michael@0 546 buf.append(parser.getSegmentStandin(segmentNumber, status));
michael@0 547 }
michael@0 548 break;
michael@0 549 case FUNCTION:
michael@0 550 case ALT_FUNCTION:
michael@0 551 {
michael@0 552 int32_t iref = pos;
michael@0 553 TransliteratorIDParser::SingleID* single =
michael@0 554 TransliteratorIDParser::parseFilterID(rule, iref);
michael@0 555 // The next character MUST be a segment open
michael@0 556 if (single == NULL ||
michael@0 557 !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) {
michael@0 558 return syntaxError(U_INVALID_FUNCTION, rule, start, status);
michael@0 559 }
michael@0 560
michael@0 561 Transliterator *t = single->createInstance();
michael@0 562 delete single;
michael@0 563 if (t == NULL) {
michael@0 564 return syntaxError(U_INVALID_FUNCTION, rule, start, status);
michael@0 565 }
michael@0 566
michael@0 567 // bufSegStart is the offset in buf to the first
michael@0 568 // character of the segment we are parsing.
michael@0 569 int32_t bufSegStart = buf.length();
michael@0 570
michael@0 571 // Parse the segment
michael@0 572 pos = parseSection(rule, iref, limit, buf, UnicodeString(TRUE, ILLEGAL_FUNC, -1), TRUE, status);
michael@0 573
michael@0 574 // After parsing a segment, the relevant characters are
michael@0 575 // in buf, starting at offset bufSegStart.
michael@0 576 UnicodeString output;
michael@0 577 buf.extractBetween(bufSegStart, buf.length(), output);
michael@0 578 FunctionReplacer *r =
michael@0 579 new FunctionReplacer(t, new StringReplacer(output, parser.curData));
michael@0 580 if (r == NULL) {
michael@0 581 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
michael@0 582 }
michael@0 583
michael@0 584 // Replace the buffer contents with a stand-in
michael@0 585 buf.truncate(bufSegStart);
michael@0 586 buf.append(parser.generateStandInFor(r, status));
michael@0 587 }
michael@0 588 break;
michael@0 589 case SymbolTable::SYMBOL_REF:
michael@0 590 // Handle variable references and segment references "$1" .. "$9"
michael@0 591 {
michael@0 592 // A variable reference must be followed immediately
michael@0 593 // by a Unicode identifier start and zero or more
michael@0 594 // Unicode identifier part characters, or by a digit
michael@0 595 // 1..9 if it is a segment reference.
michael@0 596 if (pos == limit) {
michael@0 597 // A variable ref character at the end acts as
michael@0 598 // an anchor to the context limit, as in perl.
michael@0 599 anchorEnd = TRUE;
michael@0 600 break;
michael@0 601 }
michael@0 602 // Parse "$1" "$2" .. "$9" .. (no upper limit)
michael@0 603 c = rule.charAt(pos);
michael@0 604 int32_t r = u_digit(c, 10);
michael@0 605 if (r >= 1 && r <= 9) {
michael@0 606 r = ICU_Utility::parseNumber(rule, pos, 10);
michael@0 607 if (r < 0) {
michael@0 608 return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE,
michael@0 609 rule, start, status);
michael@0 610 }
michael@0 611 buf.append(parser.getSegmentStandin(r, status));
michael@0 612 } else {
michael@0 613 pp.setIndex(pos);
michael@0 614 UnicodeString name = parser.parseData->
michael@0 615 parseReference(rule, pp, limit);
michael@0 616 if (name.length() == 0) {
michael@0 617 // This means the '$' was not followed by a
michael@0 618 // valid name. Try to interpret it as an
michael@0 619 // end anchor then. If this also doesn't work
michael@0 620 // (if we see a following character) then signal
michael@0 621 // an error.
michael@0 622 anchorEnd = TRUE;
michael@0 623 break;
michael@0 624 }
michael@0 625 pos = pp.getIndex();
michael@0 626 // If this is a variable definition statement,
michael@0 627 // then the LHS variable will be undefined. In
michael@0 628 // that case appendVariableDef() will append the
michael@0 629 // special placeholder char variableLimit-1.
michael@0 630 varStart = buf.length();
michael@0 631 parser.appendVariableDef(name, buf, status);
michael@0 632 varLimit = buf.length();
michael@0 633 }
michael@0 634 }
michael@0 635 break;
michael@0 636 case DOT:
michael@0 637 buf.append(parser.getDotStandIn(status));
michael@0 638 break;
michael@0 639 case KLEENE_STAR:
michael@0 640 case ONE_OR_MORE:
michael@0 641 case ZERO_OR_ONE:
michael@0 642 // Quantifiers. We handle single characters, quoted strings,
michael@0 643 // variable references, and segments.
michael@0 644 // a+ matches aaa
michael@0 645 // 'foo'+ matches foofoofoo
michael@0 646 // $v+ matches xyxyxy if $v == xy
michael@0 647 // (seg)+ matches segsegseg
michael@0 648 {
michael@0 649 if (isSegment && buf.length() == bufStart) {
michael@0 650 // The */+ immediately follows '('
michael@0 651 return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status);
michael@0 652 }
michael@0 653
michael@0 654 int32_t qstart, qlimit;
michael@0 655 // The */+ follows an isolated character or quote
michael@0 656 // or variable reference
michael@0 657 if (buf.length() == quoteLimit) {
michael@0 658 // The */+ follows a 'quoted string'
michael@0 659 qstart = quoteStart;
michael@0 660 qlimit = quoteLimit;
michael@0 661 } else if (buf.length() == varLimit) {
michael@0 662 // The */+ follows a $variableReference
michael@0 663 qstart = varStart;
michael@0 664 qlimit = varLimit;
michael@0 665 } else {
michael@0 666 // The */+ follows a single character, possibly
michael@0 667 // a segment standin
michael@0 668 qstart = buf.length() - 1;
michael@0 669 qlimit = qstart + 1;
michael@0 670 }
michael@0 671
michael@0 672 UnicodeFunctor *m =
michael@0 673 new StringMatcher(buf, qstart, qlimit, 0, *parser.curData);
michael@0 674 if (m == NULL) {
michael@0 675 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
michael@0 676 }
michael@0 677 int32_t min = 0;
michael@0 678 int32_t max = Quantifier::MAX;
michael@0 679 switch (c) {
michael@0 680 case ONE_OR_MORE:
michael@0 681 min = 1;
michael@0 682 break;
michael@0 683 case ZERO_OR_ONE:
michael@0 684 min = 0;
michael@0 685 max = 1;
michael@0 686 break;
michael@0 687 // case KLEENE_STAR:
michael@0 688 // do nothing -- min, max already set
michael@0 689 }
michael@0 690 m = new Quantifier(m, min, max);
michael@0 691 if (m == NULL) {
michael@0 692 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
michael@0 693 }
michael@0 694 buf.truncate(qstart);
michael@0 695 buf.append(parser.generateStandInFor(m, status));
michael@0 696 }
michael@0 697 break;
michael@0 698
michael@0 699 //------------------------------------------------------
michael@0 700 // Elements allowed ONLY WITHIN segments
michael@0 701 //------------------------------------------------------
michael@0 702 case SEGMENT_CLOSE:
michael@0 703 // assert(isSegment);
michael@0 704 // We're done parsing a segment.
michael@0 705 done = TRUE;
michael@0 706 break;
michael@0 707
michael@0 708 //------------------------------------------------------
michael@0 709 // Elements allowed ONLY OUTSIDE segments
michael@0 710 //------------------------------------------------------
michael@0 711 case CONTEXT_ANTE:
michael@0 712 if (ante >= 0) {
michael@0 713 return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status);
michael@0 714 }
michael@0 715 ante = buf.length();
michael@0 716 break;
michael@0 717 case CONTEXT_POST:
michael@0 718 if (post >= 0) {
michael@0 719 return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status);
michael@0 720 }
michael@0 721 post = buf.length();
michael@0 722 break;
michael@0 723 case CURSOR_POS:
michael@0 724 if (cursor >= 0) {
michael@0 725 return syntaxError(U_MULTIPLE_CURSORS, rule, start, status);
michael@0 726 }
michael@0 727 cursor = buf.length();
michael@0 728 break;
michael@0 729 case CURSOR_OFFSET:
michael@0 730 if (cursorOffset < 0) {
michael@0 731 if (buf.length() > 0) {
michael@0 732 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
michael@0 733 }
michael@0 734 --cursorOffset;
michael@0 735 } else if (cursorOffset > 0) {
michael@0 736 if (buf.length() != cursorOffsetPos || cursor >= 0) {
michael@0 737 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
michael@0 738 }
michael@0 739 ++cursorOffset;
michael@0 740 } else {
michael@0 741 if (cursor == 0 && buf.length() == 0) {
michael@0 742 cursorOffset = -1;
michael@0 743 } else if (cursor < 0) {
michael@0 744 cursorOffsetPos = buf.length();
michael@0 745 cursorOffset = 1;
michael@0 746 } else {
michael@0 747 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
michael@0 748 }
michael@0 749 }
michael@0 750 break;
michael@0 751
michael@0 752
michael@0 753 //------------------------------------------------------
michael@0 754 // Non-special characters
michael@0 755 //------------------------------------------------------
michael@0 756 default:
michael@0 757 // Disallow unquoted characters other than [0-9A-Za-z]
michael@0 758 // in the printable ASCII range. These characters are
michael@0 759 // reserved for possible future use.
michael@0 760 if (c >= 0x0021 && c <= 0x007E &&
michael@0 761 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
michael@0 762 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
michael@0 763 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) {
michael@0 764 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);
michael@0 765 }
michael@0 766 buf.append(c);
michael@0 767 break;
michael@0 768 }
michael@0 769 }
michael@0 770
michael@0 771 return pos;
michael@0 772 }
michael@0 773
michael@0 774 /**
michael@0 775 * Remove context.
michael@0 776 */
michael@0 777 void RuleHalf::removeContext() {
michael@0 778 //text = text.substring(ante < 0 ? 0 : ante,
michael@0 779 // post < 0 ? text.length() : post);
michael@0 780 if (post >= 0) {
michael@0 781 text.remove(post);
michael@0 782 }
michael@0 783 if (ante >= 0) {
michael@0 784 text.removeBetween(0, ante);
michael@0 785 }
michael@0 786 ante = post = -1;
michael@0 787 anchorStart = anchorEnd = FALSE;
michael@0 788 }
michael@0 789
michael@0 790 /**
michael@0 791 * Return true if this half looks like valid output, that is, does not
michael@0 792 * contain quantifiers or other special input-only elements.
michael@0 793 */
michael@0 794 UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) {
michael@0 795 for (int32_t i=0; i<text.length(); ) {
michael@0 796 UChar32 c = text.char32At(i);
michael@0 797 i += U16_LENGTH(c);
michael@0 798 if (!transParser.parseData->isReplacer(c)) {
michael@0 799 return FALSE;
michael@0 800 }
michael@0 801 }
michael@0 802 return TRUE;
michael@0 803 }
michael@0 804
michael@0 805 /**
michael@0 806 * Return true if this half looks like valid input, that is, does not
michael@0 807 * contain functions or other special output-only elements.
michael@0 808 */
michael@0 809 UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {
michael@0 810 for (int32_t i=0; i<text.length(); ) {
michael@0 811 UChar32 c = text.char32At(i);
michael@0 812 i += U16_LENGTH(c);
michael@0 813 if (!transParser.parseData->isMatcher(c)) {
michael@0 814 return FALSE;
michael@0 815 }
michael@0 816 }
michael@0 817 return TRUE;
michael@0 818 }
michael@0 819
michael@0 820 //----------------------------------------------------------------------
michael@0 821 // PUBLIC API
michael@0 822 //----------------------------------------------------------------------
michael@0 823
michael@0 824 /**
michael@0 825 * Constructor.
michael@0 826 */
michael@0 827 TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) :
michael@0 828 dataVector(statusReturn),
michael@0 829 idBlockVector(statusReturn),
michael@0 830 variablesVector(statusReturn),
michael@0 831 segmentObjects(statusReturn)
michael@0 832 {
michael@0 833 idBlockVector.setDeleter(uprv_deleteUObject);
michael@0 834 curData = NULL;
michael@0 835 compoundFilter = NULL;
michael@0 836 parseData = NULL;
michael@0 837 variableNames.setValueDeleter(uprv_deleteUObject);
michael@0 838 }
michael@0 839
michael@0 840 /**
michael@0 841 * Destructor.
michael@0 842 */
michael@0 843 TransliteratorParser::~TransliteratorParser() {
michael@0 844 while (!dataVector.isEmpty())
michael@0 845 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0));
michael@0 846 delete compoundFilter;
michael@0 847 delete parseData;
michael@0 848 while (!variablesVector.isEmpty())
michael@0 849 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0);
michael@0 850 }
michael@0 851
michael@0 852 void
michael@0 853 TransliteratorParser::parse(const UnicodeString& rules,
michael@0 854 UTransDirection transDirection,
michael@0 855 UParseError& pe,
michael@0 856 UErrorCode& ec) {
michael@0 857 if (U_SUCCESS(ec)) {
michael@0 858 parseRules(rules, transDirection, ec);
michael@0 859 pe = parseError;
michael@0 860 }
michael@0 861 }
michael@0 862
michael@0 863 /**
michael@0 864 * Return the compound filter parsed by parse(). Caller owns result.
michael@0 865 */
michael@0 866 UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
michael@0 867 UnicodeSet* f = compoundFilter;
michael@0 868 compoundFilter = NULL;
michael@0 869 return f;
michael@0 870 }
michael@0 871
michael@0 872 //----------------------------------------------------------------------
michael@0 873 // Private implementation
michael@0 874 //----------------------------------------------------------------------
michael@0 875
michael@0 876 /**
michael@0 877 * Parse the given string as a sequence of rules, separated by newline
michael@0 878 * characters ('\n'), and cause this object to implement those rules. Any
michael@0 879 * previous rules are discarded. Typically this method is called exactly
michael@0 880 * once, during construction.
michael@0 881 * @exception IllegalArgumentException if there is a syntax error in the
michael@0 882 * rules
michael@0 883 */
michael@0 884 void TransliteratorParser::parseRules(const UnicodeString& rule,
michael@0 885 UTransDirection theDirection,
michael@0 886 UErrorCode& status)
michael@0 887 {
michael@0 888 // Clear error struct
michael@0 889 uprv_memset(&parseError, 0, sizeof(parseError));
michael@0 890 parseError.line = parseError.offset = -1;
michael@0 891
michael@0 892 UBool parsingIDs = TRUE;
michael@0 893 int32_t ruleCount = 0;
michael@0 894
michael@0 895 while (!dataVector.isEmpty()) {
michael@0 896 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0));
michael@0 897 }
michael@0 898 if (U_FAILURE(status)) {
michael@0 899 return;
michael@0 900 }
michael@0 901
michael@0 902 idBlockVector.removeAllElements();
michael@0 903 curData = NULL;
michael@0 904 direction = theDirection;
michael@0 905 ruleCount = 0;
michael@0 906
michael@0 907 delete compoundFilter;
michael@0 908 compoundFilter = NULL;
michael@0 909
michael@0 910 while (!variablesVector.isEmpty()) {
michael@0 911 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0);
michael@0 912 }
michael@0 913 variableNames.removeAll();
michael@0 914 parseData = new ParseData(0, &variablesVector, &variableNames);
michael@0 915 if (parseData == NULL) {
michael@0 916 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 917 return;
michael@0 918 }
michael@0 919
michael@0 920 dotStandIn = (UChar) -1;
michael@0 921
michael@0 922 UnicodeString *tempstr = NULL; // used for memory allocation error checking
michael@0 923 UnicodeString str; // scratch
michael@0 924 UnicodeString idBlockResult;
michael@0 925 int32_t pos = 0;
michael@0 926 int32_t limit = rule.length();
michael@0 927
michael@0 928 // The compound filter offset is an index into idBlockResult.
michael@0 929 // If it is 0, then the compound filter occurred at the start,
michael@0 930 // and it is the offset to the _start_ of the compound filter
michael@0 931 // pattern. Otherwise it is the offset to the _limit_ of the
michael@0 932 // compound filter pattern within idBlockResult.
michael@0 933 compoundFilter = NULL;
michael@0 934 int32_t compoundFilterOffset = -1;
michael@0 935
michael@0 936 while (pos < limit && U_SUCCESS(status)) {
michael@0 937 UChar c = rule.charAt(pos++);
michael@0 938 if (PatternProps::isWhiteSpace(c)) {
michael@0 939 // Ignore leading whitespace.
michael@0 940 continue;
michael@0 941 }
michael@0 942 // Skip lines starting with the comment character
michael@0 943 if (c == RULE_COMMENT_CHAR) {
michael@0 944 pos = rule.indexOf((UChar)0x000A /*\n*/, pos) + 1;
michael@0 945 if (pos == 0) {
michael@0 946 break; // No "\n" found; rest of rule is a commnet
michael@0 947 }
michael@0 948 continue; // Either fall out or restart with next line
michael@0 949 }
michael@0 950
michael@0 951 // skip empty rules
michael@0 952 if (c == END_OF_RULE)
michael@0 953 continue;
michael@0 954
michael@0 955 // keep track of how many rules we've seen
michael@0 956 ++ruleCount;
michael@0 957
michael@0 958 // We've found the start of a rule or ID. c is its first
michael@0 959 // character, and pos points past c.
michael@0 960 --pos;
michael@0 961 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1
michael@0 962 // chars left.
michael@0 963 if ((pos + ID_TOKEN_LEN + 1) <= limit &&
michael@0 964 rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
michael@0 965 pos += ID_TOKEN_LEN;
michael@0 966 c = rule.charAt(pos);
michael@0 967 while (PatternProps::isWhiteSpace(c) && pos < limit) {
michael@0 968 ++pos;
michael@0 969 c = rule.charAt(pos);
michael@0 970 }
michael@0 971
michael@0 972 int32_t p = pos;
michael@0 973
michael@0 974 if (!parsingIDs) {
michael@0 975 if (curData != NULL) {
michael@0 976 if (direction == UTRANS_FORWARD)
michael@0 977 dataVector.addElement(curData, status);
michael@0 978 else
michael@0 979 dataVector.insertElementAt(curData, 0, status);
michael@0 980 curData = NULL;
michael@0 981 }
michael@0 982 parsingIDs = TRUE;
michael@0 983 }
michael@0 984
michael@0 985 TransliteratorIDParser::SingleID* id =
michael@0 986 TransliteratorIDParser::parseSingleID(rule, p, direction, status);
michael@0 987 if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) {
michael@0 988 // Successful ::ID parse.
michael@0 989
michael@0 990 if (direction == UTRANS_FORWARD) {
michael@0 991 idBlockResult.append(id->canonID).append(END_OF_RULE);
michael@0 992 } else {
michael@0 993 idBlockResult.insert(0, END_OF_RULE);
michael@0 994 idBlockResult.insert(0, id->canonID);
michael@0 995 }
michael@0 996
michael@0 997 } else {
michael@0 998 // Couldn't parse an ID. Try to parse a global filter
michael@0 999 int32_t withParens = -1;
michael@0 1000 UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL);
michael@0 1001 if (f != NULL) {
michael@0 1002 if (ICU_Utility::parseChar(rule, p, END_OF_RULE)
michael@0 1003 && (direction == UTRANS_FORWARD) == (withParens == 0))
michael@0 1004 {
michael@0 1005 if (compoundFilter != NULL) {
michael@0 1006 // Multiple compound filters
michael@0 1007 syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status);
michael@0 1008 delete f;
michael@0 1009 } else {
michael@0 1010 compoundFilter = f;
michael@0 1011 compoundFilterOffset = ruleCount;
michael@0 1012 }
michael@0 1013 } else {
michael@0 1014 delete f;
michael@0 1015 }
michael@0 1016 } else {
michael@0 1017 // Invalid ::id
michael@0 1018 // Can be parsed as neither an ID nor a global filter
michael@0 1019 syntaxError(U_INVALID_ID, rule, pos, status);
michael@0 1020 }
michael@0 1021 }
michael@0 1022 delete id;
michael@0 1023 pos = p;
michael@0 1024 } else {
michael@0 1025 if (parsingIDs) {
michael@0 1026 tempstr = new UnicodeString(idBlockResult);
michael@0 1027 // NULL pointer check
michael@0 1028 if (tempstr == NULL) {
michael@0 1029 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1030 return;
michael@0 1031 }
michael@0 1032 if (direction == UTRANS_FORWARD)
michael@0 1033 idBlockVector.addElement(tempstr, status);
michael@0 1034 else
michael@0 1035 idBlockVector.insertElementAt(tempstr, 0, status);
michael@0 1036 idBlockResult.remove();
michael@0 1037 parsingIDs = FALSE;
michael@0 1038 curData = new TransliterationRuleData(status);
michael@0 1039 // NULL pointer check
michael@0 1040 if (curData == NULL) {
michael@0 1041 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1042 return;
michael@0 1043 }
michael@0 1044 parseData->data = curData;
michael@0 1045
michael@0 1046 // By default, rules use part of the private use area
michael@0 1047 // E000..F8FF for variables and other stand-ins. Currently
michael@0 1048 // the range F000..F8FF is typically sufficient. The 'use
michael@0 1049 // variable range' pragma allows rule sets to modify this.
michael@0 1050 setVariableRange(0xF000, 0xF8FF, status);
michael@0 1051 }
michael@0 1052
michael@0 1053 if (resemblesPragma(rule, pos, limit)) {
michael@0 1054 int32_t ppp = parsePragma(rule, pos, limit, status);
michael@0 1055 if (ppp < 0) {
michael@0 1056 syntaxError(U_MALFORMED_PRAGMA, rule, pos, status);
michael@0 1057 }
michael@0 1058 pos = ppp;
michael@0 1059 // Parse a rule
michael@0 1060 } else {
michael@0 1061 pos = parseRule(rule, pos, limit, status);
michael@0 1062 }
michael@0 1063 }
michael@0 1064 }
michael@0 1065
michael@0 1066 if (parsingIDs && idBlockResult.length() > 0) {
michael@0 1067 tempstr = new UnicodeString(idBlockResult);
michael@0 1068 // NULL pointer check
michael@0 1069 if (tempstr == NULL) {
michael@0 1070 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1071 return;
michael@0 1072 }
michael@0 1073 if (direction == UTRANS_FORWARD)
michael@0 1074 idBlockVector.addElement(tempstr, status);
michael@0 1075 else
michael@0 1076 idBlockVector.insertElementAt(tempstr, 0, status);
michael@0 1077 }
michael@0 1078 else if (!parsingIDs && curData != NULL) {
michael@0 1079 if (direction == UTRANS_FORWARD)
michael@0 1080 dataVector.addElement(curData, status);
michael@0 1081 else
michael@0 1082 dataVector.insertElementAt(curData, 0, status);
michael@0 1083 }
michael@0 1084
michael@0 1085 if (U_SUCCESS(status)) {
michael@0 1086 // Convert the set vector to an array
michael@0 1087 int32_t i, dataVectorSize = dataVector.size();
michael@0 1088 for (i = 0; i < dataVectorSize; i++) {
michael@0 1089 TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i);
michael@0 1090 data->variablesLength = variablesVector.size();
michael@0 1091 if (data->variablesLength == 0) {
michael@0 1092 data->variables = 0;
michael@0 1093 } else {
michael@0 1094 data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*));
michael@0 1095 // NULL pointer check
michael@0 1096 if (data->variables == NULL) {
michael@0 1097 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1098 return;
michael@0 1099 }
michael@0 1100 data->variablesAreOwned = (i == 0);
michael@0 1101 }
michael@0 1102
michael@0 1103 for (int32_t j = 0; j < data->variablesLength; j++) {
michael@0 1104 data->variables[j] =
michael@0 1105 ((UnicodeSet*)variablesVector.elementAt(j));
michael@0 1106 }
michael@0 1107
michael@0 1108 data->variableNames.removeAll();
michael@0 1109 int32_t pos = -1;
michael@0 1110 const UHashElement* he = variableNames.nextElement(pos);
michael@0 1111 while (he != NULL) {
michael@0 1112 UnicodeString* tempus = (UnicodeString*)(((UnicodeString*)(he->value.pointer))->clone());
michael@0 1113 if (tempus == NULL) {
michael@0 1114 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1115 return;
michael@0 1116 }
michael@0 1117 data->variableNames.put(*((UnicodeString*)(he->key.pointer)),
michael@0 1118 tempus, status);
michael@0 1119 he = variableNames.nextElement(pos);
michael@0 1120 }
michael@0 1121 }
michael@0 1122 variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed
michael@0 1123
michael@0 1124 // Index the rules
michael@0 1125 if (compoundFilter != NULL) {
michael@0 1126 if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) ||
michael@0 1127 (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) {
michael@0 1128 status = U_MISPLACED_COMPOUND_FILTER;
michael@0 1129 }
michael@0 1130 }
michael@0 1131
michael@0 1132 for (i = 0; i < dataVectorSize; i++) {
michael@0 1133 TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i);
michael@0 1134 data->ruleSet.freeze(parseError, status);
michael@0 1135 }
michael@0 1136 if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementAt(0))->isEmpty()) {
michael@0 1137 idBlockVector.removeElementAt(0);
michael@0 1138 }
michael@0 1139 }
michael@0 1140 }
michael@0 1141
michael@0 1142 /**
michael@0 1143 * Set the variable range to [start, end] (inclusive).
michael@0 1144 */
michael@0 1145 void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) {
michael@0 1146 if (start > end || start < 0 || end > 0xFFFF) {
michael@0 1147 status = U_MALFORMED_PRAGMA;
michael@0 1148 return;
michael@0 1149 }
michael@0 1150
michael@0 1151 curData->variablesBase = (UChar) start;
michael@0 1152 if (dataVector.size() == 0) {
michael@0 1153 variableNext = (UChar) start;
michael@0 1154 variableLimit = (UChar) (end + 1);
michael@0 1155 }
michael@0 1156 }
michael@0 1157
michael@0 1158 /**
michael@0 1159 * Assert that the given character is NOT within the variable range.
michael@0 1160 * If it is, return FALSE. This is neccesary to ensure that the
michael@0 1161 * variable range does not overlap characters used in a rule.
michael@0 1162 */
michael@0 1163 UBool TransliteratorParser::checkVariableRange(UChar32 ch) const {
michael@0 1164 return !(ch >= curData->variablesBase && ch < variableLimit);
michael@0 1165 }
michael@0 1166
michael@0 1167 /**
michael@0 1168 * Set the maximum backup to 'backup', in response to a pragma
michael@0 1169 * statement.
michael@0 1170 */
michael@0 1171 void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) {
michael@0 1172 //TODO Finish
michael@0 1173 }
michael@0 1174
michael@0 1175 /**
michael@0 1176 * Begin normalizing all rules using the given mode, in response
michael@0 1177 * to a pragma statement.
michael@0 1178 */
michael@0 1179 void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) {
michael@0 1180 //TODO Finish
michael@0 1181 }
michael@0 1182
michael@0 1183 static const UChar PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use "
michael@0 1184
michael@0 1185 static const UChar PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;"
michael@0 1186
michael@0 1187 static const UChar PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;"
michael@0 1188
michael@0 1189 static const UChar PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;"
michael@0 1190
michael@0 1191 static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;"
michael@0 1192
michael@0 1193 /**
michael@0 1194 * Return true if the given rule looks like a pragma.
michael@0 1195 * @param pos offset to the first non-whitespace character
michael@0 1196 * of the rule.
michael@0 1197 * @param limit pointer past the last character of the rule.
michael@0 1198 */
michael@0 1199 UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) {
michael@0 1200 // Must start with /use\s/i
michael@0 1201 return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_USE, 4), NULL) >= 0;
michael@0 1202 }
michael@0 1203
michael@0 1204 /**
michael@0 1205 * Parse a pragma. This method assumes resemblesPragma() has
michael@0 1206 * already returned true.
michael@0 1207 * @param pos offset to the first non-whitespace character
michael@0 1208 * of the rule.
michael@0 1209 * @param limit pointer past the last character of the rule.
michael@0 1210 * @return the position index after the final ';' of the pragma,
michael@0 1211 * or -1 on failure.
michael@0 1212 */
michael@0 1213 int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
michael@0 1214 int32_t array[2];
michael@0 1215
michael@0 1216 // resemblesPragma() has already returned true, so we
michael@0 1217 // know that pos points to /use\s/i; we can skip 4 characters
michael@0 1218 // immediately
michael@0 1219 pos += 4;
michael@0 1220
michael@0 1221 // Here are the pragmas we recognize:
michael@0 1222 // use variable range 0xE000 0xEFFF;
michael@0 1223 // use maximum backup 16;
michael@0 1224 // use nfd rules;
michael@0 1225 // use nfc rules;
michael@0 1226 int p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_VARIABLE_RANGE, -1), array);
michael@0 1227 if (p >= 0) {
michael@0 1228 setVariableRange(array[0], array[1], status);
michael@0 1229 return p;
michael@0 1230 }
michael@0 1231
michael@0 1232 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_MAXIMUM_BACKUP, -1), array);
michael@0 1233 if (p >= 0) {
michael@0 1234 pragmaMaximumBackup(array[0]);
michael@0 1235 return p;
michael@0 1236 }
michael@0 1237
michael@0 1238 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFD_RULES, -1), NULL);
michael@0 1239 if (p >= 0) {
michael@0 1240 pragmaNormalizeRules(UNORM_NFD);
michael@0 1241 return p;
michael@0 1242 }
michael@0 1243
michael@0 1244 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFC_RULES, -1), NULL);
michael@0 1245 if (p >= 0) {
michael@0 1246 pragmaNormalizeRules(UNORM_NFC);
michael@0 1247 return p;
michael@0 1248 }
michael@0 1249
michael@0 1250 // Syntax error: unable to parse pragma
michael@0 1251 return -1;
michael@0 1252 }
michael@0 1253
michael@0 1254 /**
michael@0 1255 * MAIN PARSER. Parse the next rule in the given rule string, starting
michael@0 1256 * at pos. Return the index after the last character parsed. Do not
michael@0 1257 * parse characters at or after limit.
michael@0 1258 *
michael@0 1259 * Important: The character at pos must be a non-whitespace character
michael@0 1260 * that is not the comment character.
michael@0 1261 *
michael@0 1262 * This method handles quoting, escaping, and whitespace removal. It
michael@0 1263 * parses the end-of-rule character. It recognizes context and cursor
michael@0 1264 * indicators. Once it does a lexical breakdown of the rule at pos, it
michael@0 1265 * creates a rule object and adds it to our rule list.
 *
 * A rule whose operator is VARIABLE_DEF_OP defines a variable instead: the
 * right-hand text is stored in variableNames under the name recorded in
 * undefinedVariableName, and no rule object is created.
 *
 * @param rule   the rule text
 * @param pos    offset of the first character of the rule
 * @param limit  offset past the last character that may be parsed
 * @param status error code; set through syntaxError() on a syntax error
 * @return the index after the last character parsed on success. If parsing
 *         either half fails, the starting position is returned; other
 *         errors return whatever syntaxError() returns for that position.
michael@0 1266 */
michael@0 1267 int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
michael@0 1268 // Locate the left side, operator, and right side
michael@0 1269 int32_t start = pos;
michael@0 1270 UChar op = 0;
michael@0 1271 int32_t i;
michael@0 1272
michael@0 1273 // Set up segments data
michael@0 1274 segmentStandins.truncate(0);
michael@0 1275 segmentObjects.removeAllElements();
michael@0 1276
michael@0 1277 // Use pointers to automatics to make swapping possible.
michael@0 1278 RuleHalf _left(*this), _right(*this);
michael@0 1279 RuleHalf* left = &_left;
michael@0 1280 RuleHalf* right = &_right;
michael@0 1281
michael@0 1282 undefinedVariableName.remove();
michael@0 1283 pos = left->parse(rule, pos, limit, status);
michael@0 1284 if (U_FAILURE(status)) {
michael@0 1285 return start;
michael@0 1286 }
michael@0 1287
// Step back one character to look at what ended the left half; it must be
// one of gOPERATORS. Reaching limit here means no operator was found.
michael@0 1288 if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
michael@0 1289 return syntaxError(U_MISSING_OPERATOR, rule, start, status);
michael@0 1290 }
michael@0 1291 ++pos;
michael@0 1292
michael@0 1293 // Found an operator char. Check for forward-reverse operator.
michael@0 1294 if (op == REVERSE_RULE_OP &&
michael@0 1295 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
michael@0 1296 ++pos;
michael@0 1297 op = FWDREV_RULE_OP;
michael@0 1298 }
michael@0 1299
michael@0 1300 // Translate alternate op characters.
michael@0 1301 switch (op) {
michael@0 1302 case ALT_FORWARD_RULE_OP:
michael@0 1303 op = FORWARD_RULE_OP;
michael@0 1304 break;
michael@0 1305 case ALT_REVERSE_RULE_OP:
michael@0 1306 op = REVERSE_RULE_OP;
michael@0 1307 break;
michael@0 1308 case ALT_FWDREV_RULE_OP:
michael@0 1309 op = FWDREV_RULE_OP;
michael@0 1310 break;
michael@0 1311 }
michael@0 1312
michael@0 1313 pos = right->parse(rule, pos, limit, status);
michael@0 1314 if (U_FAILURE(status)) {
michael@0 1315 return start;
michael@0 1316 }
michael@0 1317
// If input remains, the character just before pos must be END_OF_RULE.
// A rule that runs up to limit is accepted without a terminator.
michael@0 1318 if (pos < limit) {
michael@0 1319 if (rule.charAt(--pos) == END_OF_RULE) {
michael@0 1320 ++pos;
michael@0 1321 } else {
michael@0 1322 // RuleHalf parser must have terminated at an operator
michael@0 1323 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);
michael@0 1324 }
michael@0 1325 }
michael@0 1326
michael@0 1327 if (op == VARIABLE_DEF_OP) {
michael@0 1328 // LHS is the name. RHS is a single character, either a literal
michael@0 1329 // or a set (already parsed). If RHS is longer than one
michael@0 1330 // character, it is either a multi-character string, or multiple
michael@0 1331 // sets, or a mixture of chars and sets -- syntax error.
michael@0 1332
michael@0 1333 // We expect to see a single undefined variable (the one being
michael@0 1334 // defined).
michael@0 1335 if (undefinedVariableName.length() == 0) {
michael@0 1336 // "Missing '$' or duplicate definition"
michael@0 1337 return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status);
michael@0 1338 }
// The left side must consist of exactly one code unit, and that unit must
// equal the current variableLimit.
michael@0 1339 if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) {
michael@0 1340 // "Malformed LHS"
michael@0 1341 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status);
michael@0 1342 }
michael@0 1343 if (left->anchorStart || left->anchorEnd ||
michael@0 1344 right->anchorStart || right->anchorEnd) {
michael@0 1345 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status);
michael@0 1346 }
michael@0 1347 // We allow anything on the right, including an empty string.
michael@0 1348 UnicodeString* value = new UnicodeString(right->text);
michael@0 1349 // NULL pointer check
michael@0 1350 if (value == NULL) {
michael@0 1351 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
michael@0 1352 }
// variableNames has a value deleter installed by the constructor, so the
// table takes over 'value'. The limit is then moved past the stand-in
// that was just consumed by this definition.
michael@0 1353 variableNames.put(undefinedVariableName, value, status);
michael@0 1354 ++variableLimit;
michael@0 1355 return pos;
michael@0 1356 }
michael@0 1357
michael@0 1358 // If this is not a variable definition rule, we shouldn't have
michael@0 1359 // any undefined variable names.
michael@0 1360 if (undefinedVariableName.length() != 0) {
michael@0 1361 return syntaxError(// "Undefined variable $" + undefinedVariableName,
michael@0 1362 U_UNDEFINED_VARIABLE,
michael@0 1363 rule, start, status);
michael@0 1364 }
michael@0 1365
// NOTE: the syntaxError() calls in the three checks below do not return;
// they only record the error in status and parseError, and processing
// continues, so a later check may overwrite the recorded error code.
michael@0 1366 // Verify segments
michael@0 1367 if (segmentStandins.length() > segmentObjects.size()) {
michael@0 1368 syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status);
michael@0 1369 }
michael@0 1370 for (i=0; i<segmentStandins.length(); ++i) {
michael@0 1371 if (segmentStandins.charAt(i) == 0) {
michael@0 1372 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen
michael@0 1373 }
michael@0 1374 }
michael@0 1375 for (i=0; i<segmentObjects.size(); ++i) {
michael@0 1376 if (segmentObjects.elementAt(i) == NULL) {
michael@0 1377 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen
michael@0 1378 }
michael@0 1379 }
michael@0 1380
michael@0 1381 // If the direction we want doesn't match the rule
michael@0 1382 // direction, do nothing.
michael@0 1383 if (op != FWDREV_RULE_OP &&
michael@0 1384 ((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) {
michael@0 1385 return pos;
michael@0 1386 }
michael@0 1387
michael@0 1388 // Transform the rule into a forward rule by swapping the
michael@0 1389 // sides if necessary.
michael@0 1390 if (direction == UTRANS_REVERSE) {
michael@0 1391 left = &_right;
michael@0 1392 right = &_left;
michael@0 1393 }
michael@0 1394
michael@0 1395 // Remove non-applicable elements in forward-reverse
michael@0 1396 // rules. Bidirectional rules ignore elements that do not
michael@0 1397 // apply.
michael@0 1398 if (op == FWDREV_RULE_OP) {
michael@0 1399 right->removeContext();
michael@0 1400 left->cursor = -1;
michael@0 1401 left->cursorOffset = 0;
michael@0 1402 }
michael@0 1403
michael@0 1404 // Normalize context
michael@0 1405 if (left->ante < 0) {
michael@0 1406 left->ante = 0;
michael@0 1407 }
michael@0 1408 if (left->post < 0) {
michael@0 1409 left->post = left->text.length();
michael@0 1410 }
michael@0 1411
michael@0 1412 // Context is only allowed on the input side. Cursors are only
michael@0 1413 // allowed on the output side. Segment delimiters can only appear
michael@0 1414 // on the left, and references on the right. Cursor offset
michael@0 1415 // cannot appear without an explicit cursor. Cursor offset
michael@0 1416 // cannot place the cursor outside the limits of the context.
michael@0 1417 // Anchors are only allowed on the input side.
michael@0 1418 if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
michael@0 1419 (right->cursorOffset != 0 && right->cursor < 0) ||
michael@0 1420 // - The following two checks were used to ensure that the
michael@0 1421 // - the cursor offset stayed within the ante- or postcontext.
michael@0 1422 // - However, with the addition of quantifiers, we have to
michael@0 1423 // - allow arbitrary cursor offsets and do runtime checking.
michael@0 1424 //(right->cursorOffset > (left->text.length() - left->post)) ||
michael@0 1425 //(-right->cursorOffset > left->ante) ||
michael@0 1426 right->anchorStart || right->anchorEnd ||
michael@0 1427 !left->isValidInput(*this) || !right->isValidOutput(*this) ||
michael@0 1428 left->ante > left->post) {
michael@0 1429
michael@0 1430 return syntaxError(U_MALFORMED_RULE, rule, start, status);
michael@0 1431 }
michael@0 1432
michael@0 1433 // Flatten segment objects vector to an array
michael@0 1434 UnicodeFunctor** segmentsArray = NULL;
michael@0 1435 if (segmentObjects.size() > 0) {
michael@0 1436 segmentsArray = (UnicodeFunctor **)uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor *));
michael@0 1437 // Null pointer check
michael@0 1438 if (segmentsArray == NULL) {
michael@0 1439 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
michael@0 1440 }
michael@0 1441 segmentObjects.toArray((void**) segmentsArray);
michael@0 1442 }
// segmentsArray is handed to the rule constructor and is freed here only
// when allocating the rule object itself fails.
// NOTE(review): presumably the rule adopts the array - confirm against
// the TransliterationRule constructor.
michael@0 1443 TransliterationRule* temptr = new TransliterationRule(
michael@0 1444 left->text, left->ante, left->post,
michael@0 1445 right->text, right->cursor, right->cursorOffset,
michael@0 1446 segmentsArray,
michael@0 1447 segmentObjects.size(),
michael@0 1448 left->anchorStart, left->anchorEnd,
michael@0 1449 curData,
michael@0 1450 status);
michael@0 1451 //Null pointer check
michael@0 1452 if (temptr == NULL) {
michael@0 1453 uprv_free(segmentsArray);
michael@0 1454 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
michael@0 1455 }
michael@0 1456
// NOTE(review): temptr is not deleted here even if status is a failure;
// presumably addRule() takes ownership in every case - confirm.
michael@0 1457 curData->ruleSet.addRule(temptr, status);
michael@0 1458
michael@0 1459 return pos;
michael@0 1460 }
michael@0 1461
michael@0 1462 /**
michael@0 1463 * Called by main parser upon syntax error. Search the rule string
michael@0 1464 * for the probable end of the rule. Of course, if the error is that
michael@0 1465 * the end of rule marker is missing, then the rule end will not be found.
michael@0 1466 * In any case the rule start will be correctly reported.
michael@0 1467 * @param msg error description
michael@0 1468 * @param rule pattern string
michael@0 1469 * @param start position of first character of current rule
michael@0 1470 */
michael@0 1471 int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,
michael@0 1472 const UnicodeString& rule,
michael@0 1473 int32_t pos,
michael@0 1474 UErrorCode& status)
michael@0 1475 {
michael@0 1476 parseError.offset = pos;
michael@0 1477 parseError.line = 0 ; /* we are not using line numbers */
michael@0 1478
michael@0 1479 // for pre-context
michael@0 1480 const int32_t LEN = U_PARSE_CONTEXT_LEN - 1;
michael@0 1481 int32_t start = uprv_max(pos - LEN, 0);
michael@0 1482 int32_t stop = pos;
michael@0 1483
michael@0 1484 rule.extract(start,stop-start,parseError.preContext);
michael@0 1485 //null terminate the buffer
michael@0 1486 parseError.preContext[stop-start] = 0;
michael@0 1487
michael@0 1488 //for post-context
michael@0 1489 start = pos;
michael@0 1490 stop = uprv_min(pos + LEN, rule.length());
michael@0 1491
michael@0 1492 rule.extract(start,stop-start,parseError.postContext);
michael@0 1493 //null terminate the buffer
michael@0 1494 parseError.postContext[stop-start]= 0;
michael@0 1495
michael@0 1496 status = (UErrorCode)parseErrorCode;
michael@0 1497 return pos;
michael@0 1498
michael@0 1499 }
michael@0 1500
michael@0 1501 /**
michael@0 1502 * Parse a UnicodeSet out, store it, and return the stand-in character
michael@0 1503 * used to represent it.
michael@0 1504 */
michael@0 1505 UChar TransliteratorParser::parseSet(const UnicodeString& rule,
michael@0 1506 ParsePosition& pos,
michael@0 1507 UErrorCode& status) {
michael@0 1508 UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status);
michael@0 1509 // Null pointer check
michael@0 1510 if (set == NULL) {
michael@0 1511 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1512 return (UChar)0x0000; // Return empty character with error.
michael@0 1513 }
michael@0 1514 set->compact();
michael@0 1515 return generateStandInFor(set, status);
michael@0 1516 }
michael@0 1517
michael@0 1518 /**
michael@0 1519 * Generate and return a stand-in for a new UnicodeFunctor. Store
michael@0 1520 * the matcher (adopt it).
michael@0 1521 */
michael@0 1522 UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) {
michael@0 1523 // assert(obj != null);
michael@0 1524
michael@0 1525 // Look up previous stand-in, if any. This is a short list
michael@0 1526 // (typical n is 0, 1, or 2); linear search is optimal.
michael@0 1527 for (int32_t i=0; i<variablesVector.size(); ++i) {
michael@0 1528 if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison
michael@0 1529 return (UChar) (curData->variablesBase + i);
michael@0 1530 }
michael@0 1531 }
michael@0 1532
michael@0 1533 if (variableNext >= variableLimit) {
michael@0 1534 delete adopted;
michael@0 1535 status = U_VARIABLE_RANGE_EXHAUSTED;
michael@0 1536 return 0;
michael@0 1537 }
michael@0 1538 variablesVector.addElement(adopted, status);
michael@0 1539 return variableNext++;
michael@0 1540 }
michael@0 1541
michael@0 1542 /**
michael@0 1543 * Return the standin for segment seg (1-based).
michael@0 1544 */
michael@0 1545 UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) {
michael@0 1546 // Special character used to indicate an empty spot
michael@0 1547 UChar empty = curData->variablesBase - 1;
michael@0 1548 while (segmentStandins.length() < seg) {
michael@0 1549 segmentStandins.append(empty);
michael@0 1550 }
michael@0 1551 UChar c = segmentStandins.charAt(seg-1);
michael@0 1552 if (c == empty) {
michael@0 1553 if (variableNext >= variableLimit) {
michael@0 1554 status = U_VARIABLE_RANGE_EXHAUSTED;
michael@0 1555 return 0;
michael@0 1556 }
michael@0 1557 c = variableNext++;
michael@0 1558 // Set a placeholder in the master variables vector that will be
michael@0 1559 // filled in later by setSegmentObject(). We know that we will get
michael@0 1560 // called first because setSegmentObject() will call us.
michael@0 1561 variablesVector.addElement((void*) NULL, status);
michael@0 1562 segmentStandins.setCharAt(seg-1, c);
michael@0 1563 }
michael@0 1564 return c;
michael@0 1565 }
michael@0 1566
michael@0 1567 /**
michael@0 1568 * Set the object for segment seg (1-based).
michael@0 1569 */
michael@0 1570 void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) {
michael@0 1571 // Since we call parseSection() recursively, nested
michael@0 1572 // segments will result in segment i+1 getting parsed
michael@0 1573 // and stored before segment i; be careful with the
michael@0 1574 // vector handling here.
michael@0 1575 if (segmentObjects.size() < seg) {
michael@0 1576 segmentObjects.setSize(seg, status);
michael@0 1577 }
michael@0 1578 int32_t index = getSegmentStandin(seg, status) - curData->variablesBase;
michael@0 1579 if (segmentObjects.elementAt(seg-1) != NULL ||
michael@0 1580 variablesVector.elementAt(index) != NULL) {
michael@0 1581 // should never happen
michael@0 1582 status = U_INTERNAL_TRANSLITERATOR_ERROR;
michael@0 1583 return;
michael@0 1584 }
michael@0 1585 segmentObjects.setElementAt(adopted, seg-1);
michael@0 1586 variablesVector.setElementAt(adopted, index);
michael@0 1587 }
michael@0 1588
michael@0 1589 /**
michael@0 1590 * Return the stand-in for the dot set. It is allocated the first
michael@0 1591 * time and reused thereafter.
michael@0 1592 */
michael@0 1593 UChar TransliteratorParser::getDotStandIn(UErrorCode& status) {
michael@0 1594 if (dotStandIn == (UChar) -1) {
michael@0 1595 UnicodeSet* tempus = new UnicodeSet(UnicodeString(TRUE, DOT_SET, -1), status);
michael@0 1596 // Null pointer check.
michael@0 1597 if (tempus == NULL) {
michael@0 1598 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1599 return (UChar)0x0000;
michael@0 1600 }
michael@0 1601 dotStandIn = generateStandInFor(tempus, status);
michael@0 1602 }
michael@0 1603 return dotStandIn;
michael@0 1604 }
michael@0 1605
michael@0 1606 /**
michael@0 1607 * Append the value of the given variable name to the given
michael@0 1608 * UnicodeString.
michael@0 1609 */
michael@0 1610 void TransliteratorParser::appendVariableDef(const UnicodeString& name,
michael@0 1611 UnicodeString& buf,
michael@0 1612 UErrorCode& status) {
michael@0 1613 const UnicodeString* s = (const UnicodeString*) variableNames.get(name);
michael@0 1614 if (s == NULL) {
michael@0 1615 // We allow one undefined variable so that variable definition
michael@0 1616 // statements work. For the first undefined variable we return
michael@0 1617 // the special placeholder variableLimit-1, and save the variable
michael@0 1618 // name.
michael@0 1619 if (undefinedVariableName.length() == 0) {
michael@0 1620 undefinedVariableName = name;
michael@0 1621 if (variableNext >= variableLimit) {
michael@0 1622 // throw new RuntimeException("Private use variables exhausted");
michael@0 1623 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1624 return;
michael@0 1625 }
michael@0 1626 buf.append((UChar) --variableLimit);
michael@0 1627 } else {
michael@0 1628 //throw new IllegalArgumentException("Undefined variable $"
michael@0 1629 // + name);
michael@0 1630 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1631 return;
michael@0 1632 }
michael@0 1633 } else {
michael@0 1634 buf.append(*s);
michael@0 1635 }
michael@0 1636 }
michael@0 1637
michael@0 1638 /**
michael@0 1639 * Glue method to get around access restrictions in C++.
michael@0 1640 */
michael@0 1641 /*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
michael@0 1642 return Transliterator::createBasicInstance(id, canonID);
michael@0 1643 }*/
michael@0 1644
michael@0 1645 U_NAMESPACE_END
michael@0 1646
michael@0 1647 U_CAPI int32_t
michael@0 1648 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) {
michael@0 1649 U_NAMESPACE_USE
michael@0 1650
michael@0 1651 //const UChar *sourceStart = source;
michael@0 1652 const UChar *targetStart = target;
michael@0 1653 const UChar *sourceLimit = source+sourceLen;
michael@0 1654 UChar *targetLimit = target+sourceLen;
michael@0 1655 UChar32 c = 0;
michael@0 1656 UBool quoted = FALSE;
michael@0 1657 int32_t index;
michael@0 1658
michael@0 1659 uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR);
michael@0 1660
michael@0 1661 /* read the rules into the buffer */
michael@0 1662 while (source < sourceLimit)
michael@0 1663 {
michael@0 1664 index=0;
michael@0 1665 U16_NEXT_UNSAFE(source, index, c);
michael@0 1666 source+=index;
michael@0 1667 if(c == QUOTE) {
michael@0 1668 quoted = (UBool)!quoted;
michael@0 1669 }
michael@0 1670 else if (!quoted) {
michael@0 1671 if (c == RULE_COMMENT_CHAR) {
michael@0 1672 /* skip comments and all preceding spaces */
michael@0 1673 while (targetStart < target && *(target - 1) == 0x0020) {
michael@0 1674 target--;
michael@0 1675 }
michael@0 1676 do {
michael@0 1677 c = *(source++);
michael@0 1678 }
michael@0 1679 while (c != CR && c != LF);
michael@0 1680 }
michael@0 1681 else if (c == ESCAPE) {
michael@0 1682 UChar32 c2 = *source;
michael@0 1683 if (c2 == CR || c2 == LF) {
michael@0 1684 /* A backslash at the end of a line. */
michael@0 1685 /* Since we're stripping lines, ignore the backslash. */
michael@0 1686 source++;
michael@0 1687 continue;
michael@0 1688 }
michael@0 1689 if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn't unescaped. */
michael@0 1690 int32_t escapeOffset = 0;
michael@0 1691 UnicodeString escapedStr(source, 5);
michael@0 1692 c2 = escapedStr.unescapeAt(escapeOffset);
michael@0 1693
michael@0 1694 if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0)
michael@0 1695 {
michael@0 1696 *status = U_PARSE_ERROR;
michael@0 1697 return 0;
michael@0 1698 }
michael@0 1699 if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
michael@0 1700 /* It was escaped for a reason. Write what it was suppose to be. */
michael@0 1701 source+=5;
michael@0 1702 c = c2;
michael@0 1703 }
michael@0 1704 }
michael@0 1705 else if (c2 == QUOTE) {
michael@0 1706 /* \' seen. Make sure we don't do anything when we see it again. */
michael@0 1707 quoted = (UBool)!quoted;
michael@0 1708 }
michael@0 1709 }
michael@0 1710 }
michael@0 1711 if (c == CR || c == LF)
michael@0 1712 {
michael@0 1713 /* ignore spaces carriage returns, and all leading spaces on the next line.
michael@0 1714 * and line feed unless in the form \uXXXX
michael@0 1715 */
michael@0 1716 quoted = FALSE;
michael@0 1717 while (source < sourceLimit) {
michael@0 1718 c = *(source);
michael@0 1719 if (c != CR && c != LF && c != 0x0020) {
michael@0 1720 break;
michael@0 1721 }
michael@0 1722 source++;
michael@0 1723 }
michael@0 1724 continue;
michael@0 1725 }
michael@0 1726
michael@0 1727 /* Append UChar * after dissembling if c > 0xffff*/
michael@0 1728 index=0;
michael@0 1729 U16_APPEND_UNSAFE(target, index, c);
michael@0 1730 target+=index;
michael@0 1731 }
michael@0 1732 if (target < targetLimit) {
michael@0 1733 *target = 0;
michael@0 1734 }
michael@0 1735 return (int32_t)(target-targetStart);
michael@0 1736 }
michael@0 1737
michael@0 1738 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial