michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 1999-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 11/17/99 aliu Creation. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/utf16.h" michael@0: #include "rbt_set.h" michael@0: #include "rbt_rule.h" michael@0: #include "cmemory.h" michael@0: #include "putilimp.h" michael@0: michael@0: U_CDECL_BEGIN michael@0: static void U_CALLCONV _deleteRule(void *rule) { michael@0: delete (icu::TransliterationRule *)rule; michael@0: } michael@0: U_CDECL_END michael@0: michael@0: //---------------------------------------------------------------------- michael@0: // BEGIN Debugging support michael@0: //---------------------------------------------------------------------- michael@0: michael@0: // #define DEBUG_RBT michael@0: michael@0: #ifdef DEBUG_RBT michael@0: #include michael@0: #include "charstr.h" michael@0: michael@0: /** michael@0: * @param appendTo result is appended to this param. michael@0: * @param input the string being transliterated michael@0: * @param pos the index struct michael@0: */ michael@0: static UnicodeString& _formatInput(UnicodeString &appendTo, michael@0: const UnicodeString& input, michael@0: const UTransPosition& pos) { michael@0: // Output a string of the form aaa{bbb|ccc|ddd}eee, where michael@0: // the {} indicate the context start and limit, and the || michael@0: // indicate the start and limit. michael@0: if (0 <= pos.contextStart && michael@0: pos.contextStart <= pos.start && michael@0: pos.start <= pos.limit && michael@0: pos.limit <= pos.contextLimit && michael@0: pos.contextLimit <= input.length()) { michael@0: michael@0: UnicodeString a, b, c, d, e; michael@0: input.extractBetween(0, pos.contextStart, a); michael@0: input.extractBetween(pos.contextStart, pos.start, b); michael@0: input.extractBetween(pos.start, pos.limit, c); michael@0: input.extractBetween(pos.limit, pos.contextLimit, d); michael@0: input.extractBetween(pos.contextLimit, input.length(), e); michael@0: appendTo.append(a).append((UChar)123/*{*/).append(b). michael@0: append((UChar)124/*|*/).append(c).append((UChar)124/*|*/).append(d). michael@0: append((UChar)125/*}*/).append(e); michael@0: } else { michael@0: appendTo.append("INVALID UTransPosition"); michael@0: //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" + michael@0: // pos.contextStart + ", s=" + pos.start + ", l=" + michael@0: // pos.limit + ", cl=" + pos.contextLimit + "} on " + michael@0: // input); michael@0: } michael@0: return appendTo; michael@0: } michael@0: michael@0: // Append a hex string to the target michael@0: UnicodeString& _appendHex(uint32_t number, michael@0: int32_t digits, michael@0: UnicodeString& target) { michael@0: static const UChar digitString[] = { michael@0: 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, michael@0: 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0 michael@0: }; michael@0: while (digits--) { michael@0: target += digitString[(number >> (digits*4)) & 0xF]; michael@0: } michael@0: return target; michael@0: } michael@0: michael@0: // Replace nonprintable characters with unicode escapes michael@0: UnicodeString& _escape(const UnicodeString &source, michael@0: UnicodeString &target) { michael@0: for (int32_t i = 0; i < source.length(); ) { michael@0: UChar32 ch = source.char32At(i); michael@0: i += U16_LENGTH(ch); michael@0: if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) { michael@0: if (ch <= 0xFFFF) { michael@0: target += "\\u"; michael@0: _appendHex(ch, 4, target); michael@0: } else { michael@0: target += "\\U"; michael@0: _appendHex(ch, 8, target); michael@0: } michael@0: } else { michael@0: target += ch; michael@0: } michael@0: } michael@0: return target; michael@0: } michael@0: michael@0: inline void _debugOut(const char* msg, TransliterationRule* rule, michael@0: const Replaceable& theText, UTransPosition& pos) { michael@0: UnicodeString buf(msg, ""); michael@0: if (rule) { michael@0: UnicodeString r; michael@0: rule->toRule(r, TRUE); michael@0: buf.append((UChar)32).append(r); michael@0: } michael@0: buf.append(UnicodeString(" => ", "")); michael@0: UnicodeString* text = (UnicodeString*)&theText; michael@0: _formatInput(buf, *text, pos); michael@0: UnicodeString esc; michael@0: _escape(buf, esc); michael@0: CharString cbuf(esc); michael@0: printf("%s\n", (const char*) cbuf); michael@0: } michael@0: michael@0: #else michael@0: #define _debugOut(msg, rule, theText, pos) michael@0: #endif michael@0: michael@0: //---------------------------------------------------------------------- michael@0: // END Debugging support michael@0: //---------------------------------------------------------------------- michael@0: michael@0: // Fill the precontext and postcontext with the patterns of the rules michael@0: // that are masking one another. michael@0: static void maskingError(const icu::TransliterationRule& rule1, michael@0: const icu::TransliterationRule& rule2, michael@0: UParseError& parseError) { michael@0: icu::UnicodeString r; michael@0: int32_t len; michael@0: michael@0: parseError.line = parseError.offset = -1; michael@0: michael@0: // for pre-context michael@0: rule1.toRule(r, FALSE); michael@0: len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1); michael@0: r.extract(0, len, parseError.preContext); michael@0: parseError.preContext[len] = 0; michael@0: michael@0: //for post-context michael@0: r.truncate(0); michael@0: rule2.toRule(r, FALSE); michael@0: len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1); michael@0: r.extract(0, len, parseError.postContext); michael@0: parseError.postContext[len] = 0; michael@0: } michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /** michael@0: * Construct a new empty rule set. michael@0: */ michael@0: TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : UMemory() { michael@0: ruleVector = new UVector(&_deleteRule, NULL, status); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: if (ruleVector == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: rules = NULL; michael@0: maxContextLength = 0; michael@0: } michael@0: michael@0: /** michael@0: * Copy constructor. michael@0: */ michael@0: TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) : michael@0: UMemory(other), michael@0: ruleVector(0), michael@0: rules(0), michael@0: maxContextLength(other.maxContextLength) { michael@0: michael@0: int32_t i, len; michael@0: uprv_memcpy(index, other.index, sizeof(index)); michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: ruleVector = new UVector(&_deleteRule, NULL, status); michael@0: if (other.ruleVector != 0 && ruleVector != 0 && U_SUCCESS(status)) { michael@0: len = other.ruleVector->size(); michael@0: for (i=0; ielementAt(i)); michael@0: // Null pointer test michael@0: if (tempTranslitRule == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: break; michael@0: } michael@0: ruleVector->addElement(tempTranslitRule, status); michael@0: if (U_FAILURE(status)) { michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: if (other.rules != 0 && U_SUCCESS(status)) { michael@0: UParseError p; michael@0: freeze(p, status); michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Destructor. michael@0: */ michael@0: TransliterationRuleSet::~TransliterationRuleSet() { michael@0: delete ruleVector; // This deletes the contained rules michael@0: uprv_free(rules); michael@0: } michael@0: michael@0: void TransliterationRuleSet::setData(const TransliterationRuleData* d) { michael@0: /** michael@0: * We assume that the ruleset has already been frozen. michael@0: */ michael@0: int32_t len = index[256]; // see freeze() michael@0: for (int32_t i=0; isetData(d); michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Return the maximum context length. michael@0: * @return the length of the longest preceding context. michael@0: */ michael@0: int32_t TransliterationRuleSet::getMaximumContextLength(void) const { michael@0: return maxContextLength; michael@0: } michael@0: michael@0: /** michael@0: * Add a rule to this set. Rules are added in order, and order is michael@0: * significant. The last call to this method must be followed by michael@0: * a call to freeze() before the rule set is used. michael@0: * michael@0: *

If freeze() has already been called, calling addRule() michael@0: * unfreezes the rules, and freeze() must be called again. michael@0: * michael@0: * @param adoptedRule the rule to add michael@0: */ michael@0: void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule, michael@0: UErrorCode& status) { michael@0: if (U_FAILURE(status)) { michael@0: delete adoptedRule; michael@0: return; michael@0: } michael@0: ruleVector->addElement(adoptedRule, status); michael@0: michael@0: int32_t len; michael@0: if ((len = adoptedRule->getContextLength()) > maxContextLength) { michael@0: maxContextLength = len; michael@0: } michael@0: michael@0: uprv_free(rules); michael@0: rules = 0; michael@0: } michael@0: michael@0: /** michael@0: * Check this for masked rules and index it to optimize performance. michael@0: * The sequence of operations is: (1) add rules to a set using michael@0: * addRule(); (2) freeze the set using michael@0: * freeze(); (3) use the rule set. If michael@0: * addRule() is called after calling this method, it michael@0: * invalidates this object, and this method must be called again. michael@0: * That is, freeze() may be called multiple times, michael@0: * although for optimal performance it shouldn't be. michael@0: */ michael@0: void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) { michael@0: /* Construct the rule array and index table. We reorder the michael@0: * rules by sorting them into 256 bins. Each bin contains all michael@0: * rules matching the index value for that bin. A rule michael@0: * matches an index value if string whose first key character michael@0: * has a low byte equal to the index value can match the rule. michael@0: * michael@0: * Each bin contains zero or more rules, in the same order michael@0: * they were found originally. However, the total rules in michael@0: * the bins may exceed the number in the original vector, michael@0: * since rules that have a variable as their first key michael@0: * character will generally fall into more than one bin. michael@0: * michael@0: * That is, each bin contains all rules that either have that michael@0: * first index value as their first key character, or have michael@0: * a set containing the index value as their first character. michael@0: */ michael@0: int32_t n = ruleVector->size(); michael@0: int32_t j; michael@0: int16_t x; michael@0: UVector v(2*n, status); // heuristic; adjust as needed michael@0: michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: /* Precompute the index values. This saves a LOT of time. michael@0: * Be careful not to call malloc(0). michael@0: */ michael@0: int16_t* indexValue = (int16_t*) uprv_malloc( sizeof(int16_t) * (n > 0 ? n : 1) ); michael@0: /* test for NULL */ michael@0: if (indexValue == 0) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: for (j=0; jelementAt(j); michael@0: indexValue[j] = r->getIndexValue(); michael@0: } michael@0: for (x=0; x<256; ++x) { michael@0: index[x] = v.size(); michael@0: for (j=0; j= 0) { michael@0: if (indexValue[j] == x) { michael@0: v.addElement(ruleVector->elementAt(j), status); michael@0: } michael@0: } else { michael@0: // If the indexValue is < 0, then the first key character is michael@0: // a set, and we must use the more time-consuming michael@0: // matchesIndexValue check. In practice this happens michael@0: // rarely, so we seldom tread this code path. michael@0: TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j); michael@0: if (r->matchesIndexValue((uint8_t)x)) { michael@0: v.addElement(r, status); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: uprv_free(indexValue); michael@0: index[256] = v.size(); michael@0: michael@0: /* Freeze things into an array. michael@0: */ michael@0: uprv_free(rules); // Contains alias pointers michael@0: michael@0: /* You can't do malloc(0)! */ michael@0: if (v.size() == 0) { michael@0: rules = NULL; michael@0: return; michael@0: } michael@0: rules = (TransliterationRule **)uprv_malloc(v.size() * sizeof(TransliterationRule *)); michael@0: /* test for NULL */ michael@0: if (rules == 0) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: for (j=0; jmasks(*r2)) { michael@0: //| if (errors == null) { michael@0: //| errors = new StringBuffer(); michael@0: //| } else { michael@0: //| errors.append("\n"); michael@0: //| } michael@0: //| errors.append("Rule " + r1 + " masks " + r2); michael@0: status = U_RULE_MASK_ERROR; michael@0: maskingError(*r1, *r2, parseError); michael@0: return; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: //if (errors != null) { michael@0: // throw new IllegalArgumentException(errors.toString()); michael@0: //} michael@0: } michael@0: michael@0: /** michael@0: * Transliterate the given text with the given UTransPosition michael@0: * indices. Return TRUE if the transliteration should continue michael@0: * or FALSE if it should halt (because of a U_PARTIAL_MATCH match). michael@0: * Note that FALSE is only ever returned if isIncremental is TRUE. michael@0: * @param text the text to be transliterated michael@0: * @param pos the position indices, which will be updated michael@0: * @param incremental if TRUE, assume new text may be inserted michael@0: * at index.limit, and return FALSE if thre is a partial match. michael@0: * @return TRUE unless a U_PARTIAL_MATCH has been obtained, michael@0: * indicating that transliteration should stop until more text michael@0: * arrives. michael@0: */ michael@0: UBool TransliterationRuleSet::transliterate(Replaceable& text, michael@0: UTransPosition& pos, michael@0: UBool incremental) { michael@0: int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF); michael@0: for (int32_t i=index[indexByte]; imatchAndReplace(text, pos, incremental); michael@0: switch (m) { michael@0: case U_MATCH: michael@0: _debugOut("match", rules[i], text, pos); michael@0: return TRUE; michael@0: case U_PARTIAL_MATCH: michael@0: _debugOut("partial match", rules[i], text, pos); michael@0: return FALSE; michael@0: default: /* Ram: added default to make GCC happy */ michael@0: break; michael@0: } michael@0: } michael@0: // No match or partial match from any rule michael@0: pos.start += U16_LENGTH(text.char32At(pos.start)); michael@0: _debugOut("no match", NULL, text, pos); michael@0: return TRUE; michael@0: } michael@0: michael@0: /** michael@0: * Create rule strings that represents this rule set. michael@0: */ michael@0: UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource, michael@0: UBool escapeUnprintable) const { michael@0: int32_t i; michael@0: int32_t count = ruleVector->size(); michael@0: ruleSource.truncate(0); michael@0: for (i=0; ielementAt(i); michael@0: r->toRule(ruleSource, escapeUnprintable); michael@0: } michael@0: return ruleSource; michael@0: } michael@0: michael@0: /** michael@0: * Return the set of all characters that may be modified michael@0: * (getTarget=false) or emitted (getTarget=true) by this set. michael@0: */ michael@0: UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result, michael@0: UBool getTarget) const michael@0: { michael@0: result.clear(); michael@0: int32_t count = ruleVector->size(); michael@0: for (int32_t i=0; ielementAt(i); michael@0: if (getTarget) { michael@0: r->addTargetSetTo(result); michael@0: } else { michael@0: r->addSourceSetTo(result); michael@0: } michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */