michael@0: /* michael@0: ***************************************************************** michael@0: * Copyright (c) 2002-2011, International Business Machines Corporation michael@0: * and others. All Rights Reserved. michael@0: ***************************************************************** michael@0: * Date Name Description michael@0: * 06/06/2002 aliu Creation. michael@0: ***************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/uscript.h" michael@0: #include "nultrans.h" michael@0: #include "anytrans.h" michael@0: #include "uvector.h" michael@0: #include "tridpars.h" michael@0: #include "hash.h" michael@0: #include "putilimp.h" michael@0: #include "uinvchar.h" michael@0: michael@0: //------------------------------------------------------------ michael@0: // Constants michael@0: michael@0: static const UChar TARGET_SEP = 45; // '-' michael@0: static const UChar VARIANT_SEP = 47; // '/' michael@0: static const UChar ANY[] = {65,110,121,0}; // "Any" michael@0: static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" michael@0: static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-" michael@0: michael@0: //------------------------------------------------------------ michael@0: michael@0: U_CDECL_BEGIN michael@0: /** michael@0: * Deleter function for Transliterator*. michael@0: */ michael@0: static void U_CALLCONV michael@0: _deleteTransliterator(void *obj) { michael@0: delete (icu::Transliterator*) obj; michael@0: } michael@0: U_CDECL_END michael@0: michael@0: //------------------------------------------------------------ michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: //------------------------------------------------------------ michael@0: // ScriptRunIterator michael@0: michael@0: /** michael@0: * Returns a series of ranges corresponding to scripts. They will be michael@0: * of the form: michael@0: * michael@0: * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second michael@0: * | | - first run (start, limit) michael@0: * | | - second run (start, limit) michael@0: * michael@0: * That is, the runs will overlap. The reason for this is so that a michael@0: * transliterator can consider common characters both before and after michael@0: * the scripts. michael@0: */ michael@0: class ScriptRunIterator : public UMemory { michael@0: private: michael@0: const Replaceable& text; michael@0: int32_t textStart; michael@0: int32_t textLimit; michael@0: michael@0: public: michael@0: /** michael@0: * The code of the current run, valid after next() returns. May michael@0: * be USCRIPT_INVALID_CODE if and only if the entire text is michael@0: * COMMON/INHERITED. michael@0: */ michael@0: UScriptCode scriptCode; michael@0: michael@0: /** michael@0: * The start of the run, inclusive, valid after next() returns. michael@0: */ michael@0: int32_t start; michael@0: michael@0: /** michael@0: * The end of the run, exclusive, valid after next() returns. michael@0: */ michael@0: int32_t limit; michael@0: michael@0: /** michael@0: * Constructs a run iterator over the given text from start michael@0: * (inclusive) to limit (exclusive). michael@0: */ michael@0: ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); michael@0: michael@0: /** michael@0: * Returns TRUE if there are any more runs. TRUE is always michael@0: * returned at least once. Upon return, the caller should michael@0: * examine scriptCode, start, and limit. michael@0: */ michael@0: UBool next(); michael@0: michael@0: /** michael@0: * Adjusts internal indices for a change in the limit index of the michael@0: * given delta. A positive delta means the limit has increased. michael@0: */ michael@0: void adjustLimit(int32_t delta); michael@0: michael@0: private: michael@0: ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class michael@0: ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class michael@0: }; michael@0: michael@0: ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, michael@0: int32_t myStart, int32_t myLimit) : michael@0: text(theText) michael@0: { michael@0: textStart = myStart; michael@0: textLimit = myLimit; michael@0: limit = myStart; michael@0: } michael@0: michael@0: UBool ScriptRunIterator::next() { michael@0: UChar32 ch; michael@0: UScriptCode s; michael@0: UErrorCode ec = U_ZERO_ERROR; michael@0: michael@0: scriptCode = USCRIPT_INVALID_CODE; // don't know script yet michael@0: start = limit; michael@0: michael@0: // Are we done? michael@0: if (start == textLimit) { michael@0: return FALSE; michael@0: } michael@0: michael@0: // Move start back to include adjacent COMMON or INHERITED michael@0: // characters michael@0: while (start > textStart) { michael@0: ch = text.char32At(start - 1); // look back michael@0: s = uscript_getScript(ch, &ec); michael@0: if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { michael@0: --start; michael@0: } else { michael@0: break; michael@0: } michael@0: } michael@0: michael@0: // Move limit ahead to include COMMON, INHERITED, and characters michael@0: // of the current script. michael@0: while (limit < textLimit) { michael@0: ch = text.char32At(limit); // look ahead michael@0: s = uscript_getScript(ch, &ec); michael@0: if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { michael@0: if (scriptCode == USCRIPT_INVALID_CODE) { michael@0: scriptCode = s; michael@0: } else if (s != scriptCode) { michael@0: break; michael@0: } michael@0: } michael@0: ++limit; michael@0: } michael@0: michael@0: // Return TRUE even if the entire text is COMMON / INHERITED, in michael@0: // which case scriptCode will be USCRIPT_INVALID_CODE. michael@0: return TRUE; michael@0: } michael@0: michael@0: void ScriptRunIterator::adjustLimit(int32_t delta) { michael@0: limit += delta; michael@0: textLimit += delta; michael@0: } michael@0: michael@0: //------------------------------------------------------------ michael@0: // AnyTransliterator michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) michael@0: michael@0: AnyTransliterator::AnyTransliterator(const UnicodeString& id, michael@0: const UnicodeString& theTarget, michael@0: const UnicodeString& theVariant, michael@0: UScriptCode theTargetScript, michael@0: UErrorCode& ec) : michael@0: Transliterator(id, NULL), michael@0: targetScript(theTargetScript) michael@0: { michael@0: cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); michael@0: if (U_FAILURE(ec)) { michael@0: return; michael@0: } michael@0: uhash_setValueDeleter(cache, _deleteTransliterator); michael@0: michael@0: target = theTarget; michael@0: if (theVariant.length() > 0) { michael@0: target.append(VARIANT_SEP).append(theVariant); michael@0: } michael@0: } michael@0: michael@0: AnyTransliterator::~AnyTransliterator() { michael@0: uhash_close(cache); michael@0: } michael@0: michael@0: /** michael@0: * Copy constructor. michael@0: */ michael@0: AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : michael@0: Transliterator(o), michael@0: target(o.target), michael@0: targetScript(o.targetScript) michael@0: { michael@0: // Don't copy the cache contents michael@0: UErrorCode ec = U_ZERO_ERROR; michael@0: cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); michael@0: if (U_FAILURE(ec)) { michael@0: return; michael@0: } michael@0: uhash_setValueDeleter(cache, _deleteTransliterator); michael@0: } michael@0: michael@0: /** michael@0: * Transliterator API. michael@0: */ michael@0: Transliterator* AnyTransliterator::clone() const { michael@0: return new AnyTransliterator(*this); michael@0: } michael@0: michael@0: /** michael@0: * Implements {@link Transliterator#handleTransliterate}. michael@0: */ michael@0: void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, michael@0: UBool isIncremental) const { michael@0: int32_t allStart = pos.start; michael@0: int32_t allLimit = pos.limit; michael@0: michael@0: ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); michael@0: michael@0: while (it.next()) { michael@0: // Ignore runs in the ante context michael@0: if (it.limit <= allStart) continue; michael@0: michael@0: // Try to instantiate transliterator from it.scriptCode to michael@0: // our target or target/variant michael@0: Transliterator* t = getTransliterator(it.scriptCode); michael@0: michael@0: if (t == NULL) { michael@0: // We have no transliterator. Do nothing, but keep michael@0: // pos.start up to date. michael@0: pos.start = it.limit; michael@0: continue; michael@0: } michael@0: michael@0: // If the run end is before the transliteration limit, do michael@0: // a non-incremental transliteration. Otherwise do an michael@0: // incremental one. michael@0: UBool incremental = isIncremental && (it.limit >= allLimit); michael@0: michael@0: pos.start = uprv_max(allStart, it.start); michael@0: pos.limit = uprv_min(allLimit, it.limit); michael@0: int32_t limit = pos.limit; michael@0: t->filteredTransliterate(text, pos, incremental); michael@0: int32_t delta = pos.limit - limit; michael@0: allLimit += delta; michael@0: it.adjustLimit(delta); michael@0: michael@0: // We're done if we enter the post context michael@0: if (it.limit >= allLimit) break; michael@0: } michael@0: michael@0: // Restore limit. pos.start is fine where the last transliterator michael@0: // left it, or at the end of the last run. michael@0: pos.limit = allLimit; michael@0: } michael@0: michael@0: Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { michael@0: michael@0: if (source == targetScript || source == USCRIPT_INVALID_CODE) { michael@0: return NULL; michael@0: } michael@0: michael@0: Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source); michael@0: if (t == NULL) { michael@0: UErrorCode ec = U_ZERO_ERROR; michael@0: UnicodeString sourceName(uscript_getName(source), -1, US_INV); michael@0: UnicodeString id(sourceName); michael@0: id.append(TARGET_SEP).append(target); michael@0: michael@0: t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); michael@0: if (U_FAILURE(ec) || t == NULL) { michael@0: delete t; michael@0: michael@0: // Try to pivot around Latin, our most common script michael@0: id = sourceName; michael@0: id.append(LATIN_PIVOT, -1).append(target); michael@0: t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); michael@0: if (U_FAILURE(ec) || t == NULL) { michael@0: delete t; michael@0: t = NULL; michael@0: } michael@0: } michael@0: michael@0: if (t != NULL) { michael@0: uhash_iput(cache, (int32_t) source, t, &ec); michael@0: } michael@0: } michael@0: michael@0: return t; michael@0: } michael@0: michael@0: /** michael@0: * Return the script code for a given name, or -1 if not found. michael@0: */ michael@0: static UScriptCode scriptNameToCode(const UnicodeString& name) { michael@0: char buf[128]; michael@0: UScriptCode code; michael@0: UErrorCode ec = U_ZERO_ERROR; michael@0: int32_t nameLen = name.length(); michael@0: UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); michael@0: michael@0: if (isInvariant) { michael@0: name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); michael@0: buf[127] = 0; // Make sure that we NULL terminate the string. michael@0: } michael@0: if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) michael@0: { michael@0: code = USCRIPT_INVALID_CODE; michael@0: } michael@0: return code; michael@0: } michael@0: michael@0: /** michael@0: * Registers standard transliterators with the system. Called by michael@0: * Transliterator during initialization. Scan all current targets and michael@0: * register those that are scripts T as Any-T/V. michael@0: */ michael@0: void AnyTransliterator::registerIDs() { michael@0: michael@0: UErrorCode ec = U_ZERO_ERROR; michael@0: Hashtable seen(TRUE, ec); michael@0: michael@0: int32_t sourceCount = Transliterator::_countAvailableSources(); michael@0: for (int32_t s=0; s= 1); michael@0: for (int32_t v=0; v