1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/anytrans.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,386 @@ 1.4 +/* 1.5 +***************************************************************** 1.6 +* Copyright (c) 2002-2011, International Business Machines Corporation 1.7 +* and others. All Rights Reserved. 1.8 +***************************************************************** 1.9 +* Date Name Description 1.10 +* 06/06/2002 aliu Creation. 1.11 +***************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "unicode/uobject.h" 1.19 +#include "unicode/uscript.h" 1.20 +#include "nultrans.h" 1.21 +#include "anytrans.h" 1.22 +#include "uvector.h" 1.23 +#include "tridpars.h" 1.24 +#include "hash.h" 1.25 +#include "putilimp.h" 1.26 +#include "uinvchar.h" 1.27 + 1.28 +//------------------------------------------------------------ 1.29 +// Constants 1.30 + 1.31 +static const UChar TARGET_SEP = 45; // '-' 1.32 +static const UChar VARIANT_SEP = 47; // '/' 1.33 +static const UChar ANY[] = {65,110,121,0}; // "Any" 1.34 +static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" 1.35 +static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-" 1.36 + 1.37 +//------------------------------------------------------------ 1.38 + 1.39 +U_CDECL_BEGIN 1.40 +/** 1.41 + * Deleter function for Transliterator*. 1.42 + */ 1.43 +static void U_CALLCONV 1.44 +_deleteTransliterator(void *obj) { 1.45 + delete (icu::Transliterator*) obj; 1.46 +} 1.47 +U_CDECL_END 1.48 + 1.49 +//------------------------------------------------------------ 1.50 + 1.51 +U_NAMESPACE_BEGIN 1.52 + 1.53 +//------------------------------------------------------------ 1.54 +// ScriptRunIterator 1.55 + 1.56 +/** 1.57 + * Returns a series of ranges corresponding to scripts. They will be 1.58 + * of the form: 1.59 + * 1.60 + * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second 1.61 + * | | - first run (start, limit) 1.62 + * | | - second run (start, limit) 1.63 + * 1.64 + * That is, the runs will overlap. The reason for this is so that a 1.65 + * transliterator can consider common characters both before and after 1.66 + * the scripts. 1.67 + */ 1.68 +class ScriptRunIterator : public UMemory { 1.69 +private: 1.70 + const Replaceable& text; 1.71 + int32_t textStart; 1.72 + int32_t textLimit; 1.73 + 1.74 +public: 1.75 + /** 1.76 + * The code of the current run, valid after next() returns. May 1.77 + * be USCRIPT_INVALID_CODE if and only if the entire text is 1.78 + * COMMON/INHERITED. 1.79 + */ 1.80 + UScriptCode scriptCode; 1.81 + 1.82 + /** 1.83 + * The start of the run, inclusive, valid after next() returns. 1.84 + */ 1.85 + int32_t start; 1.86 + 1.87 + /** 1.88 + * The end of the run, exclusive, valid after next() returns. 1.89 + */ 1.90 + int32_t limit; 1.91 + 1.92 + /** 1.93 + * Constructs a run iterator over the given text from start 1.94 + * (inclusive) to limit (exclusive). 1.95 + */ 1.96 + ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); 1.97 + 1.98 + /** 1.99 + * Returns TRUE if there are any more runs. TRUE is always 1.100 + * returned at least once. Upon return, the caller should 1.101 + * examine scriptCode, start, and limit. 1.102 + */ 1.103 + UBool next(); 1.104 + 1.105 + /** 1.106 + * Adjusts internal indices for a change in the limit index of the 1.107 + * given delta. A positive delta means the limit has increased. 1.108 + */ 1.109 + void adjustLimit(int32_t delta); 1.110 + 1.111 +private: 1.112 + ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class 1.113 + ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class 1.114 +}; 1.115 + 1.116 +ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, 1.117 + int32_t myStart, int32_t myLimit) : 1.118 + text(theText) 1.119 +{ 1.120 + textStart = myStart; 1.121 + textLimit = myLimit; 1.122 + limit = myStart; 1.123 +} 1.124 + 1.125 +UBool ScriptRunIterator::next() { 1.126 + UChar32 ch; 1.127 + UScriptCode s; 1.128 + UErrorCode ec = U_ZERO_ERROR; 1.129 + 1.130 + scriptCode = USCRIPT_INVALID_CODE; // don't know script yet 1.131 + start = limit; 1.132 + 1.133 + // Are we done? 1.134 + if (start == textLimit) { 1.135 + return FALSE; 1.136 + } 1.137 + 1.138 + // Move start back to include adjacent COMMON or INHERITED 1.139 + // characters 1.140 + while (start > textStart) { 1.141 + ch = text.char32At(start - 1); // look back 1.142 + s = uscript_getScript(ch, &ec); 1.143 + if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { 1.144 + --start; 1.145 + } else { 1.146 + break; 1.147 + } 1.148 + } 1.149 + 1.150 + // Move limit ahead to include COMMON, INHERITED, and characters 1.151 + // of the current script. 1.152 + while (limit < textLimit) { 1.153 + ch = text.char32At(limit); // look ahead 1.154 + s = uscript_getScript(ch, &ec); 1.155 + if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { 1.156 + if (scriptCode == USCRIPT_INVALID_CODE) { 1.157 + scriptCode = s; 1.158 + } else if (s != scriptCode) { 1.159 + break; 1.160 + } 1.161 + } 1.162 + ++limit; 1.163 + } 1.164 + 1.165 + // Return TRUE even if the entire text is COMMON / INHERITED, in 1.166 + // which case scriptCode will be USCRIPT_INVALID_CODE. 1.167 + return TRUE; 1.168 +} 1.169 + 1.170 +void ScriptRunIterator::adjustLimit(int32_t delta) { 1.171 + limit += delta; 1.172 + textLimit += delta; 1.173 +} 1.174 + 1.175 +//------------------------------------------------------------ 1.176 +// AnyTransliterator 1.177 + 1.178 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) 1.179 + 1.180 +AnyTransliterator::AnyTransliterator(const UnicodeString& id, 1.181 + const UnicodeString& theTarget, 1.182 + const UnicodeString& theVariant, 1.183 + UScriptCode theTargetScript, 1.184 + UErrorCode& ec) : 1.185 + Transliterator(id, NULL), 1.186 + targetScript(theTargetScript) 1.187 +{ 1.188 + cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); 1.189 + if (U_FAILURE(ec)) { 1.190 + return; 1.191 + } 1.192 + uhash_setValueDeleter(cache, _deleteTransliterator); 1.193 + 1.194 + target = theTarget; 1.195 + if (theVariant.length() > 0) { 1.196 + target.append(VARIANT_SEP).append(theVariant); 1.197 + } 1.198 +} 1.199 + 1.200 +AnyTransliterator::~AnyTransliterator() { 1.201 + uhash_close(cache); 1.202 +} 1.203 + 1.204 +/** 1.205 + * Copy constructor. 1.206 + */ 1.207 +AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : 1.208 + Transliterator(o), 1.209 + target(o.target), 1.210 + targetScript(o.targetScript) 1.211 +{ 1.212 + // Don't copy the cache contents 1.213 + UErrorCode ec = U_ZERO_ERROR; 1.214 + cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); 1.215 + if (U_FAILURE(ec)) { 1.216 + return; 1.217 + } 1.218 + uhash_setValueDeleter(cache, _deleteTransliterator); 1.219 +} 1.220 + 1.221 +/** 1.222 + * Transliterator API. 1.223 + */ 1.224 +Transliterator* AnyTransliterator::clone() const { 1.225 + return new AnyTransliterator(*this); 1.226 +} 1.227 + 1.228 +/** 1.229 + * Implements {@link Transliterator#handleTransliterate}. 1.230 + */ 1.231 +void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, 1.232 + UBool isIncremental) const { 1.233 + int32_t allStart = pos.start; 1.234 + int32_t allLimit = pos.limit; 1.235 + 1.236 + ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); 1.237 + 1.238 + while (it.next()) { 1.239 + // Ignore runs in the ante context 1.240 + if (it.limit <= allStart) continue; 1.241 + 1.242 + // Try to instantiate transliterator from it.scriptCode to 1.243 + // our target or target/variant 1.244 + Transliterator* t = getTransliterator(it.scriptCode); 1.245 + 1.246 + if (t == NULL) { 1.247 + // We have no transliterator. Do nothing, but keep 1.248 + // pos.start up to date. 1.249 + pos.start = it.limit; 1.250 + continue; 1.251 + } 1.252 + 1.253 + // If the run end is before the transliteration limit, do 1.254 + // a non-incremental transliteration. Otherwise do an 1.255 + // incremental one. 1.256 + UBool incremental = isIncremental && (it.limit >= allLimit); 1.257 + 1.258 + pos.start = uprv_max(allStart, it.start); 1.259 + pos.limit = uprv_min(allLimit, it.limit); 1.260 + int32_t limit = pos.limit; 1.261 + t->filteredTransliterate(text, pos, incremental); 1.262 + int32_t delta = pos.limit - limit; 1.263 + allLimit += delta; 1.264 + it.adjustLimit(delta); 1.265 + 1.266 + // We're done if we enter the post context 1.267 + if (it.limit >= allLimit) break; 1.268 + } 1.269 + 1.270 + // Restore limit. pos.start is fine where the last transliterator 1.271 + // left it, or at the end of the last run. 1.272 + pos.limit = allLimit; 1.273 +} 1.274 + 1.275 +Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { 1.276 + 1.277 + if (source == targetScript || source == USCRIPT_INVALID_CODE) { 1.278 + return NULL; 1.279 + } 1.280 + 1.281 + Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source); 1.282 + if (t == NULL) { 1.283 + UErrorCode ec = U_ZERO_ERROR; 1.284 + UnicodeString sourceName(uscript_getName(source), -1, US_INV); 1.285 + UnicodeString id(sourceName); 1.286 + id.append(TARGET_SEP).append(target); 1.287 + 1.288 + t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); 1.289 + if (U_FAILURE(ec) || t == NULL) { 1.290 + delete t; 1.291 + 1.292 + // Try to pivot around Latin, our most common script 1.293 + id = sourceName; 1.294 + id.append(LATIN_PIVOT, -1).append(target); 1.295 + t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); 1.296 + if (U_FAILURE(ec) || t == NULL) { 1.297 + delete t; 1.298 + t = NULL; 1.299 + } 1.300 + } 1.301 + 1.302 + if (t != NULL) { 1.303 + uhash_iput(cache, (int32_t) source, t, &ec); 1.304 + } 1.305 + } 1.306 + 1.307 + return t; 1.308 +} 1.309 + 1.310 +/** 1.311 + * Return the script code for a given name, or -1 if not found. 1.312 + */ 1.313 +static UScriptCode scriptNameToCode(const UnicodeString& name) { 1.314 + char buf[128]; 1.315 + UScriptCode code; 1.316 + UErrorCode ec = U_ZERO_ERROR; 1.317 + int32_t nameLen = name.length(); 1.318 + UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); 1.319 + 1.320 + if (isInvariant) { 1.321 + name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); 1.322 + buf[127] = 0; // Make sure that we NULL terminate the string. 1.323 + } 1.324 + if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) 1.325 + { 1.326 + code = USCRIPT_INVALID_CODE; 1.327 + } 1.328 + return code; 1.329 +} 1.330 + 1.331 +/** 1.332 + * Registers standard transliterators with the system. Called by 1.333 + * Transliterator during initialization. Scan all current targets and 1.334 + * register those that are scripts T as Any-T/V. 1.335 + */ 1.336 +void AnyTransliterator::registerIDs() { 1.337 + 1.338 + UErrorCode ec = U_ZERO_ERROR; 1.339 + Hashtable seen(TRUE, ec); 1.340 + 1.341 + int32_t sourceCount = Transliterator::_countAvailableSources(); 1.342 + for (int32_t s=0; s<sourceCount; ++s) { 1.343 + UnicodeString source; 1.344 + Transliterator::_getAvailableSource(s, source); 1.345 + 1.346 + // Ignore the "Any" source 1.347 + if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; 1.348 + 1.349 + int32_t targetCount = Transliterator::_countAvailableTargets(source); 1.350 + for (int32_t t=0; t<targetCount; ++t) { 1.351 + UnicodeString target; 1.352 + Transliterator::_getAvailableTarget(t, source, target); 1.353 + 1.354 + // Only process each target once 1.355 + if (seen.geti(target) != 0) continue; 1.356 + ec = U_ZERO_ERROR; 1.357 + seen.puti(target, 1, ec); 1.358 + 1.359 + // Get the script code for the target. If not a script, ignore. 1.360 + UScriptCode targetScript = scriptNameToCode(target); 1.361 + if (targetScript == USCRIPT_INVALID_CODE) continue; 1.362 + 1.363 + int32_t variantCount = Transliterator::_countAvailableVariants(source, target); 1.364 + // assert(variantCount >= 1); 1.365 + for (int32_t v=0; v<variantCount; ++v) { 1.366 + UnicodeString variant; 1.367 + Transliterator::_getAvailableVariant(v, source, target, variant); 1.368 + 1.369 + UnicodeString id; 1.370 + TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id); 1.371 + ec = U_ZERO_ERROR; 1.372 + AnyTransliterator* t = new AnyTransliterator(id, target, variant, 1.373 + targetScript, ec); 1.374 + if (U_FAILURE(ec)) { 1.375 + delete t; 1.376 + } else { 1.377 + Transliterator::_registerInstance(t); 1.378 + Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE); 1.379 + } 1.380 + } 1.381 + } 1.382 + } 1.383 +} 1.384 + 1.385 +U_NAMESPACE_END 1.386 + 1.387 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.388 + 1.389 +//eof