intl/icu/source/i18n/anytrans.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *****************************************************************
michael@0 3 * Copyright (c) 2002-2011, International Business Machines Corporation
michael@0 4 * and others. All Rights Reserved.
michael@0 5 *****************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 06/06/2002 aliu Creation.
michael@0 8 *****************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_TRANSLITERATION
michael@0 14
michael@0 15 #include "unicode/uobject.h"
michael@0 16 #include "unicode/uscript.h"
michael@0 17 #include "nultrans.h"
michael@0 18 #include "anytrans.h"
michael@0 19 #include "uvector.h"
michael@0 20 #include "tridpars.h"
michael@0 21 #include "hash.h"
michael@0 22 #include "putilimp.h"
michael@0 23 #include "uinvchar.h"
michael@0 24
michael@0 25 //------------------------------------------------------------
michael@0 26 // Constants
michael@0 27
michael@0 28 static const UChar TARGET_SEP = 45; // '-'
michael@0 29 static const UChar VARIANT_SEP = 47; // '/'
michael@0 30 static const UChar ANY[] = {65,110,121,0}; // "Any"
michael@0 31 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
michael@0 32 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
michael@0 33
michael@0 34 //------------------------------------------------------------
michael@0 35
michael@0 36 U_CDECL_BEGIN
michael@0 37 /**
michael@0 38 * Deleter function for Transliterator*.
michael@0 39 */
michael@0 40 static void U_CALLCONV
michael@0 41 _deleteTransliterator(void *obj) {
michael@0 42 delete (icu::Transliterator*) obj;
michael@0 43 }
michael@0 44 U_CDECL_END
michael@0 45
michael@0 46 //------------------------------------------------------------
michael@0 47
michael@0 48 U_NAMESPACE_BEGIN
michael@0 49
michael@0 50 //------------------------------------------------------------
michael@0 51 // ScriptRunIterator
michael@0 52
michael@0 53 /**
michael@0 54 * Returns a series of ranges corresponding to scripts. They will be
michael@0 55 * of the form:
michael@0 56 *
michael@0 57 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
michael@0 58 * | | - first run (start, limit)
michael@0 59 * | | - second run (start, limit)
michael@0 60 *
michael@0 61 * That is, the runs will overlap. The reason for this is so that a
michael@0 62 * transliterator can consider common characters both before and after
michael@0 63 * the scripts.
michael@0 64 */
michael@0 65 class ScriptRunIterator : public UMemory {
michael@0 66 private:
michael@0 67 const Replaceable& text;
michael@0 68 int32_t textStart;
michael@0 69 int32_t textLimit;
michael@0 70
michael@0 71 public:
michael@0 72 /**
michael@0 73 * The code of the current run, valid after next() returns. May
michael@0 74 * be USCRIPT_INVALID_CODE if and only if the entire text is
michael@0 75 * COMMON/INHERITED.
michael@0 76 */
michael@0 77 UScriptCode scriptCode;
michael@0 78
michael@0 79 /**
michael@0 80 * The start of the run, inclusive, valid after next() returns.
michael@0 81 */
michael@0 82 int32_t start;
michael@0 83
michael@0 84 /**
michael@0 85 * The end of the run, exclusive, valid after next() returns.
michael@0 86 */
michael@0 87 int32_t limit;
michael@0 88
michael@0 89 /**
michael@0 90 * Constructs a run iterator over the given text from start
michael@0 91 * (inclusive) to limit (exclusive).
michael@0 92 */
michael@0 93 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
michael@0 94
michael@0 95 /**
michael@0 96 * Returns TRUE if there are any more runs. TRUE is always
michael@0 97 * returned at least once. Upon return, the caller should
michael@0 98 * examine scriptCode, start, and limit.
michael@0 99 */
michael@0 100 UBool next();
michael@0 101
michael@0 102 /**
michael@0 103 * Adjusts internal indices for a change in the limit index of the
michael@0 104 * given delta. A positive delta means the limit has increased.
michael@0 105 */
michael@0 106 void adjustLimit(int32_t delta);
michael@0 107
michael@0 108 private:
michael@0 109 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
michael@0 110 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
michael@0 111 };
michael@0 112
michael@0 113 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
michael@0 114 int32_t myStart, int32_t myLimit) :
michael@0 115 text(theText)
michael@0 116 {
michael@0 117 textStart = myStart;
michael@0 118 textLimit = myLimit;
michael@0 119 limit = myStart;
michael@0 120 }
michael@0 121
michael@0 122 UBool ScriptRunIterator::next() {
michael@0 123 UChar32 ch;
michael@0 124 UScriptCode s;
michael@0 125 UErrorCode ec = U_ZERO_ERROR;
michael@0 126
michael@0 127 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
michael@0 128 start = limit;
michael@0 129
michael@0 130 // Are we done?
michael@0 131 if (start == textLimit) {
michael@0 132 return FALSE;
michael@0 133 }
michael@0 134
michael@0 135 // Move start back to include adjacent COMMON or INHERITED
michael@0 136 // characters
michael@0 137 while (start > textStart) {
michael@0 138 ch = text.char32At(start - 1); // look back
michael@0 139 s = uscript_getScript(ch, &ec);
michael@0 140 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
michael@0 141 --start;
michael@0 142 } else {
michael@0 143 break;
michael@0 144 }
michael@0 145 }
michael@0 146
michael@0 147 // Move limit ahead to include COMMON, INHERITED, and characters
michael@0 148 // of the current script.
michael@0 149 while (limit < textLimit) {
michael@0 150 ch = text.char32At(limit); // look ahead
michael@0 151 s = uscript_getScript(ch, &ec);
michael@0 152 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
michael@0 153 if (scriptCode == USCRIPT_INVALID_CODE) {
michael@0 154 scriptCode = s;
michael@0 155 } else if (s != scriptCode) {
michael@0 156 break;
michael@0 157 }
michael@0 158 }
michael@0 159 ++limit;
michael@0 160 }
michael@0 161
michael@0 162 // Return TRUE even if the entire text is COMMON / INHERITED, in
michael@0 163 // which case scriptCode will be USCRIPT_INVALID_CODE.
michael@0 164 return TRUE;
michael@0 165 }
michael@0 166
michael@0 167 void ScriptRunIterator::adjustLimit(int32_t delta) {
michael@0 168 limit += delta;
michael@0 169 textLimit += delta;
michael@0 170 }
michael@0 171
michael@0 172 //------------------------------------------------------------
michael@0 173 // AnyTransliterator
michael@0 174
michael@0 175 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
michael@0 176
michael@0 177 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
michael@0 178 const UnicodeString& theTarget,
michael@0 179 const UnicodeString& theVariant,
michael@0 180 UScriptCode theTargetScript,
michael@0 181 UErrorCode& ec) :
michael@0 182 Transliterator(id, NULL),
michael@0 183 targetScript(theTargetScript)
michael@0 184 {
michael@0 185 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
michael@0 186 if (U_FAILURE(ec)) {
michael@0 187 return;
michael@0 188 }
michael@0 189 uhash_setValueDeleter(cache, _deleteTransliterator);
michael@0 190
michael@0 191 target = theTarget;
michael@0 192 if (theVariant.length() > 0) {
michael@0 193 target.append(VARIANT_SEP).append(theVariant);
michael@0 194 }
michael@0 195 }
michael@0 196
michael@0 197 AnyTransliterator::~AnyTransliterator() {
michael@0 198 uhash_close(cache);
michael@0 199 }
michael@0 200
michael@0 201 /**
michael@0 202 * Copy constructor.
michael@0 203 */
michael@0 204 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
michael@0 205 Transliterator(o),
michael@0 206 target(o.target),
michael@0 207 targetScript(o.targetScript)
michael@0 208 {
michael@0 209 // Don't copy the cache contents
michael@0 210 UErrorCode ec = U_ZERO_ERROR;
michael@0 211 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
michael@0 212 if (U_FAILURE(ec)) {
michael@0 213 return;
michael@0 214 }
michael@0 215 uhash_setValueDeleter(cache, _deleteTransliterator);
michael@0 216 }
michael@0 217
michael@0 218 /**
michael@0 219 * Transliterator API.
michael@0 220 */
michael@0 221 Transliterator* AnyTransliterator::clone() const {
michael@0 222 return new AnyTransliterator(*this);
michael@0 223 }
michael@0 224
michael@0 225 /**
michael@0 226 * Implements {@link Transliterator#handleTransliterate}.
michael@0 227 */
michael@0 228 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
michael@0 229 UBool isIncremental) const {
michael@0 230 int32_t allStart = pos.start;
michael@0 231 int32_t allLimit = pos.limit;
michael@0 232
michael@0 233 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
michael@0 234
michael@0 235 while (it.next()) {
michael@0 236 // Ignore runs in the ante context
michael@0 237 if (it.limit <= allStart) continue;
michael@0 238
michael@0 239 // Try to instantiate transliterator from it.scriptCode to
michael@0 240 // our target or target/variant
michael@0 241 Transliterator* t = getTransliterator(it.scriptCode);
michael@0 242
michael@0 243 if (t == NULL) {
michael@0 244 // We have no transliterator. Do nothing, but keep
michael@0 245 // pos.start up to date.
michael@0 246 pos.start = it.limit;
michael@0 247 continue;
michael@0 248 }
michael@0 249
michael@0 250 // If the run end is before the transliteration limit, do
michael@0 251 // a non-incremental transliteration. Otherwise do an
michael@0 252 // incremental one.
michael@0 253 UBool incremental = isIncremental && (it.limit >= allLimit);
michael@0 254
michael@0 255 pos.start = uprv_max(allStart, it.start);
michael@0 256 pos.limit = uprv_min(allLimit, it.limit);
michael@0 257 int32_t limit = pos.limit;
michael@0 258 t->filteredTransliterate(text, pos, incremental);
michael@0 259 int32_t delta = pos.limit - limit;
michael@0 260 allLimit += delta;
michael@0 261 it.adjustLimit(delta);
michael@0 262
michael@0 263 // We're done if we enter the post context
michael@0 264 if (it.limit >= allLimit) break;
michael@0 265 }
michael@0 266
michael@0 267 // Restore limit. pos.start is fine where the last transliterator
michael@0 268 // left it, or at the end of the last run.
michael@0 269 pos.limit = allLimit;
michael@0 270 }
michael@0 271
michael@0 272 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
michael@0 273
michael@0 274 if (source == targetScript || source == USCRIPT_INVALID_CODE) {
michael@0 275 return NULL;
michael@0 276 }
michael@0 277
michael@0 278 Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
michael@0 279 if (t == NULL) {
michael@0 280 UErrorCode ec = U_ZERO_ERROR;
michael@0 281 UnicodeString sourceName(uscript_getName(source), -1, US_INV);
michael@0 282 UnicodeString id(sourceName);
michael@0 283 id.append(TARGET_SEP).append(target);
michael@0 284
michael@0 285 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
michael@0 286 if (U_FAILURE(ec) || t == NULL) {
michael@0 287 delete t;
michael@0 288
michael@0 289 // Try to pivot around Latin, our most common script
michael@0 290 id = sourceName;
michael@0 291 id.append(LATIN_PIVOT, -1).append(target);
michael@0 292 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
michael@0 293 if (U_FAILURE(ec) || t == NULL) {
michael@0 294 delete t;
michael@0 295 t = NULL;
michael@0 296 }
michael@0 297 }
michael@0 298
michael@0 299 if (t != NULL) {
michael@0 300 uhash_iput(cache, (int32_t) source, t, &ec);
michael@0 301 }
michael@0 302 }
michael@0 303
michael@0 304 return t;
michael@0 305 }
michael@0 306
michael@0 307 /**
michael@0 308 * Return the script code for a given name, or -1 if not found.
michael@0 309 */
michael@0 310 static UScriptCode scriptNameToCode(const UnicodeString& name) {
michael@0 311 char buf[128];
michael@0 312 UScriptCode code;
michael@0 313 UErrorCode ec = U_ZERO_ERROR;
michael@0 314 int32_t nameLen = name.length();
michael@0 315 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
michael@0 316
michael@0 317 if (isInvariant) {
michael@0 318 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
michael@0 319 buf[127] = 0; // Make sure that we NULL terminate the string.
michael@0 320 }
michael@0 321 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
michael@0 322 {
michael@0 323 code = USCRIPT_INVALID_CODE;
michael@0 324 }
michael@0 325 return code;
michael@0 326 }
michael@0 327
michael@0 328 /**
michael@0 329 * Registers standard transliterators with the system. Called by
michael@0 330 * Transliterator during initialization. Scan all current targets and
michael@0 331 * register those that are scripts T as Any-T/V.
michael@0 332 */
michael@0 333 void AnyTransliterator::registerIDs() {
michael@0 334
michael@0 335 UErrorCode ec = U_ZERO_ERROR;
michael@0 336 Hashtable seen(TRUE, ec);
michael@0 337
michael@0 338 int32_t sourceCount = Transliterator::_countAvailableSources();
michael@0 339 for (int32_t s=0; s<sourceCount; ++s) {
michael@0 340 UnicodeString source;
michael@0 341 Transliterator::_getAvailableSource(s, source);
michael@0 342
michael@0 343 // Ignore the "Any" source
michael@0 344 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
michael@0 345
michael@0 346 int32_t targetCount = Transliterator::_countAvailableTargets(source);
michael@0 347 for (int32_t t=0; t<targetCount; ++t) {
michael@0 348 UnicodeString target;
michael@0 349 Transliterator::_getAvailableTarget(t, source, target);
michael@0 350
michael@0 351 // Only process each target once
michael@0 352 if (seen.geti(target) != 0) continue;
michael@0 353 ec = U_ZERO_ERROR;
michael@0 354 seen.puti(target, 1, ec);
michael@0 355
michael@0 356 // Get the script code for the target. If not a script, ignore.
michael@0 357 UScriptCode targetScript = scriptNameToCode(target);
michael@0 358 if (targetScript == USCRIPT_INVALID_CODE) continue;
michael@0 359
michael@0 360 int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
michael@0 361 // assert(variantCount >= 1);
michael@0 362 for (int32_t v=0; v<variantCount; ++v) {
michael@0 363 UnicodeString variant;
michael@0 364 Transliterator::_getAvailableVariant(v, source, target, variant);
michael@0 365
michael@0 366 UnicodeString id;
michael@0 367 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
michael@0 368 ec = U_ZERO_ERROR;
michael@0 369 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
michael@0 370 targetScript, ec);
michael@0 371 if (U_FAILURE(ec)) {
michael@0 372 delete t;
michael@0 373 } else {
michael@0 374 Transliterator::_registerInstance(t);
michael@0 375 Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
michael@0 376 }
michael@0 377 }
michael@0 378 }
michael@0 379 }
michael@0 380 }
michael@0 381
michael@0 382 U_NAMESPACE_END
michael@0 383
michael@0 384 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0 385
michael@0 386 //eof

mercurial