intl/icu/source/i18n/anytrans.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/anytrans.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,386 @@
     1.4 +/*
     1.5 +*****************************************************************
     1.6 +* Copyright (c) 2002-2011, International Business Machines Corporation
     1.7 +* and others.  All Rights Reserved.
     1.8 +*****************************************************************
     1.9 +* Date        Name        Description
    1.10 +* 06/06/2002  aliu        Creation.
    1.11 +*****************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "unicode/uobject.h"
    1.19 +#include "unicode/uscript.h"
    1.20 +#include "nultrans.h"
    1.21 +#include "anytrans.h"
    1.22 +#include "uvector.h"
    1.23 +#include "tridpars.h"
    1.24 +#include "hash.h"
    1.25 +#include "putilimp.h"
    1.26 +#include "uinvchar.h"
    1.27 +
    1.28 +//------------------------------------------------------------
    1.29 +// Constants
    1.30 +
    1.31 +static const UChar TARGET_SEP = 45; // '-'
    1.32 +static const UChar VARIANT_SEP = 47; // '/'
    1.33 +static const UChar ANY[] = {65,110,121,0}; // "Any"
    1.34 +static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
    1.35 +static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
    1.36 +
    1.37 +//------------------------------------------------------------
    1.38 +
    1.39 +U_CDECL_BEGIN
    1.40 +/**
    1.41 + * Deleter function for Transliterator*.
    1.42 + */
    1.43 +static void U_CALLCONV
    1.44 +_deleteTransliterator(void *obj) {
    1.45 +    delete (icu::Transliterator*) obj;    
    1.46 +}
    1.47 +U_CDECL_END
    1.48 +
    1.49 +//------------------------------------------------------------
    1.50 +
    1.51 +U_NAMESPACE_BEGIN
    1.52 +
    1.53 +//------------------------------------------------------------
    1.54 +// ScriptRunIterator
    1.55 +
    1.56 +/**
    1.57 + * Returns a series of ranges corresponding to scripts. They will be
    1.58 + * of the form:
    1.59 + *
    1.60 + * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
    1.61 + * |            |          - first run (start, limit)
    1.62 + *          |           |  - second run (start, limit)
    1.63 + *
    1.64 + * That is, the runs will overlap. The reason for this is so that a
    1.65 + * transliterator can consider common characters both before and after
    1.66 + * the scripts.
    1.67 + */
    1.68 +class ScriptRunIterator : public UMemory {
    1.69 +private:
    1.70 +    const Replaceable& text;
    1.71 +    int32_t textStart;
    1.72 +    int32_t textLimit;
    1.73 +
    1.74 +public:
    1.75 +    /**
    1.76 +     * The code of the current run, valid after next() returns.  May
    1.77 +     * be USCRIPT_INVALID_CODE if and only if the entire text is
    1.78 +     * COMMON/INHERITED.
    1.79 +     */
    1.80 +    UScriptCode scriptCode;
    1.81 +
    1.82 +    /**
    1.83 +     * The start of the run, inclusive, valid after next() returns.
    1.84 +     */
    1.85 +    int32_t start;
    1.86 +
    1.87 +    /**
    1.88 +     * The end of the run, exclusive, valid after next() returns.
    1.89 +     */
    1.90 +    int32_t limit;
    1.91 +    
    1.92 +    /**
    1.93 +     * Constructs a run iterator over the given text from start
    1.94 +     * (inclusive) to limit (exclusive).
    1.95 +     */
    1.96 +    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
    1.97 +
    1.98 +    /**
    1.99 +     * Returns TRUE if there are any more runs.  TRUE is always
   1.100 +     * returned at least once.  Upon return, the caller should
   1.101 +     * examine scriptCode, start, and limit.
   1.102 +     */
   1.103 +    UBool next();
   1.104 +
   1.105 +    /**
   1.106 +     * Adjusts internal indices for a change in the limit index of the
   1.107 +     * given delta.  A positive delta means the limit has increased.
   1.108 +     */
   1.109 +    void adjustLimit(int32_t delta);
   1.110 +
   1.111 +private:
   1.112 +    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
   1.113 +    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
   1.114 +};
   1.115 +
   1.116 +ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
   1.117 +                                     int32_t myStart, int32_t myLimit) :
   1.118 +    text(theText)
   1.119 +{
   1.120 +    textStart = myStart;
   1.121 +    textLimit = myLimit;
   1.122 +    limit = myStart;
   1.123 +}
   1.124 +
   1.125 +UBool ScriptRunIterator::next() {
   1.126 +    UChar32 ch;
   1.127 +    UScriptCode s;
   1.128 +    UErrorCode ec = U_ZERO_ERROR;
   1.129 +
   1.130 +    scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
   1.131 +    start = limit;
   1.132 +
   1.133 +    // Are we done?
   1.134 +    if (start == textLimit) {
   1.135 +        return FALSE;
   1.136 +    }
   1.137 +
   1.138 +    // Move start back to include adjacent COMMON or INHERITED
   1.139 +    // characters
   1.140 +    while (start > textStart) {
   1.141 +        ch = text.char32At(start - 1); // look back
   1.142 +        s = uscript_getScript(ch, &ec);
   1.143 +        if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
   1.144 +            --start;
   1.145 +        } else {
   1.146 +            break;
   1.147 +        }
   1.148 +    }
   1.149 +
   1.150 +    // Move limit ahead to include COMMON, INHERITED, and characters
   1.151 +    // of the current script.
   1.152 +    while (limit < textLimit) {
   1.153 +        ch = text.char32At(limit); // look ahead
   1.154 +        s = uscript_getScript(ch, &ec);
   1.155 +        if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
   1.156 +            if (scriptCode == USCRIPT_INVALID_CODE) {
   1.157 +                scriptCode = s;
   1.158 +            } else if (s != scriptCode) {
   1.159 +                break;
   1.160 +            }
   1.161 +        }
   1.162 +        ++limit;
   1.163 +    }
   1.164 +
   1.165 +    // Return TRUE even if the entire text is COMMON / INHERITED, in
   1.166 +    // which case scriptCode will be USCRIPT_INVALID_CODE.
   1.167 +    return TRUE;
   1.168 +}
   1.169 +
   1.170 +void ScriptRunIterator::adjustLimit(int32_t delta) {
   1.171 +    limit += delta;
   1.172 +    textLimit += delta;
   1.173 +}
   1.174 +
   1.175 +//------------------------------------------------------------
   1.176 +// AnyTransliterator
   1.177 +
   1.178 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
   1.179 +
   1.180 +AnyTransliterator::AnyTransliterator(const UnicodeString& id,
   1.181 +                                     const UnicodeString& theTarget,
   1.182 +                                     const UnicodeString& theVariant,
   1.183 +                                     UScriptCode theTargetScript,
   1.184 +                                     UErrorCode& ec) :
   1.185 +    Transliterator(id, NULL),
   1.186 +    targetScript(theTargetScript) 
   1.187 +{
   1.188 +    cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
   1.189 +    if (U_FAILURE(ec)) {
   1.190 +        return;
   1.191 +    }
   1.192 +    uhash_setValueDeleter(cache, _deleteTransliterator);
   1.193 +
   1.194 +    target = theTarget;
   1.195 +    if (theVariant.length() > 0) {
   1.196 +        target.append(VARIANT_SEP).append(theVariant);
   1.197 +    }
   1.198 +}
   1.199 +
   1.200 +AnyTransliterator::~AnyTransliterator() {
   1.201 +    uhash_close(cache);
   1.202 +}
   1.203 +
   1.204 +/**
   1.205 + * Copy constructor.
   1.206 + */
   1.207 +AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
   1.208 +    Transliterator(o),
   1.209 +    target(o.target),
   1.210 +    targetScript(o.targetScript)
   1.211 +{
   1.212 +    // Don't copy the cache contents
   1.213 +    UErrorCode ec = U_ZERO_ERROR;
   1.214 +    cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
   1.215 +    if (U_FAILURE(ec)) {
   1.216 +        return;
   1.217 +    }
   1.218 +    uhash_setValueDeleter(cache, _deleteTransliterator);
   1.219 +}
   1.220 +
   1.221 +/**
   1.222 + * Transliterator API.
   1.223 + */
   1.224 +Transliterator* AnyTransliterator::clone() const {
   1.225 +    return new AnyTransliterator(*this);
   1.226 +}
   1.227 +
   1.228 +/**
   1.229 + * Implements {@link Transliterator#handleTransliterate}.
   1.230 + */
   1.231 +void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
   1.232 +                                            UBool isIncremental) const {
   1.233 +    int32_t allStart = pos.start;
   1.234 +    int32_t allLimit = pos.limit;
   1.235 +
   1.236 +    ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
   1.237 +
   1.238 +    while (it.next()) {
   1.239 +        // Ignore runs in the ante context
   1.240 +        if (it.limit <= allStart) continue;
   1.241 +
   1.242 +        // Try to instantiate transliterator from it.scriptCode to
   1.243 +        // our target or target/variant
   1.244 +        Transliterator* t = getTransliterator(it.scriptCode);
   1.245 +       
   1.246 +        if (t == NULL) {
   1.247 +            // We have no transliterator.  Do nothing, but keep
   1.248 +            // pos.start up to date.
   1.249 +            pos.start = it.limit;
   1.250 +            continue;
   1.251 +        }
   1.252 +
   1.253 +        // If the run end is before the transliteration limit, do
   1.254 +        // a non-incremental transliteration.  Otherwise do an
   1.255 +        // incremental one.
   1.256 +        UBool incremental = isIncremental && (it.limit >= allLimit);
   1.257 +        
   1.258 +        pos.start = uprv_max(allStart, it.start);
   1.259 +        pos.limit = uprv_min(allLimit, it.limit);
   1.260 +        int32_t limit = pos.limit;
   1.261 +        t->filteredTransliterate(text, pos, incremental);
   1.262 +        int32_t delta = pos.limit - limit;
   1.263 +        allLimit += delta;
   1.264 +        it.adjustLimit(delta);
   1.265 +
   1.266 +        // We're done if we enter the post context
   1.267 +        if (it.limit >= allLimit) break;
   1.268 +    }
   1.269 +
   1.270 +    // Restore limit.  pos.start is fine where the last transliterator
   1.271 +    // left it, or at the end of the last run.
   1.272 +    pos.limit = allLimit;
   1.273 +}
   1.274 +
   1.275 +Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
   1.276 +
   1.277 +    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
   1.278 +        return NULL;
   1.279 +    }
   1.280 +
   1.281 +    Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
   1.282 +    if (t == NULL) {
   1.283 +        UErrorCode ec = U_ZERO_ERROR;
   1.284 +        UnicodeString sourceName(uscript_getName(source), -1, US_INV);
   1.285 +        UnicodeString id(sourceName);
   1.286 +        id.append(TARGET_SEP).append(target);
   1.287 +        
   1.288 +        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
   1.289 +        if (U_FAILURE(ec) || t == NULL) {
   1.290 +            delete t;
   1.291 +            
   1.292 +            // Try to pivot around Latin, our most common script
   1.293 +            id = sourceName;
   1.294 +            id.append(LATIN_PIVOT, -1).append(target);
   1.295 +            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
   1.296 +            if (U_FAILURE(ec) || t == NULL) {
   1.297 +                delete t;
   1.298 +                t = NULL;
   1.299 +            }
   1.300 +        }
   1.301 +
   1.302 +        if (t != NULL) {
   1.303 +            uhash_iput(cache, (int32_t) source, t, &ec);
   1.304 +        }
   1.305 +    }
   1.306 +
   1.307 +    return t;
   1.308 +}
   1.309 +
   1.310 +/**
   1.311 + * Return the script code for a given name, or -1 if not found.
   1.312 + */
   1.313 +static UScriptCode scriptNameToCode(const UnicodeString& name) {
   1.314 +    char buf[128];
   1.315 +    UScriptCode code;
   1.316 +    UErrorCode ec = U_ZERO_ERROR;
   1.317 +    int32_t nameLen = name.length();
   1.318 +    UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
   1.319 +    
   1.320 +    if (isInvariant) {
   1.321 +        name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
   1.322 +        buf[127] = 0;   // Make sure that we NULL terminate the string.
   1.323 +    }
   1.324 +    if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
   1.325 +    {
   1.326 +        code = USCRIPT_INVALID_CODE;
   1.327 +    }
   1.328 +    return code;
   1.329 +}
   1.330 +
   1.331 +/**
   1.332 + * Registers standard transliterators with the system.  Called by
   1.333 + * Transliterator during initialization.  Scan all current targets and
   1.334 + * register those that are scripts T as Any-T/V.
   1.335 + */
   1.336 +void AnyTransliterator::registerIDs() {
   1.337 +
   1.338 +    UErrorCode ec = U_ZERO_ERROR;
   1.339 +    Hashtable seen(TRUE, ec);
   1.340 +
   1.341 +    int32_t sourceCount = Transliterator::_countAvailableSources();
   1.342 +    for (int32_t s=0; s<sourceCount; ++s) {
   1.343 +        UnicodeString source;
   1.344 +        Transliterator::_getAvailableSource(s, source);
   1.345 +
   1.346 +        // Ignore the "Any" source
   1.347 +        if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
   1.348 +
   1.349 +        int32_t targetCount = Transliterator::_countAvailableTargets(source);
   1.350 +        for (int32_t t=0; t<targetCount; ++t) {
   1.351 +            UnicodeString target;
   1.352 +            Transliterator::_getAvailableTarget(t, source, target);
   1.353 +
   1.354 +            // Only process each target once
   1.355 +            if (seen.geti(target) != 0) continue;
   1.356 +            ec = U_ZERO_ERROR;
   1.357 +            seen.puti(target, 1, ec);
   1.358 +            
   1.359 +            // Get the script code for the target.  If not a script, ignore.
   1.360 +            UScriptCode targetScript = scriptNameToCode(target);
   1.361 +            if (targetScript == USCRIPT_INVALID_CODE) continue;
   1.362 +
   1.363 +            int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
   1.364 +            // assert(variantCount >= 1);
   1.365 +            for (int32_t v=0; v<variantCount; ++v) {
   1.366 +                UnicodeString variant;
   1.367 +                Transliterator::_getAvailableVariant(v, source, target, variant);
   1.368 +                
   1.369 +                UnicodeString id;
   1.370 +                TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
   1.371 +                ec = U_ZERO_ERROR;
   1.372 +                AnyTransliterator* t = new AnyTransliterator(id, target, variant,
   1.373 +                                                             targetScript, ec);
   1.374 +                if (U_FAILURE(ec)) {
   1.375 +                    delete t;
   1.376 +                } else {
   1.377 +                    Transliterator::_registerInstance(t);
   1.378 +                    Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
   1.379 +                }
   1.380 +            }
   1.381 +        }
   1.382 +    }
   1.383 +}
   1.384 +
   1.385 +U_NAMESPACE_END
   1.386 +
   1.387 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.388 +
   1.389 +//eof

mercurial