intl/icu/source/i18n/anytrans.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 *****************************************************************
     3 * Copyright (c) 2002-2011, International Business Machines Corporation
     4 * and others.  All Rights Reserved.
     5 *****************************************************************
     6 * Date        Name        Description
     7 * 06/06/2002  aliu        Creation.
     8 *****************************************************************
     9 */
    11 #include "unicode/utypes.h"
    13 #if !UCONFIG_NO_TRANSLITERATION
    15 #include "unicode/uobject.h"
    16 #include "unicode/uscript.h"
    17 #include "nultrans.h"
    18 #include "anytrans.h"
    19 #include "uvector.h"
    20 #include "tridpars.h"
    21 #include "hash.h"
    22 #include "putilimp.h"
    23 #include "uinvchar.h"
    25 //------------------------------------------------------------
    26 // Constants
    28 static const UChar TARGET_SEP = 45; // '-'
    29 static const UChar VARIANT_SEP = 47; // '/'
    30 static const UChar ANY[] = {65,110,121,0}; // "Any"
    31 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
    32 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
    34 //------------------------------------------------------------
    36 U_CDECL_BEGIN
    37 /**
    38  * Deleter function for Transliterator*.
    39  */
    40 static void U_CALLCONV
    41 _deleteTransliterator(void *obj) {
    42     delete (icu::Transliterator*) obj;    
    43 }
    44 U_CDECL_END
    46 //------------------------------------------------------------
    48 U_NAMESPACE_BEGIN
    50 //------------------------------------------------------------
    51 // ScriptRunIterator
    53 /**
    54  * Returns a series of ranges corresponding to scripts. They will be
    55  * of the form:
    56  *
    57  * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
    58  * |            |          - first run (start, limit)
    59  *          |           |  - second run (start, limit)
    60  *
    61  * That is, the runs will overlap. The reason for this is so that a
    62  * transliterator can consider common characters both before and after
    63  * the scripts.
    64  */
    65 class ScriptRunIterator : public UMemory {
    66 private:
    67     const Replaceable& text;
    68     int32_t textStart;
    69     int32_t textLimit;
    71 public:
    72     /**
    73      * The code of the current run, valid after next() returns.  May
    74      * be USCRIPT_INVALID_CODE if and only if the entire text is
    75      * COMMON/INHERITED.
    76      */
    77     UScriptCode scriptCode;
    79     /**
    80      * The start of the run, inclusive, valid after next() returns.
    81      */
    82     int32_t start;
    84     /**
    85      * The end of the run, exclusive, valid after next() returns.
    86      */
    87     int32_t limit;
    89     /**
    90      * Constructs a run iterator over the given text from start
    91      * (inclusive) to limit (exclusive).
    92      */
    93     ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
    95     /**
    96      * Returns TRUE if there are any more runs.  TRUE is always
    97      * returned at least once.  Upon return, the caller should
    98      * examine scriptCode, start, and limit.
    99      */
   100     UBool next();
   102     /**
   103      * Adjusts internal indices for a change in the limit index of the
   104      * given delta.  A positive delta means the limit has increased.
   105      */
   106     void adjustLimit(int32_t delta);
   108 private:
   109     ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
   110     ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
   111 };
   113 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
   114                                      int32_t myStart, int32_t myLimit) :
   115     text(theText)
   116 {
   117     textStart = myStart;
   118     textLimit = myLimit;
   119     limit = myStart;
   120 }
   122 UBool ScriptRunIterator::next() {
   123     UChar32 ch;
   124     UScriptCode s;
   125     UErrorCode ec = U_ZERO_ERROR;
   127     scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
   128     start = limit;
   130     // Are we done?
   131     if (start == textLimit) {
   132         return FALSE;
   133     }
   135     // Move start back to include adjacent COMMON or INHERITED
   136     // characters
   137     while (start > textStart) {
   138         ch = text.char32At(start - 1); // look back
   139         s = uscript_getScript(ch, &ec);
   140         if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
   141             --start;
   142         } else {
   143             break;
   144         }
   145     }
   147     // Move limit ahead to include COMMON, INHERITED, and characters
   148     // of the current script.
   149     while (limit < textLimit) {
   150         ch = text.char32At(limit); // look ahead
   151         s = uscript_getScript(ch, &ec);
   152         if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
   153             if (scriptCode == USCRIPT_INVALID_CODE) {
   154                 scriptCode = s;
   155             } else if (s != scriptCode) {
   156                 break;
   157             }
   158         }
   159         ++limit;
   160     }
   162     // Return TRUE even if the entire text is COMMON / INHERITED, in
   163     // which case scriptCode will be USCRIPT_INVALID_CODE.
   164     return TRUE;
   165 }
   167 void ScriptRunIterator::adjustLimit(int32_t delta) {
   168     limit += delta;
   169     textLimit += delta;
   170 }
   172 //------------------------------------------------------------
   173 // AnyTransliterator
   175 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
   177 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
   178                                      const UnicodeString& theTarget,
   179                                      const UnicodeString& theVariant,
   180                                      UScriptCode theTargetScript,
   181                                      UErrorCode& ec) :
   182     Transliterator(id, NULL),
   183     targetScript(theTargetScript) 
   184 {
   185     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
   186     if (U_FAILURE(ec)) {
   187         return;
   188     }
   189     uhash_setValueDeleter(cache, _deleteTransliterator);
   191     target = theTarget;
   192     if (theVariant.length() > 0) {
   193         target.append(VARIANT_SEP).append(theVariant);
   194     }
   195 }
   197 AnyTransliterator::~AnyTransliterator() {
   198     uhash_close(cache);
   199 }
   201 /**
   202  * Copy constructor.
   203  */
   204 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
   205     Transliterator(o),
   206     target(o.target),
   207     targetScript(o.targetScript)
   208 {
   209     // Don't copy the cache contents
   210     UErrorCode ec = U_ZERO_ERROR;
   211     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
   212     if (U_FAILURE(ec)) {
   213         return;
   214     }
   215     uhash_setValueDeleter(cache, _deleteTransliterator);
   216 }
   218 /**
   219  * Transliterator API.
   220  */
   221 Transliterator* AnyTransliterator::clone() const {
   222     return new AnyTransliterator(*this);
   223 }
   225 /**
   226  * Implements {@link Transliterator#handleTransliterate}.
   227  */
   228 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
   229                                             UBool isIncremental) const {
   230     int32_t allStart = pos.start;
   231     int32_t allLimit = pos.limit;
   233     ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
   235     while (it.next()) {
   236         // Ignore runs in the ante context
   237         if (it.limit <= allStart) continue;
   239         // Try to instantiate transliterator from it.scriptCode to
   240         // our target or target/variant
   241         Transliterator* t = getTransliterator(it.scriptCode);
   243         if (t == NULL) {
   244             // We have no transliterator.  Do nothing, but keep
   245             // pos.start up to date.
   246             pos.start = it.limit;
   247             continue;
   248         }
   250         // If the run end is before the transliteration limit, do
   251         // a non-incremental transliteration.  Otherwise do an
   252         // incremental one.
   253         UBool incremental = isIncremental && (it.limit >= allLimit);
   255         pos.start = uprv_max(allStart, it.start);
   256         pos.limit = uprv_min(allLimit, it.limit);
   257         int32_t limit = pos.limit;
   258         t->filteredTransliterate(text, pos, incremental);
   259         int32_t delta = pos.limit - limit;
   260         allLimit += delta;
   261         it.adjustLimit(delta);
   263         // We're done if we enter the post context
   264         if (it.limit >= allLimit) break;
   265     }
   267     // Restore limit.  pos.start is fine where the last transliterator
   268     // left it, or at the end of the last run.
   269     pos.limit = allLimit;
   270 }
   272 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
   274     if (source == targetScript || source == USCRIPT_INVALID_CODE) {
   275         return NULL;
   276     }
   278     Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
   279     if (t == NULL) {
   280         UErrorCode ec = U_ZERO_ERROR;
   281         UnicodeString sourceName(uscript_getName(source), -1, US_INV);
   282         UnicodeString id(sourceName);
   283         id.append(TARGET_SEP).append(target);
   285         t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
   286         if (U_FAILURE(ec) || t == NULL) {
   287             delete t;
   289             // Try to pivot around Latin, our most common script
   290             id = sourceName;
   291             id.append(LATIN_PIVOT, -1).append(target);
   292             t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
   293             if (U_FAILURE(ec) || t == NULL) {
   294                 delete t;
   295                 t = NULL;
   296             }
   297         }
   299         if (t != NULL) {
   300             uhash_iput(cache, (int32_t) source, t, &ec);
   301         }
   302     }
   304     return t;
   305 }
   307 /**
   308  * Return the script code for a given name, or -1 if not found.
   309  */
   310 static UScriptCode scriptNameToCode(const UnicodeString& name) {
   311     char buf[128];
   312     UScriptCode code;
   313     UErrorCode ec = U_ZERO_ERROR;
   314     int32_t nameLen = name.length();
   315     UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
   317     if (isInvariant) {
   318         name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
   319         buf[127] = 0;   // Make sure that we NULL terminate the string.
   320     }
   321     if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
   322     {
   323         code = USCRIPT_INVALID_CODE;
   324     }
   325     return code;
   326 }
   328 /**
   329  * Registers standard transliterators with the system.  Called by
   330  * Transliterator during initialization.  Scan all current targets and
   331  * register those that are scripts T as Any-T/V.
   332  */
   333 void AnyTransliterator::registerIDs() {
   335     UErrorCode ec = U_ZERO_ERROR;
   336     Hashtable seen(TRUE, ec);
   338     int32_t sourceCount = Transliterator::_countAvailableSources();
   339     for (int32_t s=0; s<sourceCount; ++s) {
   340         UnicodeString source;
   341         Transliterator::_getAvailableSource(s, source);
   343         // Ignore the "Any" source
   344         if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
   346         int32_t targetCount = Transliterator::_countAvailableTargets(source);
   347         for (int32_t t=0; t<targetCount; ++t) {
   348             UnicodeString target;
   349             Transliterator::_getAvailableTarget(t, source, target);
   351             // Only process each target once
   352             if (seen.geti(target) != 0) continue;
   353             ec = U_ZERO_ERROR;
   354             seen.puti(target, 1, ec);
   356             // Get the script code for the target.  If not a script, ignore.
   357             UScriptCode targetScript = scriptNameToCode(target);
   358             if (targetScript == USCRIPT_INVALID_CODE) continue;
   360             int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
   361             // assert(variantCount >= 1);
   362             for (int32_t v=0; v<variantCount; ++v) {
   363                 UnicodeString variant;
   364                 Transliterator::_getAvailableVariant(v, source, target, variant);
   366                 UnicodeString id;
   367                 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
   368                 ec = U_ZERO_ERROR;
   369                 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
   370                                                              targetScript, ec);
   371                 if (U_FAILURE(ec)) {
   372                     delete t;
   373                 } else {
   374                     Transliterator::_registerInstance(t);
   375                     Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
   376                 }
   377             }
   378         }
   379     }
   380 }
   382 U_NAMESPACE_END
   384 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   386 //eof

mercurial