intl/icu/source/common/brkeng.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2  ************************************************************************************
     3  * Copyright (C) 2006-2013, International Business Machines Corporation
     4  * and others. All Rights Reserved.
     5  ************************************************************************************
     6  */
     8 #include "unicode/utypes.h"
    10 #if !UCONFIG_NO_BREAK_ITERATION
    12 #include "brkeng.h"
    13 #include "dictbe.h"
    14 #include "unicode/uchar.h"
    15 #include "unicode/uniset.h"
    16 #include "unicode/chariter.h"
    17 #include "unicode/ures.h"
    18 #include "unicode/udata.h"
    19 #include "unicode/putil.h"
    20 #include "unicode/ustring.h"
    21 #include "unicode/uscript.h"
    22 #include "unicode/ucharstrie.h"
    23 #include "unicode/bytestrie.h"
    24 #include "charstr.h"
    25 #include "dictionarydata.h"
    26 #include "uvector.h"
    27 #include "umutex.h"
    28 #include "uresimp.h"
    29 #include "ubrkimpl.h"
    31 U_NAMESPACE_BEGIN
    33 /*
    34  ******************************************************************
    35  */
    37 LanguageBreakEngine::LanguageBreakEngine() {
    38 }
    40 LanguageBreakEngine::~LanguageBreakEngine() {
    41 }
    43 /*
    44  ******************************************************************
    45  */
    47 LanguageBreakFactory::LanguageBreakFactory() {
    48 }
    50 LanguageBreakFactory::~LanguageBreakFactory() {
    51 }
    53 /*
    54  ******************************************************************
    55  */
    57 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
    58     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
    59         fHandled[i] = 0;
    60     }
    61 }
    63 UnhandledEngine::~UnhandledEngine() {
    64     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
    65         if (fHandled[i] != 0) {
    66             delete fHandled[i];
    67         }
    68     }
    69 }
    71 UBool
    72 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
    73     return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
    74         && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
    75 }
    77 int32_t
    78 UnhandledEngine::findBreaks( UText *text,
    79                                  int32_t startPos,
    80                                  int32_t endPos,
    81                                  UBool reverse,
    82                                  int32_t breakType,
    83                                  UStack &/*foundBreaks*/ ) const {
    84     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
    85         UChar32 c = utext_current32(text); 
    86         if (reverse) {
    87             while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
    88                 c = utext_previous32(text);
    89             }
    90         }
    91         else {
    92             while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
    93                 utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
    94                 c = utext_current32(text);
    95             }
    96         }
    97     }
    98     return 0;
    99 }
   101 void
   102 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
   103     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
   104         if (fHandled[breakType] == 0) {
   105             fHandled[breakType] = new UnicodeSet();
   106             if (fHandled[breakType] == 0) {
   107                 return;
   108             }
   109         }
   110         if (!fHandled[breakType]->contains(c)) {
   111             UErrorCode status = U_ZERO_ERROR;
   112             // Apply the entire script of the character.
   113             int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
   114             fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
   115         }
   116     }
   117 }
   119 /*
   120  ******************************************************************
   121  */
   123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
   124     fEngines = 0;
   125 }
   127 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
   128     if (fEngines != 0) {
   129         delete fEngines;
   130     }
   131 }
   133 U_NAMESPACE_END
   134 U_CDECL_BEGIN
   135 static void U_CALLCONV _deleteEngine(void *obj) {
   136     delete (const icu::LanguageBreakEngine *) obj;
   137 }
   138 U_CDECL_END
   139 U_NAMESPACE_BEGIN
   141 const LanguageBreakEngine *
   142 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
   143     UBool       needsInit;
   144     int32_t     i;
   145     const LanguageBreakEngine *lbe = NULL;
   146     UErrorCode  status = U_ZERO_ERROR;
   148     // TODO: The global mutex should not be used.
   149     // The global mutex should only be used for short periods.
   150     // A ICULanguageBreakFactory specific mutex should be used.
   151     umtx_lock(NULL);
   152     needsInit = (UBool)(fEngines == NULL);
   153     if (!needsInit) {
   154         i = fEngines->size();
   155         while (--i >= 0) {
   156             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
   157             if (lbe != NULL && lbe->handles(c, breakType)) {
   158                 break;
   159             }
   160             lbe = NULL;
   161         }
   162     }
   163     umtx_unlock(NULL);
   165     if (lbe != NULL) {
   166         return lbe;
   167     }
   169     if (needsInit) {
   170         UStack  *engines = new UStack(_deleteEngine, NULL, status);
   171         if (U_SUCCESS(status) && engines == NULL) {
   172             status = U_MEMORY_ALLOCATION_ERROR;
   173         }
   174         else if (U_FAILURE(status)) {
   175             delete engines;
   176             engines = NULL;
   177         }
   178         else {
   179             umtx_lock(NULL);
   180             if (fEngines == NULL) {
   181                 fEngines = engines;
   182                 engines = NULL;
   183             }
   184             umtx_unlock(NULL);
   185             delete engines;
   186         }
   187     }
   189     if (fEngines == NULL) {
   190         return NULL;
   191     }
   193     // We didn't find an engine the first time through, or there was no
   194     // stack. Create an engine.
   195     const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
   197     // Now get the lock, and see if someone else has created it in the
   198     // meantime
   199     umtx_lock(NULL);
   200     i = fEngines->size();
   201     while (--i >= 0) {
   202         lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
   203         if (lbe != NULL && lbe->handles(c, breakType)) {
   204             break;
   205         }
   206         lbe = NULL;
   207     }
   208     if (lbe == NULL && newlbe != NULL) {
   209         fEngines->push((void *)newlbe, status);
   210         lbe = newlbe;
   211         newlbe = NULL;
   212     }
   213     umtx_unlock(NULL);
   215     delete newlbe;
   217     return lbe;
   218 }
   220 const LanguageBreakEngine *
   221 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
   222     UErrorCode status = U_ZERO_ERROR;
   223     UScriptCode code = uscript_getScript(c, &status);
   224     if (U_SUCCESS(status)) {
   225         DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
   226         if (m != NULL) {
   227             const LanguageBreakEngine *engine = NULL;
   228             switch(code) {
   229             case USCRIPT_THAI:
   230                 engine = new ThaiBreakEngine(m, status);
   231                 break;
   232             case USCRIPT_LAO:
   233                 engine = new LaoBreakEngine(m, status);
   234                 break;
   235             case USCRIPT_KHMER:
   236                 engine = new KhmerBreakEngine(m, status);
   237                 break;
   239 #if !UCONFIG_NO_NORMALIZATION
   240                 // CJK not available w/o normalization
   241             case USCRIPT_HANGUL:
   242                 engine = new CjkBreakEngine(m, kKorean, status);
   243                 break;
   245             // use same BreakEngine and dictionary for both Chinese and Japanese
   246             case USCRIPT_HIRAGANA:
   247             case USCRIPT_KATAKANA:
   248             case USCRIPT_HAN:
   249                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
   250                 break;
   251 #if 0
   252             // TODO: Have to get some characters with script=common handled
   253             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
   254             // them to CjkBreakEngine does not work. The engine has to
   255             // special-case them.
   256             case USCRIPT_COMMON:
   257             {
   258                 UBlockCode block = ublock_getCode(code);
   259                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
   260                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
   261                 break;
   262             }
   263 #endif
   264 #endif
   266             default:
   267                 break;
   268             }
   269             if (engine == NULL) {
   270                 delete m;
   271             }
   272             else if (U_FAILURE(status)) {
   273                 delete engine;
   274                 engine = NULL;
   275             }
   276             return engine;
   277         }
   278     }
   279     return NULL;
   280 }
   282 DictionaryMatcher *
   283 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 
   284     UErrorCode status = U_ZERO_ERROR;
   285     // open root from brkitr tree.
   286     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
   287     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
   288     int32_t dictnlength = 0;
   289     const UChar *dictfname =
   290         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
   291     if (U_FAILURE(status)) {
   292         ures_close(b);
   293         return NULL;
   294     }
   295     CharString dictnbuf;
   296     CharString ext;
   297     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
   298     if (extStart != NULL) {
   299         int32_t len = (int32_t)(extStart - dictfname);
   300         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
   301         dictnlength = len;
   302     }
   303     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
   304     ures_close(b);
   306     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
   307     if (U_SUCCESS(status)) {
   308         // build trie
   309         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
   310         const int32_t *indexes = (const int32_t *)data;
   311         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
   312         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
   313         DictionaryMatcher *m = NULL;
   314         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
   315             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
   316             const char *characters = (const char *)(data + offset);
   317             m = new BytesDictionaryMatcher(characters, transform, file);
   318         }
   319         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
   320             const UChar *characters = (const UChar *)(data + offset);
   321             m = new UCharsDictionaryMatcher(characters, file);
   322         }
   323         if (m == NULL) {
   324             // no matcher exists to take ownership - either we are an invalid 
   325             // type or memory allocation failed
   326             udata_close(file);
   327         }
   328         return m;
   329     } else if (dictfname != NULL) {
   330         // we don't have a dictionary matcher.
   331         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
   332         status = U_ZERO_ERROR;
   333         return NULL;
   334     }
   335     return NULL;
   336 }
   338 U_NAMESPACE_END
   340 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial