The Tor Browser: intl/icu/source/common/brkeng.cpp@fc2d59ddac77

     1 /*

     2  ************************************************************************************

     3  * Copyright (C) 2006-2013, International Business Machines Corporation

     4  * and others. All Rights Reserved.

     5  ************************************************************************************

     6  */

     8 #include "unicode/utypes.h"

    10 #if !UCONFIG_NO_BREAK_ITERATION

    12 #include "brkeng.h"

    13 #include "dictbe.h"

    14 #include "unicode/uchar.h"

    15 #include "unicode/uniset.h"

    16 #include "unicode/chariter.h"

    17 #include "unicode/ures.h"

    18 #include "unicode/udata.h"

    19 #include "unicode/putil.h"

    20 #include "unicode/ustring.h"

    21 #include "unicode/uscript.h"

    22 #include "unicode/ucharstrie.h"

    23 #include "unicode/bytestrie.h"

    24 #include "charstr.h"

    25 #include "dictionarydata.h"

    26 #include "uvector.h"

    27 #include "umutex.h"

    28 #include "uresimp.h"

    29 #include "ubrkimpl.h"

    31 U_NAMESPACE_BEGIN

    33 /*

    34  ******************************************************************

    35  */

    37 LanguageBreakEngine::LanguageBreakEngine() {

    38 }

    40 LanguageBreakEngine::~LanguageBreakEngine() {

    41 }

    43 /*

    44  ******************************************************************

    45  */

    47 LanguageBreakFactory::LanguageBreakFactory() {

    48 }

    50 LanguageBreakFactory::~LanguageBreakFactory() {

    51 }

    53 /*

    54  ******************************************************************

    55  */

    57 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {

    58     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {

    59         fHandled[i] = 0;

    60     }

    61 }

    63 UnhandledEngine::~UnhandledEngine() {

    64     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {

    65         if (fHandled[i] != 0) {

    66             delete fHandled[i];

    67         }

    68     }

    69 }

    71 UBool

    72 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {

    73     return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))

    74         && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));

    75 }

    77 int32_t

    78 UnhandledEngine::findBreaks( UText *text,

    79                                  int32_t startPos,

    80                                  int32_t endPos,

    81                                  UBool reverse,

    82                                  int32_t breakType,

    83                                  UStack &/*foundBreaks*/ ) const {

    84     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {

    85         UChar32 c = utext_current32(text);

    86         if (reverse) {

    87             while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {

    88                 c = utext_previous32(text);

    89             }

    90         }

    91         else {

    92             while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {

    93                 utext_next32(text);            // TODO:  recast loop to work with post-increment operations.

    94                 c = utext_current32(text);

    95             }

    96         }

    97     }

    98     return 0;

    99 }

   101 void

   102 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {

   103     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {

   104         if (fHandled[breakType] == 0) {

   105             fHandled[breakType] = new UnicodeSet();

   106             if (fHandled[breakType] == 0) {

   107                 return;

   108             }

   109         }

   110         if (!fHandled[breakType]->contains(c)) {

   111             UErrorCode status = U_ZERO_ERROR;

   112             // Apply the entire script of the character.

   113             int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);

   114             fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);

   115         }

   116     }

   117 }

   119 /*

   120  ******************************************************************

   121  */

   123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {

   124     fEngines = 0;

   125 }

   127 ICULanguageBreakFactory::~ICULanguageBreakFactory() {

   128     if (fEngines != 0) {

   129         delete fEngines;

   130     }

   131 }

   133 U_NAMESPACE_END

   134 U_CDECL_BEGIN

   135 static void U_CALLCONV _deleteEngine(void *obj) {

   136     delete (const icu::LanguageBreakEngine *) obj;

   137 }

   138 U_CDECL_END

   139 U_NAMESPACE_BEGIN

   141 const LanguageBreakEngine *

   142 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {

   143     UBool       needsInit;

   144     int32_t     i;

   145     const LanguageBreakEngine *lbe = NULL;

   146     UErrorCode  status = U_ZERO_ERROR;

   148     // TODO: The global mutex should not be used.

   149     // The global mutex should only be used for short periods.

   150     // A ICULanguageBreakFactory specific mutex should be used.

   151     umtx_lock(NULL);

   152     needsInit = (UBool)(fEngines == NULL);

   153     if (!needsInit) {

   154         i = fEngines->size();

   155         while (--i >= 0) {

   156             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));

   157             if (lbe != NULL && lbe->handles(c, breakType)) {

   158                 break;

   159             }

   160             lbe = NULL;

   161         }

   162     }

   163     umtx_unlock(NULL);

   165     if (lbe != NULL) {

   166         return lbe;

   167     }

   169     if (needsInit) {

   170         UStack  *engines = new UStack(_deleteEngine, NULL, status);

   171         if (U_SUCCESS(status) && engines == NULL) {

   172             status = U_MEMORY_ALLOCATION_ERROR;

   173         }

   174         else if (U_FAILURE(status)) {

   175             delete engines;

   176             engines = NULL;

   177         }

   178         else {

   179             umtx_lock(NULL);

   180             if (fEngines == NULL) {

   181                 fEngines = engines;

   182                 engines = NULL;

   183             }

   184             umtx_unlock(NULL);

   185             delete engines;

   186         }

   187     }

   189     if (fEngines == NULL) {

   190         return NULL;

   191     }

   193     // We didn't find an engine the first time through, or there was no

   194     // stack. Create an engine.

   195     const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);

   197     // Now get the lock, and see if someone else has created it in the

   198     // meantime

   199     umtx_lock(NULL);

   200     i = fEngines->size();

   201     while (--i >= 0) {

   202         lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));

   203         if (lbe != NULL && lbe->handles(c, breakType)) {

   204             break;

   205         }

   206         lbe = NULL;

   207     }

   208     if (lbe == NULL && newlbe != NULL) {

   209         fEngines->push((void *)newlbe, status);

   210         lbe = newlbe;

   211         newlbe = NULL;

   212     }

   213     umtx_unlock(NULL);

   215     delete newlbe;

   217     return lbe;

   218 }

   220 const LanguageBreakEngine *

   221 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {

   222     UErrorCode status = U_ZERO_ERROR;

   223     UScriptCode code = uscript_getScript(c, &status);

   224     if (U_SUCCESS(status)) {

   225         DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);

   226         if (m != NULL) {

   227             const LanguageBreakEngine *engine = NULL;

   228             switch(code) {

   229             case USCRIPT_THAI:

   230                 engine = new ThaiBreakEngine(m, status);

   231                 break;

   232             case USCRIPT_LAO:

   233                 engine = new LaoBreakEngine(m, status);

   234                 break;

   235             case USCRIPT_KHMER:

   236                 engine = new KhmerBreakEngine(m, status);

   237                 break;

   239 #if !UCONFIG_NO_NORMALIZATION

   240                 // CJK not available w/o normalization

   241             case USCRIPT_HANGUL:

   242                 engine = new CjkBreakEngine(m, kKorean, status);

   243                 break;

   245             // use same BreakEngine and dictionary for both Chinese and Japanese

   246             case USCRIPT_HIRAGANA:

   247             case USCRIPT_KATAKANA:

   248             case USCRIPT_HAN:

   249                 engine = new CjkBreakEngine(m, kChineseJapanese, status);

   250                 break;

   251 #if 0

   252             // TODO: Have to get some characters with script=common handled

   253             // by CjkBreakEngine (e.g. U+309B). Simply subjecting

   254             // them to CjkBreakEngine does not work. The engine has to

   255             // special-case them.

   256             case USCRIPT_COMMON:

   257             {

   258                 UBlockCode block = ublock_getCode(code);

   259                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)

   260                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);

   261                 break;

   262             }

   263 #endif

   264 #endif

   266             default:

   267                 break;

   268             }

   269             if (engine == NULL) {

   270                 delete m;

   271             }

   272             else if (U_FAILURE(status)) {

   273                 delete engine;

   274                 engine = NULL;

   275             }

   276             return engine;

   277         }

   278     }

   279     return NULL;

   280 }

   282 DictionaryMatcher *

   283 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {

   284     UErrorCode status = U_ZERO_ERROR;

   285     // open root from brkitr tree.

   286     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);

   287     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);

   288     int32_t dictnlength = 0;

   289     const UChar *dictfname =

   290         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);

   291     if (U_FAILURE(status)) {

   292         ures_close(b);

   293         return NULL;

   294     }

   295     CharString dictnbuf;

   296     CharString ext;

   297     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot

   298     if (extStart != NULL) {

   299         int32_t len = (int32_t)(extStart - dictfname);

   300         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);

   301         dictnlength = len;

   302     }

   303     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);

   304     ures_close(b);

   306     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);

   307     if (U_SUCCESS(status)) {

   308         // build trie

   309         const uint8_t *data = (const uint8_t *)udata_getMemory(file);

   310         const int32_t *indexes = (const int32_t *)data;

   311         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];

   312         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

   313         DictionaryMatcher *m = NULL;

   314         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {

   315             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];

   316             const char *characters = (const char *)(data + offset);

   317             m = new BytesDictionaryMatcher(characters, transform, file);

   318         }

   319         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {

   320             const UChar *characters = (const UChar *)(data + offset);

   321             m = new UCharsDictionaryMatcher(characters, file);

   322         }

   323         if (m == NULL) {

   324             // no matcher exists to take ownership - either we are an invalid

   325             // type or memory allocation failed

   326             udata_close(file);

   327         }

   328         return m;

   329     } else if (dictfname != NULL) {

   330         // we don't have a dictionary matcher.

   331         // returning NULL here will cause us to fail to find a dictionary break engine, as expected

   332         status = U_ZERO_ERROR;

   333         return NULL;

   334     }

   335     return NULL;

   336 }

   338 U_NAMESPACE_END

   340 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

The Tor Browser / file revision

intl/icu/source/common/brkeng.cpp@fc2d59ddac77

intl/icu/source/common/brkeng.cpp