intl/icu/source/common/brkeng.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/brkeng.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,340 @@
     1.4 +/*
     1.5 + ************************************************************************************
     1.6 + * Copyright (C) 2006-2013, International Business Machines Corporation
     1.7 + * and others. All Rights Reserved.
     1.8 + ************************************************************************************
     1.9 + */
    1.10 +
    1.11 +#include "unicode/utypes.h"
    1.12 +
    1.13 +#if !UCONFIG_NO_BREAK_ITERATION
    1.14 +
    1.15 +#include "brkeng.h"
    1.16 +#include "dictbe.h"
    1.17 +#include "unicode/uchar.h"
    1.18 +#include "unicode/uniset.h"
    1.19 +#include "unicode/chariter.h"
    1.20 +#include "unicode/ures.h"
    1.21 +#include "unicode/udata.h"
    1.22 +#include "unicode/putil.h"
    1.23 +#include "unicode/ustring.h"
    1.24 +#include "unicode/uscript.h"
    1.25 +#include "unicode/ucharstrie.h"
    1.26 +#include "unicode/bytestrie.h"
    1.27 +#include "charstr.h"
    1.28 +#include "dictionarydata.h"
    1.29 +#include "uvector.h"
    1.30 +#include "umutex.h"
    1.31 +#include "uresimp.h"
    1.32 +#include "ubrkimpl.h"
    1.33 +
    1.34 +U_NAMESPACE_BEGIN
    1.35 +
    1.36 +/*
    1.37 + ******************************************************************
    1.38 + */
    1.39 +
    1.40 +LanguageBreakEngine::LanguageBreakEngine() {   // Empty body: this constructor performs no work of its own.
    1.41 +}
    1.42 +
    1.43 +LanguageBreakEngine::~LanguageBreakEngine() {  // Empty body: this destructor releases nothing itself.
    1.44 +}
    1.45 +
    1.46 +/*
    1.47 + ******************************************************************
    1.48 + */
    1.49 +
    1.50 +LanguageBreakFactory::LanguageBreakFactory() {   // Empty body: this constructor performs no work of its own.
    1.51 +}
    1.52 +
    1.53 +LanguageBreakFactory::~LanguageBreakFactory() {  // Empty body: this destructor releases nothing itself.
    1.54 +}
    1.55 +
    1.56 +/*
    1.57 + ******************************************************************
    1.58 + */
    1.59 +
    1.60 +UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {   // Starts with every fHandled slot empty; the status argument is unused.
    1.61 +    for (int32_t slot = (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) - 1; slot >= 0; --slot) {
    1.62 +        fHandled[slot] = 0;   // handleCharacter() fills a slot in the first time it is needed
    1.63 +    }
    1.64 +}
    1.65 +
    1.66 +UnhandledEngine::~UnhandledEngine() {   // Frees every set that handleCharacter() allocated.
    1.67 +    const int32_t slotCount = (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]));
    1.68 +    for (int32_t slot = 0; slot < slotCount; ++slot) {
    1.69 +        // Slots that were never filled still hold 0; delete on a null pointer does nothing.
    1.70 +        delete fHandled[slot];
    1.71 +    }
    1.72 +}
    1.73 +
    1.74 +UBool                                            // True only when a set exists for breakType and it contains c.
    1.75 +UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
    1.76 +    const UBool inRange = (UBool)(breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])));
    1.77 +    return (UBool)(inRange && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));   // && stops before indexing when out of range
    1.78 +}
    1.79 +
    1.80 +int32_t                                          // Moves the text position past characters in the breakType set; always returns 0.
    1.81 +UnhandledEngine::findBreaks( UText *text,          // text to scan; its current position is moved by this call
    1.82 +                                 int32_t startPos,  // lower limit, consulted only when scanning backwards
    1.83 +                                 int32_t endPos,    // upper limit, consulted only when scanning forwards
    1.84 +                                 UBool reverse,     // true: step with utext_previous32(); false: step with utext_next32()
    1.85 +                                 int32_t breakType, // index into fHandled
    1.86 +                                 UStack &/*foundBreaks*/ ) const {   // unused: no break positions are recorded
    1.87 +    if (breakType < 0 || breakType >= (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) || fHandled[breakType] == 0) {
    1.88 +        return 0;   // Bad index, or no set was ever created for this type (handles() is false too): nothing to skip, and no null dereference.
    1.89 +    }
    1.90 +    UChar32 c = utext_current32(text);
    1.91 +    if (reverse) {
    1.92 +        while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
    1.93 +            c = utext_previous32(text);
    1.94 +        }
    1.95 +    } else {
    1.96 +        while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
    1.97 +            utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
    1.98 +            c = utext_current32(text);
    1.99 +        }
   1.100 +    }
   1.101 +    return 0;
   1.102 +}
   1.103 +
   1.104 +void                                             // Records c as handled for breakType by applying c's whole script to that type's set.
   1.105 +UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
   1.106 +    if (breakType < 0 || breakType >= (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
   1.107 +        return;                                  // not a valid fHandled index: ignore the request
   1.108 +    }
   1.109 +    if (fHandled[breakType] == 0) {
   1.110 +        fHandled[breakType] = new UnicodeSet();  // first request for this break type: create its set
   1.111 +        if (fHandled[breakType] == 0) {
   1.112 +            return;                              // allocation failed; nothing is recorded
   1.113 +        }
   1.114 +    }
   1.115 +    if (fHandled[breakType]->contains(c)) {
   1.116 +        return;                                  // c is already in the set
   1.117 +    }
   1.118 +    UErrorCode ec = U_ZERO_ERROR;                 // never inspected afterwards. NOTE(review): confirm whether the call below adds to or replaces the set.
   1.119 +    fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, u_getIntPropertyValue(c, UCHAR_SCRIPT), ec);
   1.120 +}
   1.121 +
   1.122 +/*
   1.123 + ******************************************************************
   1.124 + */
   1.125 +
   1.126 +ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {   // the status argument is unused
   1.127 +    fEngines = NULL;   // the engine stack is only created inside getEngineFor()
   1.128 +}
   1.129 +
   1.130 +ICULanguageBreakFactory::~ICULanguageBreakFactory() {
   1.131 +    // No null test is needed: delete on a null pointer does nothing.
   1.132 +    // NOTE(review): presumably the stack's _deleteEngine deleter also frees the engines - confirm.
   1.133 +    delete fEngines;
   1.134 +}
   1.135 +
   1.136 +U_NAMESPACE_END
   1.137 +U_CDECL_BEGIN
   1.138 +static void U_CALLCONV _deleteEngine(void *obj) {   // Element deleter passed to the UStack built in getEngineFor().
   1.139 +    delete static_cast<const icu::LanguageBreakEngine *>(obj);
   1.140 +}
   1.141 +U_CDECL_END
   1.142 +U_NAMESPACE_BEGIN
   1.143 +
   1.144 +const LanguageBreakEngine *                      // Returns an engine whose handles(c, breakType) is true, loading one if needed; NULL if none.
   1.145 +ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
   1.146 +    UBool       needsInit;
   1.147 +    int32_t     i;
   1.148 +    const LanguageBreakEngine *lbe = NULL;
   1.149 +    UErrorCode  status = U_ZERO_ERROR;
   1.150 +
   1.151 +    // TODO: The global mutex should not be used here.
   1.152 +    // The global mutex should only be held for short periods.
   1.153 +    // An ICULanguageBreakFactory-specific mutex should be used instead.
   1.154 +    umtx_lock(NULL);
   1.155 +    needsInit = (UBool)(fEngines == NULL);
   1.156 +    if (!needsInit) {
   1.157 +        i = fEngines->size();
   1.158 +        while (--i >= 0) {                       // first pass: scan from the last element down to the first
   1.159 +            lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
   1.160 +            if (lbe != NULL && lbe->handles(c, breakType)) {
   1.161 +                break;
   1.162 +            }
   1.163 +            lbe = NULL;                          // keep lbe NULL unless the loop exits through the break above
   1.164 +        }
   1.165 +    }
   1.166 +    umtx_unlock(NULL);
   1.167 +    
   1.168 +    if (lbe != NULL) {
   1.169 +        return lbe;
   1.170 +    }
   1.171 +    
   1.172 +    if (needsInit) {
   1.173 +        UStack  *engines = new UStack(_deleteEngine, NULL, status);   // built outside the lock
   1.174 +        if (U_SUCCESS(status) && engines == NULL) {
   1.175 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.176 +        }
   1.177 +        else if (U_FAILURE(status)) {
   1.178 +            delete engines;
   1.179 +            engines = NULL;
   1.180 +        }
   1.181 +        else {
   1.182 +            umtx_lock(NULL);
   1.183 +            if (fEngines == NULL) {              // install our stack only if fEngines is still unset
   1.184 +                fEngines = engines;
   1.185 +                engines = NULL;
   1.186 +            }
   1.187 +            umtx_unlock(NULL);
   1.188 +            delete engines;                      // non-NULL only when our stack was not installed
   1.189 +        }
   1.190 +    }
   1.191 +    
   1.192 +    if (fEngines == NULL) {                      // note: this read of fEngines is not under the mutex
   1.193 +        return NULL;
   1.194 +    }
   1.195 +
   1.196 +    // We didn't find an engine the first time through, or there was no
   1.197 +    // stack. Create an engine.
   1.198 +    const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);   // done without holding the lock
   1.199 +    
   1.200 +    // Now get the lock, and see if someone else has created it in the
   1.201 +    // meantime
   1.202 +    umtx_lock(NULL);
   1.203 +    i = fEngines->size();
   1.204 +    while (--i >= 0) {                           // second pass: same scan as above, now under the lock again
   1.205 +        lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
   1.206 +        if (lbe != NULL && lbe->handles(c, breakType)) {
   1.207 +            break;
   1.208 +        }
   1.209 +        lbe = NULL;
   1.210 +    }
   1.211 +    if (lbe == NULL && newlbe != NULL) {
   1.212 +        fEngines->push((void *)newlbe, status);  // NOTE(review): status is never checked after this - confirm what happens to newlbe if push fails
   1.213 +        lbe = newlbe;
   1.214 +        newlbe = NULL;                           // so the delete below leaves the engine we just pushed alone
   1.215 +    }
   1.216 +    umtx_unlock(NULL);
   1.217 +    
   1.218 +    delete newlbe;                               // non-NULL only when an existing engine was found instead
   1.219 +
   1.220 +    return lbe;
   1.221 +}
   1.222 +
   1.223 +const LanguageBreakEngine *                      // Builds a dictionary-based engine for the script of c; NULL when none can be built.
   1.224 +ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
   1.225 +    UErrorCode status = U_ZERO_ERROR;
   1.226 +    UScriptCode code = uscript_getScript(c, &status);
   1.227 +    if (U_SUCCESS(status)) {
   1.228 +        DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
   1.229 +        if (m != NULL) {                         // without a dictionary matcher no engine is created
   1.230 +            const LanguageBreakEngine *engine = NULL;
   1.231 +            switch(code) {                       // each case passes m to the engine constructor for that script
   1.232 +            case USCRIPT_THAI:
   1.233 +                engine = new ThaiBreakEngine(m, status);
   1.234 +                break;
   1.235 +            case USCRIPT_LAO:
   1.236 +                engine = new LaoBreakEngine(m, status);
   1.237 +                break;
   1.238 +            case USCRIPT_KHMER:
   1.239 +                engine = new KhmerBreakEngine(m, status);
   1.240 +                break;
   1.241 +
   1.242 +#if !UCONFIG_NO_NORMALIZATION
   1.243 +                // CJK not available w/o normalization
   1.244 +            case USCRIPT_HANGUL:
   1.245 +                engine = new CjkBreakEngine(m, kKorean, status);
   1.246 +                break;
   1.247 +
   1.248 +            // use same BreakEngine and dictionary for both Chinese and Japanese
   1.249 +            case USCRIPT_HIRAGANA:
   1.250 +            case USCRIPT_KATAKANA:
   1.251 +            case USCRIPT_HAN:
   1.252 +                engine = new CjkBreakEngine(m, kChineseJapanese, status);
   1.253 +                break;
   1.254 +#if 0
   1.255 +            // TODO: Have to get some characters with script=common handled
   1.256 +            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
   1.257 +            // them to CjkBreakEngine does not work. The engine has to
   1.258 +            // special-case them.
   1.259 +            case USCRIPT_COMMON:
   1.260 +            {
   1.261 +                UBlockCode block = ublock_getCode(code);
   1.262 +                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
   1.263 +                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
   1.264 +                break;
   1.265 +            }
   1.266 +#endif
   1.267 +#endif
   1.268 +
   1.269 +            default:                             // any other script: engine stays NULL
   1.270 +                break;
   1.271 +            }
   1.272 +            if (engine == NULL) {
   1.273 +                delete m;                        // no engine was created to receive m, so free it here
   1.274 +            }
   1.275 +            else if (U_FAILURE(status)) {
   1.276 +                delete engine;                   // NOTE(review): m is not deleted separately here - presumably the engine owns it by now; confirm
   1.277 +                engine = NULL;
   1.278 +            }
   1.279 +            return engine;
   1.280 +        }
   1.281 +    }
   1.282 +    return NULL;                                 // script lookup failed or no dictionary matcher was available
   1.283 +}
   1.284 +
   1.285 +DictionaryMatcher *                              // Loads the dictionary named for `script` in the brkitr resources and wraps it in a matcher; NULL on any failure.
   1.286 +ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 
   1.287 +    UErrorCode status = U_ZERO_ERROR;
   1.288 +    // open root from brkitr tree.
   1.289 +    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
   1.290 +    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);   // b is also passed as the third argument, and is reassigned from the result
   1.291 +    int32_t dictnlength = 0;
   1.292 +    const UChar *dictfname =
   1.293 +        ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);   // entry is looked up by the script's short name
   1.294 +    if (U_FAILURE(status)) {                     // covers a failure in any of the three resource calls above
   1.295 +        ures_close(b);
   1.296 +        return NULL;
   1.297 +    }
   1.298 +    CharString dictnbuf;                         // receives the file name up to the last dot
   1.299 +    CharString ext;                              // receives the text after the last dot; stays empty when there is no dot
   1.300 +    const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
   1.301 +    if (extStart != NULL) {
   1.302 +        int32_t len = (int32_t)(extStart - dictfname);
   1.303 +        ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
   1.304 +        dictnlength = len;                       // cut the name off just before the dot
   1.305 +    }
   1.306 +    dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
   1.307 +    ures_close(b);                               // closed only after dictfname has been copied into the buffers
   1.308 +
   1.309 +    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);   // opened with the extension and base name split out above
   1.310 +    if (U_SUCCESS(status)) {
   1.311 +        // build trie
   1.312 +        const uint8_t *data = (const uint8_t *)udata_getMemory(file);
   1.313 +        const int32_t *indexes = (const int32_t *)data;   // the start of the data is read as an array of int32_t indexes
   1.314 +        const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];   // byte offset of the trie from `data`
   1.315 +        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
   1.316 +        DictionaryMatcher *m = NULL;
   1.317 +        if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
   1.318 +            const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
   1.319 +            const char *characters = (const char *)(data + offset);
   1.320 +            m = new BytesDictionaryMatcher(characters, transform, file);   // `file` is handed to the matcher
   1.321 +        }
   1.322 +        else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
   1.323 +            const UChar *characters = (const UChar *)(data + offset);
   1.324 +            m = new UCharsDictionaryMatcher(characters, file);   // `file` is handed to the matcher
   1.325 +        }
   1.326 +        if (m == NULL) {
   1.327 +            // no matcher exists to take ownership - either we are an invalid 
   1.328 +            // type or memory allocation failed
   1.329 +            udata_close(file);
   1.330 +        }
   1.331 +        return m;
   1.332 +    } else if (dictfname != NULL) {
   1.333 +        // we don't have a dictionary matcher.
   1.334 +        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
   1.335 +        status = U_ZERO_ERROR;                   // only resets the local status; the result is the same NULL as the final return
   1.336 +        return NULL;
   1.337 +    }
   1.338 +    return NULL;
   1.339 +}
   1.340 +
   1.341 +U_NAMESPACE_END
   1.342 +
   1.343 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial