1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/brkeng.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,340 @@ 1.4 +/* 1.5 + ************************************************************************************ 1.6 + * Copyright (C) 2006-2013, International Business Machines Corporation 1.7 + * and others. All Rights Reserved. 1.8 + ************************************************************************************ 1.9 + */ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_BREAK_ITERATION 1.14 + 1.15 +#include "brkeng.h" 1.16 +#include "dictbe.h" 1.17 +#include "unicode/uchar.h" 1.18 +#include "unicode/uniset.h" 1.19 +#include "unicode/chariter.h" 1.20 +#include "unicode/ures.h" 1.21 +#include "unicode/udata.h" 1.22 +#include "unicode/putil.h" 1.23 +#include "unicode/ustring.h" 1.24 +#include "unicode/uscript.h" 1.25 +#include "unicode/ucharstrie.h" 1.26 +#include "unicode/bytestrie.h" 1.27 +#include "charstr.h" 1.28 +#include "dictionarydata.h" 1.29 +#include "uvector.h" 1.30 +#include "umutex.h" 1.31 +#include "uresimp.h" 1.32 +#include "ubrkimpl.h" 1.33 + 1.34 +U_NAMESPACE_BEGIN 1.35 + 1.36 +/* 1.37 + ****************************************************************** 1.38 + */ 1.39 + 1.40 +LanguageBreakEngine::LanguageBreakEngine() { 1.41 +} 1.42 + 1.43 +LanguageBreakEngine::~LanguageBreakEngine() { 1.44 +} 1.45 + 1.46 +/* 1.47 + ****************************************************************** 1.48 + */ 1.49 + 1.50 +LanguageBreakFactory::LanguageBreakFactory() { 1.51 +} 1.52 + 1.53 +LanguageBreakFactory::~LanguageBreakFactory() { 1.54 +} 1.55 + 1.56 +/* 1.57 + ****************************************************************** 1.58 + */ 1.59 + 1.60 +UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { 1.61 + for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 1.62 + fHandled[i] = 0; 1.63 + } 1.64 +} 1.65 + 1.66 +UnhandledEngine::~UnhandledEngine() { 1.67 + for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 1.68 + if (fHandled[i] != 0) { 1.69 + delete fHandled[i]; 1.70 + } 1.71 + } 1.72 +} 1.73 + 1.74 +UBool 1.75 +UnhandledEngine::handles(UChar32 c, int32_t breakType) const { 1.76 + return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) 1.77 + && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); 1.78 +} 1.79 + 1.80 +int32_t 1.81 +UnhandledEngine::findBreaks( UText *text, 1.82 + int32_t startPos, 1.83 + int32_t endPos, 1.84 + UBool reverse, 1.85 + int32_t breakType, 1.86 + UStack &/*foundBreaks*/ ) const { 1.87 + if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 1.88 + UChar32 c = utext_current32(text); 1.89 + if (reverse) { 1.90 + while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { 1.91 + c = utext_previous32(text); 1.92 + } 1.93 + } 1.94 + else { 1.95 + while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { 1.96 + utext_next32(text); // TODO: recast loop to work with post-increment operations. 1.97 + c = utext_current32(text); 1.98 + } 1.99 + } 1.100 + } 1.101 + return 0; 1.102 +} 1.103 + 1.104 +void 1.105 +UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { 1.106 + if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 1.107 + if (fHandled[breakType] == 0) { 1.108 + fHandled[breakType] = new UnicodeSet(); 1.109 + if (fHandled[breakType] == 0) { 1.110 + return; 1.111 + } 1.112 + } 1.113 + if (!fHandled[breakType]->contains(c)) { 1.114 + UErrorCode status = U_ZERO_ERROR; 1.115 + // Apply the entire script of the character. 1.116 + int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 1.117 + fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 1.118 + } 1.119 + } 1.120 +} 1.121 + 1.122 +/* 1.123 + ****************************************************************** 1.124 + */ 1.125 + 1.126 +ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 1.127 + fEngines = 0; 1.128 +} 1.129 + 1.130 +ICULanguageBreakFactory::~ICULanguageBreakFactory() { 1.131 + if (fEngines != 0) { 1.132 + delete fEngines; 1.133 + } 1.134 +} 1.135 + 1.136 +U_NAMESPACE_END 1.137 +U_CDECL_BEGIN 1.138 +static void U_CALLCONV _deleteEngine(void *obj) { 1.139 + delete (const icu::LanguageBreakEngine *) obj; 1.140 +} 1.141 +U_CDECL_END 1.142 +U_NAMESPACE_BEGIN 1.143 + 1.144 +const LanguageBreakEngine * 1.145 +ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { 1.146 + UBool needsInit; 1.147 + int32_t i; 1.148 + const LanguageBreakEngine *lbe = NULL; 1.149 + UErrorCode status = U_ZERO_ERROR; 1.150 + 1.151 + // TODO: The global mutex should not be used. 1.152 + // The global mutex should only be used for short periods. 1.153 + // A ICULanguageBreakFactory specific mutex should be used. 1.154 + umtx_lock(NULL); 1.155 + needsInit = (UBool)(fEngines == NULL); 1.156 + if (!needsInit) { 1.157 + i = fEngines->size(); 1.158 + while (--i >= 0) { 1.159 + lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 1.160 + if (lbe != NULL && lbe->handles(c, breakType)) { 1.161 + break; 1.162 + } 1.163 + lbe = NULL; 1.164 + } 1.165 + } 1.166 + umtx_unlock(NULL); 1.167 + 1.168 + if (lbe != NULL) { 1.169 + return lbe; 1.170 + } 1.171 + 1.172 + if (needsInit) { 1.173 + UStack *engines = new UStack(_deleteEngine, NULL, status); 1.174 + if (U_SUCCESS(status) && engines == NULL) { 1.175 + status = U_MEMORY_ALLOCATION_ERROR; 1.176 + } 1.177 + else if (U_FAILURE(status)) { 1.178 + delete engines; 1.179 + engines = NULL; 1.180 + } 1.181 + else { 1.182 + umtx_lock(NULL); 1.183 + if (fEngines == NULL) { 1.184 + fEngines = engines; 1.185 + engines = NULL; 1.186 + } 1.187 + umtx_unlock(NULL); 1.188 + delete engines; 1.189 + } 1.190 + } 1.191 + 1.192 + if (fEngines == NULL) { 1.193 + return NULL; 1.194 + } 1.195 + 1.196 + // We didn't find an engine the first time through, or there was no 1.197 + // stack. Create an engine. 1.198 + const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); 1.199 + 1.200 + // Now get the lock, and see if someone else has created it in the 1.201 + // meantime 1.202 + umtx_lock(NULL); 1.203 + i = fEngines->size(); 1.204 + while (--i >= 0) { 1.205 + lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 1.206 + if (lbe != NULL && lbe->handles(c, breakType)) { 1.207 + break; 1.208 + } 1.209 + lbe = NULL; 1.210 + } 1.211 + if (lbe == NULL && newlbe != NULL) { 1.212 + fEngines->push((void *)newlbe, status); 1.213 + lbe = newlbe; 1.214 + newlbe = NULL; 1.215 + } 1.216 + umtx_unlock(NULL); 1.217 + 1.218 + delete newlbe; 1.219 + 1.220 + return lbe; 1.221 +} 1.222 + 1.223 +const LanguageBreakEngine * 1.224 +ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { 1.225 + UErrorCode status = U_ZERO_ERROR; 1.226 + UScriptCode code = uscript_getScript(c, &status); 1.227 + if (U_SUCCESS(status)) { 1.228 + DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); 1.229 + if (m != NULL) { 1.230 + const LanguageBreakEngine *engine = NULL; 1.231 + switch(code) { 1.232 + case USCRIPT_THAI: 1.233 + engine = new ThaiBreakEngine(m, status); 1.234 + break; 1.235 + case USCRIPT_LAO: 1.236 + engine = new LaoBreakEngine(m, status); 1.237 + break; 1.238 + case USCRIPT_KHMER: 1.239 + engine = new KhmerBreakEngine(m, status); 1.240 + break; 1.241 + 1.242 +#if !UCONFIG_NO_NORMALIZATION 1.243 + // CJK not available w/o normalization 1.244 + case USCRIPT_HANGUL: 1.245 + engine = new CjkBreakEngine(m, kKorean, status); 1.246 + break; 1.247 + 1.248 + // use same BreakEngine and dictionary for both Chinese and Japanese 1.249 + case USCRIPT_HIRAGANA: 1.250 + case USCRIPT_KATAKANA: 1.251 + case USCRIPT_HAN: 1.252 + engine = new CjkBreakEngine(m, kChineseJapanese, status); 1.253 + break; 1.254 +#if 0 1.255 + // TODO: Have to get some characters with script=common handled 1.256 + // by CjkBreakEngine (e.g. U+309B). Simply subjecting 1.257 + // them to CjkBreakEngine does not work. The engine has to 1.258 + // special-case them. 1.259 + case USCRIPT_COMMON: 1.260 + { 1.261 + UBlockCode block = ublock_getCode(code); 1.262 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 1.263 + engine = new CjkBreakEngine(dict, kChineseJapanese, status); 1.264 + break; 1.265 + } 1.266 +#endif 1.267 +#endif 1.268 + 1.269 + default: 1.270 + break; 1.271 + } 1.272 + if (engine == NULL) { 1.273 + delete m; 1.274 + } 1.275 + else if (U_FAILURE(status)) { 1.276 + delete engine; 1.277 + engine = NULL; 1.278 + } 1.279 + return engine; 1.280 + } 1.281 + } 1.282 + return NULL; 1.283 +} 1.284 + 1.285 +DictionaryMatcher * 1.286 +ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 1.287 + UErrorCode status = U_ZERO_ERROR; 1.288 + // open root from brkitr tree. 1.289 + UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 1.290 + b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 1.291 + int32_t dictnlength = 0; 1.292 + const UChar *dictfname = 1.293 + ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); 1.294 + if (U_FAILURE(status)) { 1.295 + ures_close(b); 1.296 + return NULL; 1.297 + } 1.298 + CharString dictnbuf; 1.299 + CharString ext; 1.300 + const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot 1.301 + if (extStart != NULL) { 1.302 + int32_t len = (int32_t)(extStart - dictfname); 1.303 + ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); 1.304 + dictnlength = len; 1.305 + } 1.306 + dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); 1.307 + ures_close(b); 1.308 + 1.309 + UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); 1.310 + if (U_SUCCESS(status)) { 1.311 + // build trie 1.312 + const uint8_t *data = (const uint8_t *)udata_getMemory(file); 1.313 + const int32_t *indexes = (const int32_t *)data; 1.314 + const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 1.315 + const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 1.316 + DictionaryMatcher *m = NULL; 1.317 + if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 1.318 + const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; 1.319 + const char *characters = (const char *)(data + offset); 1.320 + m = new BytesDictionaryMatcher(characters, transform, file); 1.321 + } 1.322 + else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 1.323 + const UChar *characters = (const UChar *)(data + offset); 1.324 + m = new UCharsDictionaryMatcher(characters, file); 1.325 + } 1.326 + if (m == NULL) { 1.327 + // no matcher exists to take ownership - either we are an invalid 1.328 + // type or memory allocation failed 1.329 + udata_close(file); 1.330 + } 1.331 + return m; 1.332 + } else if (dictfname != NULL) { 1.333 + // we don't have a dictionary matcher. 1.334 + // returning NULL here will cause us to fail to find a dictionary break engine, as expected 1.335 + status = U_ZERO_ERROR; 1.336 + return NULL; 1.337 + } 1.338 + return NULL; 1.339 +} 1.340 + 1.341 +U_NAMESPACE_END 1.342 + 1.343 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */