intl/icu/source/common/brkeng.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 ************************************************************************************
michael@0 3 * Copyright (C) 2006-2013, International Business Machines Corporation
michael@0 4 * and others. All Rights Reserved.
michael@0 5 ************************************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #if !UCONFIG_NO_BREAK_ITERATION
michael@0 11
michael@0 12 #include "brkeng.h"
michael@0 13 #include "dictbe.h"
michael@0 14 #include "unicode/uchar.h"
michael@0 15 #include "unicode/uniset.h"
michael@0 16 #include "unicode/chariter.h"
michael@0 17 #include "unicode/ures.h"
michael@0 18 #include "unicode/udata.h"
michael@0 19 #include "unicode/putil.h"
michael@0 20 #include "unicode/ustring.h"
michael@0 21 #include "unicode/uscript.h"
michael@0 22 #include "unicode/ucharstrie.h"
michael@0 23 #include "unicode/bytestrie.h"
michael@0 24 #include "charstr.h"
michael@0 25 #include "dictionarydata.h"
michael@0 26 #include "uvector.h"
michael@0 27 #include "umutex.h"
michael@0 28 #include "uresimp.h"
michael@0 29 #include "ubrkimpl.h"
michael@0 30
michael@0 31 U_NAMESPACE_BEGIN
michael@0 32
michael@0 33 /*
michael@0 34 ******************************************************************
michael@0 35 */
michael@0 36
michael@0 37 LanguageBreakEngine::LanguageBreakEngine() {
michael@0 38 }
michael@0 39
michael@0 40 LanguageBreakEngine::~LanguageBreakEngine() {
michael@0 41 }
michael@0 42
michael@0 43 /*
michael@0 44 ******************************************************************
michael@0 45 */
michael@0 46
michael@0 47 LanguageBreakFactory::LanguageBreakFactory() {
michael@0 48 }
michael@0 49
michael@0 50 LanguageBreakFactory::~LanguageBreakFactory() {
michael@0 51 }
michael@0 52
michael@0 53 /*
michael@0 54 ******************************************************************
michael@0 55 */
michael@0 56
michael@0 57 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
michael@0 58 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
michael@0 59 fHandled[i] = 0;
michael@0 60 }
michael@0 61 }
michael@0 62
michael@0 63 UnhandledEngine::~UnhandledEngine() {
michael@0 64 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
michael@0 65 if (fHandled[i] != 0) {
michael@0 66 delete fHandled[i];
michael@0 67 }
michael@0 68 }
michael@0 69 }
michael@0 70
michael@0 71 UBool
michael@0 72 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
michael@0 73 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
michael@0 74 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
michael@0 75 }
michael@0 76
michael@0 77 int32_t
michael@0 78 UnhandledEngine::findBreaks( UText *text,
michael@0 79 int32_t startPos,
michael@0 80 int32_t endPos,
michael@0 81 UBool reverse,
michael@0 82 int32_t breakType,
michael@0 83 UStack &/*foundBreaks*/ ) const {
michael@0 84 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
michael@0 85 UChar32 c = utext_current32(text);
michael@0 86 if (reverse) {
michael@0 87 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
michael@0 88 c = utext_previous32(text);
michael@0 89 }
michael@0 90 }
michael@0 91 else {
michael@0 92 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
michael@0 93 utext_next32(text); // TODO: recast loop to work with post-increment operations.
michael@0 94 c = utext_current32(text);
michael@0 95 }
michael@0 96 }
michael@0 97 }
michael@0 98 return 0;
michael@0 99 }
michael@0 100
michael@0 101 void
michael@0 102 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
michael@0 103 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
michael@0 104 if (fHandled[breakType] == 0) {
michael@0 105 fHandled[breakType] = new UnicodeSet();
michael@0 106 if (fHandled[breakType] == 0) {
michael@0 107 return;
michael@0 108 }
michael@0 109 }
michael@0 110 if (!fHandled[breakType]->contains(c)) {
michael@0 111 UErrorCode status = U_ZERO_ERROR;
michael@0 112 // Apply the entire script of the character.
michael@0 113 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
michael@0 114 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
michael@0 115 }
michael@0 116 }
michael@0 117 }
michael@0 118
michael@0 119 /*
michael@0 120 ******************************************************************
michael@0 121 */
michael@0 122
michael@0 123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
michael@0 124 fEngines = 0;
michael@0 125 }
michael@0 126
michael@0 127 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
michael@0 128 if (fEngines != 0) {
michael@0 129 delete fEngines;
michael@0 130 }
michael@0 131 }
michael@0 132
michael@0 133 U_NAMESPACE_END
michael@0 134 U_CDECL_BEGIN
michael@0 135 static void U_CALLCONV _deleteEngine(void *obj) {
michael@0 136 delete (const icu::LanguageBreakEngine *) obj;
michael@0 137 }
michael@0 138 U_CDECL_END
michael@0 139 U_NAMESPACE_BEGIN
michael@0 140
michael@0 141 const LanguageBreakEngine *
michael@0 142 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
michael@0 143 UBool needsInit;
michael@0 144 int32_t i;
michael@0 145 const LanguageBreakEngine *lbe = NULL;
michael@0 146 UErrorCode status = U_ZERO_ERROR;
michael@0 147
michael@0 148 // TODO: The global mutex should not be used.
michael@0 149 // The global mutex should only be used for short periods.
michael@0 150 // A ICULanguageBreakFactory specific mutex should be used.
michael@0 151 umtx_lock(NULL);
michael@0 152 needsInit = (UBool)(fEngines == NULL);
michael@0 153 if (!needsInit) {
michael@0 154 i = fEngines->size();
michael@0 155 while (--i >= 0) {
michael@0 156 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
michael@0 157 if (lbe != NULL && lbe->handles(c, breakType)) {
michael@0 158 break;
michael@0 159 }
michael@0 160 lbe = NULL;
michael@0 161 }
michael@0 162 }
michael@0 163 umtx_unlock(NULL);
michael@0 164
michael@0 165 if (lbe != NULL) {
michael@0 166 return lbe;
michael@0 167 }
michael@0 168
michael@0 169 if (needsInit) {
michael@0 170 UStack *engines = new UStack(_deleteEngine, NULL, status);
michael@0 171 if (U_SUCCESS(status) && engines == NULL) {
michael@0 172 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 173 }
michael@0 174 else if (U_FAILURE(status)) {
michael@0 175 delete engines;
michael@0 176 engines = NULL;
michael@0 177 }
michael@0 178 else {
michael@0 179 umtx_lock(NULL);
michael@0 180 if (fEngines == NULL) {
michael@0 181 fEngines = engines;
michael@0 182 engines = NULL;
michael@0 183 }
michael@0 184 umtx_unlock(NULL);
michael@0 185 delete engines;
michael@0 186 }
michael@0 187 }
michael@0 188
michael@0 189 if (fEngines == NULL) {
michael@0 190 return NULL;
michael@0 191 }
michael@0 192
michael@0 193 // We didn't find an engine the first time through, or there was no
michael@0 194 // stack. Create an engine.
michael@0 195 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
michael@0 196
michael@0 197 // Now get the lock, and see if someone else has created it in the
michael@0 198 // meantime
michael@0 199 umtx_lock(NULL);
michael@0 200 i = fEngines->size();
michael@0 201 while (--i >= 0) {
michael@0 202 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
michael@0 203 if (lbe != NULL && lbe->handles(c, breakType)) {
michael@0 204 break;
michael@0 205 }
michael@0 206 lbe = NULL;
michael@0 207 }
michael@0 208 if (lbe == NULL && newlbe != NULL) {
michael@0 209 fEngines->push((void *)newlbe, status);
michael@0 210 lbe = newlbe;
michael@0 211 newlbe = NULL;
michael@0 212 }
michael@0 213 umtx_unlock(NULL);
michael@0 214
michael@0 215 delete newlbe;
michael@0 216
michael@0 217 return lbe;
michael@0 218 }
michael@0 219
michael@0 220 const LanguageBreakEngine *
michael@0 221 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
michael@0 222 UErrorCode status = U_ZERO_ERROR;
michael@0 223 UScriptCode code = uscript_getScript(c, &status);
michael@0 224 if (U_SUCCESS(status)) {
michael@0 225 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
michael@0 226 if (m != NULL) {
michael@0 227 const LanguageBreakEngine *engine = NULL;
michael@0 228 switch(code) {
michael@0 229 case USCRIPT_THAI:
michael@0 230 engine = new ThaiBreakEngine(m, status);
michael@0 231 break;
michael@0 232 case USCRIPT_LAO:
michael@0 233 engine = new LaoBreakEngine(m, status);
michael@0 234 break;
michael@0 235 case USCRIPT_KHMER:
michael@0 236 engine = new KhmerBreakEngine(m, status);
michael@0 237 break;
michael@0 238
michael@0 239 #if !UCONFIG_NO_NORMALIZATION
michael@0 240 // CJK not available w/o normalization
michael@0 241 case USCRIPT_HANGUL:
michael@0 242 engine = new CjkBreakEngine(m, kKorean, status);
michael@0 243 break;
michael@0 244
michael@0 245 // use same BreakEngine and dictionary for both Chinese and Japanese
michael@0 246 case USCRIPT_HIRAGANA:
michael@0 247 case USCRIPT_KATAKANA:
michael@0 248 case USCRIPT_HAN:
michael@0 249 engine = new CjkBreakEngine(m, kChineseJapanese, status);
michael@0 250 break;
michael@0 251 #if 0
michael@0 252 // TODO: Have to get some characters with script=common handled
michael@0 253 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
michael@0 254 // them to CjkBreakEngine does not work. The engine has to
michael@0 255 // special-case them.
michael@0 256 case USCRIPT_COMMON:
michael@0 257 {
michael@0 258 UBlockCode block = ublock_getCode(code);
michael@0 259 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
michael@0 260 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
michael@0 261 break;
michael@0 262 }
michael@0 263 #endif
michael@0 264 #endif
michael@0 265
michael@0 266 default:
michael@0 267 break;
michael@0 268 }
michael@0 269 if (engine == NULL) {
michael@0 270 delete m;
michael@0 271 }
michael@0 272 else if (U_FAILURE(status)) {
michael@0 273 delete engine;
michael@0 274 engine = NULL;
michael@0 275 }
michael@0 276 return engine;
michael@0 277 }
michael@0 278 }
michael@0 279 return NULL;
michael@0 280 }
michael@0 281
michael@0 282 DictionaryMatcher *
michael@0 283 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
michael@0 284 UErrorCode status = U_ZERO_ERROR;
michael@0 285 // open root from brkitr tree.
michael@0 286 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
michael@0 287 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
michael@0 288 int32_t dictnlength = 0;
michael@0 289 const UChar *dictfname =
michael@0 290 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
michael@0 291 if (U_FAILURE(status)) {
michael@0 292 ures_close(b);
michael@0 293 return NULL;
michael@0 294 }
michael@0 295 CharString dictnbuf;
michael@0 296 CharString ext;
michael@0 297 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
michael@0 298 if (extStart != NULL) {
michael@0 299 int32_t len = (int32_t)(extStart - dictfname);
michael@0 300 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
michael@0 301 dictnlength = len;
michael@0 302 }
michael@0 303 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
michael@0 304 ures_close(b);
michael@0 305
michael@0 306 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
michael@0 307 if (U_SUCCESS(status)) {
michael@0 308 // build trie
michael@0 309 const uint8_t *data = (const uint8_t *)udata_getMemory(file);
michael@0 310 const int32_t *indexes = (const int32_t *)data;
michael@0 311 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
michael@0 312 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
michael@0 313 DictionaryMatcher *m = NULL;
michael@0 314 if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
michael@0 315 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
michael@0 316 const char *characters = (const char *)(data + offset);
michael@0 317 m = new BytesDictionaryMatcher(characters, transform, file);
michael@0 318 }
michael@0 319 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
michael@0 320 const UChar *characters = (const UChar *)(data + offset);
michael@0 321 m = new UCharsDictionaryMatcher(characters, file);
michael@0 322 }
michael@0 323 if (m == NULL) {
michael@0 324 // no matcher exists to take ownership - either we are an invalid
michael@0 325 // type or memory allocation failed
michael@0 326 udata_close(file);
michael@0 327 }
michael@0 328 return m;
michael@0 329 } else if (dictfname != NULL) {
michael@0 330 // we don't have a dictionary matcher.
michael@0 331 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
michael@0 332 status = U_ZERO_ERROR;
michael@0 333 return NULL;
michael@0 334 }
michael@0 335 return NULL;
michael@0 336 }
michael@0 337
michael@0 338 U_NAMESPACE_END
michael@0 339
michael@0 340 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial