Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ************************************************************************************ |
michael@0 | 3 | * Copyright (C) 2006-2013, International Business Machines Corporation |
michael@0 | 4 | * and others. All Rights Reserved. |
michael@0 | 5 | ************************************************************************************ |
michael@0 | 6 | */ |
michael@0 | 7 | |
michael@0 | 8 | #include "unicode/utypes.h" |
michael@0 | 9 | |
michael@0 | 10 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 11 | |
michael@0 | 12 | #include "brkeng.h" |
michael@0 | 13 | #include "dictbe.h" |
michael@0 | 14 | #include "unicode/uchar.h" |
michael@0 | 15 | #include "unicode/uniset.h" |
michael@0 | 16 | #include "unicode/chariter.h" |
michael@0 | 17 | #include "unicode/ures.h" |
michael@0 | 18 | #include "unicode/udata.h" |
michael@0 | 19 | #include "unicode/putil.h" |
michael@0 | 20 | #include "unicode/ustring.h" |
michael@0 | 21 | #include "unicode/uscript.h" |
michael@0 | 22 | #include "unicode/ucharstrie.h" |
michael@0 | 23 | #include "unicode/bytestrie.h" |
michael@0 | 24 | #include "charstr.h" |
michael@0 | 25 | #include "dictionarydata.h" |
michael@0 | 26 | #include "uvector.h" |
michael@0 | 27 | #include "umutex.h" |
michael@0 | 28 | #include "uresimp.h" |
michael@0 | 29 | #include "ubrkimpl.h" |
michael@0 | 30 | |
michael@0 | 31 | U_NAMESPACE_BEGIN |
michael@0 | 32 | |
michael@0 | 33 | /* |
michael@0 | 34 | ****************************************************************** |
michael@0 | 35 | */ |
michael@0 | 36 | |
michael@0 | 37 | LanguageBreakEngine::LanguageBreakEngine() { |
michael@0 | 38 | } |
michael@0 | 39 | |
michael@0 | 40 | LanguageBreakEngine::~LanguageBreakEngine() { |
michael@0 | 41 | } |
michael@0 | 42 | |
michael@0 | 43 | /* |
michael@0 | 44 | ****************************************************************** |
michael@0 | 45 | */ |
michael@0 | 46 | |
michael@0 | 47 | LanguageBreakFactory::LanguageBreakFactory() { |
michael@0 | 48 | } |
michael@0 | 49 | |
michael@0 | 50 | LanguageBreakFactory::~LanguageBreakFactory() { |
michael@0 | 51 | } |
michael@0 | 52 | |
michael@0 | 53 | /* |
michael@0 | 54 | ****************************************************************** |
michael@0 | 55 | */ |
michael@0 | 56 | |
michael@0 | 57 | UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { |
michael@0 | 58 | for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { |
michael@0 | 59 | fHandled[i] = 0; |
michael@0 | 60 | } |
michael@0 | 61 | } |
michael@0 | 62 | |
michael@0 | 63 | UnhandledEngine::~UnhandledEngine() { |
michael@0 | 64 | for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { |
michael@0 | 65 | if (fHandled[i] != 0) { |
michael@0 | 66 | delete fHandled[i]; |
michael@0 | 67 | } |
michael@0 | 68 | } |
michael@0 | 69 | } |
michael@0 | 70 | |
michael@0 | 71 | UBool |
michael@0 | 72 | UnhandledEngine::handles(UChar32 c, int32_t breakType) const { |
michael@0 | 73 | return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) |
michael@0 | 74 | && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); |
michael@0 | 75 | } |
michael@0 | 76 | |
michael@0 | 77 | int32_t |
michael@0 | 78 | UnhandledEngine::findBreaks( UText *text, |
michael@0 | 79 | int32_t startPos, |
michael@0 | 80 | int32_t endPos, |
michael@0 | 81 | UBool reverse, |
michael@0 | 82 | int32_t breakType, |
michael@0 | 83 | UStack &/*foundBreaks*/ ) const { |
michael@0 | 84 | if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { |
michael@0 | 85 | UChar32 c = utext_current32(text); |
michael@0 | 86 | if (reverse) { |
michael@0 | 87 | while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { |
michael@0 | 88 | c = utext_previous32(text); |
michael@0 | 89 | } |
michael@0 | 90 | } |
michael@0 | 91 | else { |
michael@0 | 92 | while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { |
michael@0 | 93 | utext_next32(text); // TODO: recast loop to work with post-increment operations. |
michael@0 | 94 | c = utext_current32(text); |
michael@0 | 95 | } |
michael@0 | 96 | } |
michael@0 | 97 | } |
michael@0 | 98 | return 0; |
michael@0 | 99 | } |
michael@0 | 100 | |
michael@0 | 101 | void |
michael@0 | 102 | UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { |
michael@0 | 103 | if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { |
michael@0 | 104 | if (fHandled[breakType] == 0) { |
michael@0 | 105 | fHandled[breakType] = new UnicodeSet(); |
michael@0 | 106 | if (fHandled[breakType] == 0) { |
michael@0 | 107 | return; |
michael@0 | 108 | } |
michael@0 | 109 | } |
michael@0 | 110 | if (!fHandled[breakType]->contains(c)) { |
michael@0 | 111 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 112 | // Apply the entire script of the character. |
michael@0 | 113 | int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); |
michael@0 | 114 | fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); |
michael@0 | 115 | } |
michael@0 | 116 | } |
michael@0 | 117 | } |
michael@0 | 118 | |
michael@0 | 119 | /* |
michael@0 | 120 | ****************************************************************** |
michael@0 | 121 | */ |
michael@0 | 122 | |
michael@0 | 123 | ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { |
michael@0 | 124 | fEngines = 0; |
michael@0 | 125 | } |
michael@0 | 126 | |
michael@0 | 127 | ICULanguageBreakFactory::~ICULanguageBreakFactory() { |
michael@0 | 128 | if (fEngines != 0) { |
michael@0 | 129 | delete fEngines; |
michael@0 | 130 | } |
michael@0 | 131 | } |
michael@0 | 132 | |
michael@0 | 133 | U_NAMESPACE_END |
michael@0 | 134 | U_CDECL_BEGIN |
michael@0 | 135 | static void U_CALLCONV _deleteEngine(void *obj) { |
michael@0 | 136 | delete (const icu::LanguageBreakEngine *) obj; |
michael@0 | 137 | } |
michael@0 | 138 | U_CDECL_END |
michael@0 | 139 | U_NAMESPACE_BEGIN |
michael@0 | 140 | |
michael@0 | 141 | const LanguageBreakEngine * |
michael@0 | 142 | ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { |
michael@0 | 143 | UBool needsInit; |
michael@0 | 144 | int32_t i; |
michael@0 | 145 | const LanguageBreakEngine *lbe = NULL; |
michael@0 | 146 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 147 | |
michael@0 | 148 | // TODO: The global mutex should not be used. |
michael@0 | 149 | // The global mutex should only be used for short periods. |
michael@0 | 150 | // A ICULanguageBreakFactory specific mutex should be used. |
michael@0 | 151 | umtx_lock(NULL); |
michael@0 | 152 | needsInit = (UBool)(fEngines == NULL); |
michael@0 | 153 | if (!needsInit) { |
michael@0 | 154 | i = fEngines->size(); |
michael@0 | 155 | while (--i >= 0) { |
michael@0 | 156 | lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); |
michael@0 | 157 | if (lbe != NULL && lbe->handles(c, breakType)) { |
michael@0 | 158 | break; |
michael@0 | 159 | } |
michael@0 | 160 | lbe = NULL; |
michael@0 | 161 | } |
michael@0 | 162 | } |
michael@0 | 163 | umtx_unlock(NULL); |
michael@0 | 164 | |
michael@0 | 165 | if (lbe != NULL) { |
michael@0 | 166 | return lbe; |
michael@0 | 167 | } |
michael@0 | 168 | |
michael@0 | 169 | if (needsInit) { |
michael@0 | 170 | UStack *engines = new UStack(_deleteEngine, NULL, status); |
michael@0 | 171 | if (U_SUCCESS(status) && engines == NULL) { |
michael@0 | 172 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 173 | } |
michael@0 | 174 | else if (U_FAILURE(status)) { |
michael@0 | 175 | delete engines; |
michael@0 | 176 | engines = NULL; |
michael@0 | 177 | } |
michael@0 | 178 | else { |
michael@0 | 179 | umtx_lock(NULL); |
michael@0 | 180 | if (fEngines == NULL) { |
michael@0 | 181 | fEngines = engines; |
michael@0 | 182 | engines = NULL; |
michael@0 | 183 | } |
michael@0 | 184 | umtx_unlock(NULL); |
michael@0 | 185 | delete engines; |
michael@0 | 186 | } |
michael@0 | 187 | } |
michael@0 | 188 | |
michael@0 | 189 | if (fEngines == NULL) { |
michael@0 | 190 | return NULL; |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | // We didn't find an engine the first time through, or there was no |
michael@0 | 194 | // stack. Create an engine. |
michael@0 | 195 | const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); |
michael@0 | 196 | |
michael@0 | 197 | // Now get the lock, and see if someone else has created it in the |
michael@0 | 198 | // meantime |
michael@0 | 199 | umtx_lock(NULL); |
michael@0 | 200 | i = fEngines->size(); |
michael@0 | 201 | while (--i >= 0) { |
michael@0 | 202 | lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); |
michael@0 | 203 | if (lbe != NULL && lbe->handles(c, breakType)) { |
michael@0 | 204 | break; |
michael@0 | 205 | } |
michael@0 | 206 | lbe = NULL; |
michael@0 | 207 | } |
michael@0 | 208 | if (lbe == NULL && newlbe != NULL) { |
michael@0 | 209 | fEngines->push((void *)newlbe, status); |
michael@0 | 210 | lbe = newlbe; |
michael@0 | 211 | newlbe = NULL; |
michael@0 | 212 | } |
michael@0 | 213 | umtx_unlock(NULL); |
michael@0 | 214 | |
michael@0 | 215 | delete newlbe; |
michael@0 | 216 | |
michael@0 | 217 | return lbe; |
michael@0 | 218 | } |
michael@0 | 219 | |
michael@0 | 220 | const LanguageBreakEngine * |
michael@0 | 221 | ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { |
michael@0 | 222 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 223 | UScriptCode code = uscript_getScript(c, &status); |
michael@0 | 224 | if (U_SUCCESS(status)) { |
michael@0 | 225 | DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); |
michael@0 | 226 | if (m != NULL) { |
michael@0 | 227 | const LanguageBreakEngine *engine = NULL; |
michael@0 | 228 | switch(code) { |
michael@0 | 229 | case USCRIPT_THAI: |
michael@0 | 230 | engine = new ThaiBreakEngine(m, status); |
michael@0 | 231 | break; |
michael@0 | 232 | case USCRIPT_LAO: |
michael@0 | 233 | engine = new LaoBreakEngine(m, status); |
michael@0 | 234 | break; |
michael@0 | 235 | case USCRIPT_KHMER: |
michael@0 | 236 | engine = new KhmerBreakEngine(m, status); |
michael@0 | 237 | break; |
michael@0 | 238 | |
michael@0 | 239 | #if !UCONFIG_NO_NORMALIZATION |
michael@0 | 240 | // CJK not available w/o normalization |
michael@0 | 241 | case USCRIPT_HANGUL: |
michael@0 | 242 | engine = new CjkBreakEngine(m, kKorean, status); |
michael@0 | 243 | break; |
michael@0 | 244 | |
michael@0 | 245 | // use same BreakEngine and dictionary for both Chinese and Japanese |
michael@0 | 246 | case USCRIPT_HIRAGANA: |
michael@0 | 247 | case USCRIPT_KATAKANA: |
michael@0 | 248 | case USCRIPT_HAN: |
michael@0 | 249 | engine = new CjkBreakEngine(m, kChineseJapanese, status); |
michael@0 | 250 | break; |
michael@0 | 251 | #if 0 |
michael@0 | 252 | // TODO: Have to get some characters with script=common handled |
michael@0 | 253 | // by CjkBreakEngine (e.g. U+309B). Simply subjecting |
michael@0 | 254 | // them to CjkBreakEngine does not work. The engine has to |
michael@0 | 255 | // special-case them. |
michael@0 | 256 | case USCRIPT_COMMON: |
michael@0 | 257 | { |
michael@0 | 258 | UBlockCode block = ublock_getCode(code); |
michael@0 | 259 | if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) |
michael@0 | 260 | engine = new CjkBreakEngine(dict, kChineseJapanese, status); |
michael@0 | 261 | break; |
michael@0 | 262 | } |
michael@0 | 263 | #endif |
michael@0 | 264 | #endif |
michael@0 | 265 | |
michael@0 | 266 | default: |
michael@0 | 267 | break; |
michael@0 | 268 | } |
michael@0 | 269 | if (engine == NULL) { |
michael@0 | 270 | delete m; |
michael@0 | 271 | } |
michael@0 | 272 | else if (U_FAILURE(status)) { |
michael@0 | 273 | delete engine; |
michael@0 | 274 | engine = NULL; |
michael@0 | 275 | } |
michael@0 | 276 | return engine; |
michael@0 | 277 | } |
michael@0 | 278 | } |
michael@0 | 279 | return NULL; |
michael@0 | 280 | } |
michael@0 | 281 | |
michael@0 | 282 | DictionaryMatcher * |
michael@0 | 283 | ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { |
michael@0 | 284 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 285 | // open root from brkitr tree. |
michael@0 | 286 | UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); |
michael@0 | 287 | b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); |
michael@0 | 288 | int32_t dictnlength = 0; |
michael@0 | 289 | const UChar *dictfname = |
michael@0 | 290 | ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); |
michael@0 | 291 | if (U_FAILURE(status)) { |
michael@0 | 292 | ures_close(b); |
michael@0 | 293 | return NULL; |
michael@0 | 294 | } |
michael@0 | 295 | CharString dictnbuf; |
michael@0 | 296 | CharString ext; |
michael@0 | 297 | const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot |
michael@0 | 298 | if (extStart != NULL) { |
michael@0 | 299 | int32_t len = (int32_t)(extStart - dictfname); |
michael@0 | 300 | ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); |
michael@0 | 301 | dictnlength = len; |
michael@0 | 302 | } |
michael@0 | 303 | dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); |
michael@0 | 304 | ures_close(b); |
michael@0 | 305 | |
michael@0 | 306 | UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); |
michael@0 | 307 | if (U_SUCCESS(status)) { |
michael@0 | 308 | // build trie |
michael@0 | 309 | const uint8_t *data = (const uint8_t *)udata_getMemory(file); |
michael@0 | 310 | const int32_t *indexes = (const int32_t *)data; |
michael@0 | 311 | const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; |
michael@0 | 312 | const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; |
michael@0 | 313 | DictionaryMatcher *m = NULL; |
michael@0 | 314 | if (trieType == DictionaryData::TRIE_TYPE_BYTES) { |
michael@0 | 315 | const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; |
michael@0 | 316 | const char *characters = (const char *)(data + offset); |
michael@0 | 317 | m = new BytesDictionaryMatcher(characters, transform, file); |
michael@0 | 318 | } |
michael@0 | 319 | else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { |
michael@0 | 320 | const UChar *characters = (const UChar *)(data + offset); |
michael@0 | 321 | m = new UCharsDictionaryMatcher(characters, file); |
michael@0 | 322 | } |
michael@0 | 323 | if (m == NULL) { |
michael@0 | 324 | // no matcher exists to take ownership - either we are an invalid |
michael@0 | 325 | // type or memory allocation failed |
michael@0 | 326 | udata_close(file); |
michael@0 | 327 | } |
michael@0 | 328 | return m; |
michael@0 | 329 | } else if (dictfname != NULL) { |
michael@0 | 330 | // we don't have a dictionary matcher. |
michael@0 | 331 | // returning NULL here will cause us to fail to find a dictionary break engine, as expected |
michael@0 | 332 | status = U_ZERO_ERROR; |
michael@0 | 333 | return NULL; |
michael@0 | 334 | } |
michael@0 | 335 | return NULL; |
michael@0 | 336 | } |
michael@0 | 337 | |
michael@0 | 338 | U_NAMESPACE_END |
michael@0 | 339 | |
michael@0 | 340 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |