michael@0: /* michael@0: ************************************************************************************ michael@0: * Copyright (C) 2006-2013, International Business Machines Corporation michael@0: * and others. All Rights Reserved. michael@0: ************************************************************************************ michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "brkeng.h" michael@0: #include "dictbe.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/chariter.h" michael@0: #include "unicode/ures.h" michael@0: #include "unicode/udata.h" michael@0: #include "unicode/putil.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/uscript.h" michael@0: #include "unicode/ucharstrie.h" michael@0: #include "unicode/bytestrie.h" michael@0: #include "charstr.h" michael@0: #include "dictionarydata.h" michael@0: #include "uvector.h" michael@0: #include "umutex.h" michael@0: #include "uresimp.h" michael@0: #include "ubrkimpl.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /* michael@0: ****************************************************************** michael@0: */ michael@0: michael@0: LanguageBreakEngine::LanguageBreakEngine() { michael@0: } michael@0: michael@0: LanguageBreakEngine::~LanguageBreakEngine() { michael@0: } michael@0: michael@0: /* michael@0: ****************************************************************** michael@0: */ michael@0: michael@0: LanguageBreakFactory::LanguageBreakFactory() { michael@0: } michael@0: michael@0: LanguageBreakFactory::~LanguageBreakFactory() { michael@0: } michael@0: michael@0: /* michael@0: ****************************************************************** michael@0: */ michael@0: michael@0: UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { michael@0: for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { michael@0: fHandled[i] = 0; michael@0: } michael@0: } michael@0: michael@0: UnhandledEngine::~UnhandledEngine() { michael@0: for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { michael@0: if (fHandled[i] != 0) { michael@0: delete fHandled[i]; michael@0: } michael@0: } michael@0: } michael@0: michael@0: UBool michael@0: UnhandledEngine::handles(UChar32 c, int32_t breakType) const { michael@0: return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) michael@0: && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); michael@0: } michael@0: michael@0: int32_t michael@0: UnhandledEngine::findBreaks( UText *text, michael@0: int32_t startPos, michael@0: int32_t endPos, michael@0: UBool reverse, michael@0: int32_t breakType, michael@0: UStack &/*foundBreaks*/ ) const { michael@0: if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { michael@0: UChar32 c = utext_current32(text); michael@0: if (reverse) { michael@0: while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { michael@0: c = utext_previous32(text); michael@0: } michael@0: } michael@0: else { michael@0: while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { michael@0: utext_next32(text); // TODO: recast loop to work with post-increment operations. michael@0: c = utext_current32(text); michael@0: } michael@0: } michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: void michael@0: UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { michael@0: if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { michael@0: if (fHandled[breakType] == 0) { michael@0: fHandled[breakType] = new UnicodeSet(); michael@0: if (fHandled[breakType] == 0) { michael@0: return; michael@0: } michael@0: } michael@0: if (!fHandled[breakType]->contains(c)) { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: // Apply the entire script of the character. michael@0: int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); michael@0: fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* michael@0: ****************************************************************** michael@0: */ michael@0: michael@0: ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { michael@0: fEngines = 0; michael@0: } michael@0: michael@0: ICULanguageBreakFactory::~ICULanguageBreakFactory() { michael@0: if (fEngines != 0) { michael@0: delete fEngines; michael@0: } michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: U_CDECL_BEGIN michael@0: static void U_CALLCONV _deleteEngine(void *obj) { michael@0: delete (const icu::LanguageBreakEngine *) obj; michael@0: } michael@0: U_CDECL_END michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: const LanguageBreakEngine * michael@0: ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { michael@0: UBool needsInit; michael@0: int32_t i; michael@0: const LanguageBreakEngine *lbe = NULL; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: michael@0: // TODO: The global mutex should not be used. michael@0: // The global mutex should only be used for short periods. michael@0: // A ICULanguageBreakFactory specific mutex should be used. michael@0: umtx_lock(NULL); michael@0: needsInit = (UBool)(fEngines == NULL); michael@0: if (!needsInit) { michael@0: i = fEngines->size(); michael@0: while (--i >= 0) { michael@0: lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); michael@0: if (lbe != NULL && lbe->handles(c, breakType)) { michael@0: break; michael@0: } michael@0: lbe = NULL; michael@0: } michael@0: } michael@0: umtx_unlock(NULL); michael@0: michael@0: if (lbe != NULL) { michael@0: return lbe; michael@0: } michael@0: michael@0: if (needsInit) { michael@0: UStack *engines = new UStack(_deleteEngine, NULL, status); michael@0: if (U_SUCCESS(status) && engines == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: else if (U_FAILURE(status)) { michael@0: delete engines; michael@0: engines = NULL; michael@0: } michael@0: else { michael@0: umtx_lock(NULL); michael@0: if (fEngines == NULL) { michael@0: fEngines = engines; michael@0: engines = NULL; michael@0: } michael@0: umtx_unlock(NULL); michael@0: delete engines; michael@0: } michael@0: } michael@0: michael@0: if (fEngines == NULL) { michael@0: return NULL; michael@0: } michael@0: michael@0: // We didn't find an engine the first time through, or there was no michael@0: // stack. Create an engine. michael@0: const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); michael@0: michael@0: // Now get the lock, and see if someone else has created it in the michael@0: // meantime michael@0: umtx_lock(NULL); michael@0: i = fEngines->size(); michael@0: while (--i >= 0) { michael@0: lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); michael@0: if (lbe != NULL && lbe->handles(c, breakType)) { michael@0: break; michael@0: } michael@0: lbe = NULL; michael@0: } michael@0: if (lbe == NULL && newlbe != NULL) { michael@0: fEngines->push((void *)newlbe, status); michael@0: lbe = newlbe; michael@0: newlbe = NULL; michael@0: } michael@0: umtx_unlock(NULL); michael@0: michael@0: delete newlbe; michael@0: michael@0: return lbe; michael@0: } michael@0: michael@0: const LanguageBreakEngine * michael@0: ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: UScriptCode code = uscript_getScript(c, &status); michael@0: if (U_SUCCESS(status)) { michael@0: DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); michael@0: if (m != NULL) { michael@0: const LanguageBreakEngine *engine = NULL; michael@0: switch(code) { michael@0: case USCRIPT_THAI: michael@0: engine = new ThaiBreakEngine(m, status); michael@0: break; michael@0: case USCRIPT_LAO: michael@0: engine = new LaoBreakEngine(m, status); michael@0: break; michael@0: case USCRIPT_KHMER: michael@0: engine = new KhmerBreakEngine(m, status); michael@0: break; michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: // CJK not available w/o normalization michael@0: case USCRIPT_HANGUL: michael@0: engine = new CjkBreakEngine(m, kKorean, status); michael@0: break; michael@0: michael@0: // use same BreakEngine and dictionary for both Chinese and Japanese michael@0: case USCRIPT_HIRAGANA: michael@0: case USCRIPT_KATAKANA: michael@0: case USCRIPT_HAN: michael@0: engine = new CjkBreakEngine(m, kChineseJapanese, status); michael@0: break; michael@0: #if 0 michael@0: // TODO: Have to get some characters with script=common handled michael@0: // by CjkBreakEngine (e.g. U+309B). Simply subjecting michael@0: // them to CjkBreakEngine does not work. The engine has to michael@0: // special-case them. michael@0: case USCRIPT_COMMON: michael@0: { michael@0: UBlockCode block = ublock_getCode(code); michael@0: if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) michael@0: engine = new CjkBreakEngine(dict, kChineseJapanese, status); michael@0: break; michael@0: } michael@0: #endif michael@0: #endif michael@0: michael@0: default: michael@0: break; michael@0: } michael@0: if (engine == NULL) { michael@0: delete m; michael@0: } michael@0: else if (U_FAILURE(status)) { michael@0: delete engine; michael@0: engine = NULL; michael@0: } michael@0: return engine; michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: DictionaryMatcher * michael@0: ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: // open root from brkitr tree. michael@0: UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); michael@0: b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); michael@0: int32_t dictnlength = 0; michael@0: const UChar *dictfname = michael@0: ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); michael@0: if (U_FAILURE(status)) { michael@0: ures_close(b); michael@0: return NULL; michael@0: } michael@0: CharString dictnbuf; michael@0: CharString ext; michael@0: const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot michael@0: if (extStart != NULL) { michael@0: int32_t len = (int32_t)(extStart - dictfname); michael@0: ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); michael@0: dictnlength = len; michael@0: } michael@0: dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); michael@0: ures_close(b); michael@0: michael@0: UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); michael@0: if (U_SUCCESS(status)) { michael@0: // build trie michael@0: const uint8_t *data = (const uint8_t *)udata_getMemory(file); michael@0: const int32_t *indexes = (const int32_t *)data; michael@0: const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; michael@0: const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; michael@0: DictionaryMatcher *m = NULL; michael@0: if (trieType == DictionaryData::TRIE_TYPE_BYTES) { michael@0: const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; michael@0: const char *characters = (const char *)(data + offset); michael@0: m = new BytesDictionaryMatcher(characters, transform, file); michael@0: } michael@0: else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { michael@0: const UChar *characters = (const UChar *)(data + offset); michael@0: m = new UCharsDictionaryMatcher(characters, file); michael@0: } michael@0: if (m == NULL) { michael@0: // no matcher exists to take ownership - either we are an invalid michael@0: // type or memory allocation failed michael@0: udata_close(file); michael@0: } michael@0: return m; michael@0: } else if (dictfname != NULL) { michael@0: // we don't have a dictionary matcher. michael@0: // returning NULL here will cause us to fail to find a dictionary break engine, as expected michael@0: status = U_ZERO_ERROR; michael@0: return NULL; michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_BREAK_ITERATION */