/**
 *******************************************************************************
 * Copyright (C) 2006-2013, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvector.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "dictionarydata.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

// Base class for all dictionary-based break engines.  fTypes is a bit set
// of the UBreakIteratorType values (UBRK_WORD, UBRK_LINE, ...) this engine
// supports; fSet is the set of characters the engine handles.
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
    fTypes = breakTypes;
}

DictionaryBreakEngine::~DictionaryBreakEngine() {
}

// Returns TRUE if this engine handles character c for the given break type,
// i.e. the type's bit is set in fTypes and c is in the engine's character set.
UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
            && fSet.contains(c));
}

// Finds breaks in the maximal run of fSet characters around the current
// text position, delegating the actual segmentation to the subclass's
// divideUpDictionaryRange().  Returns the number of breaks found; break
// positions are pushed onto foundBreaks.
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        // Scan backwards to find the start of the run of dictionary
        // characters that ends at the starting position.
        // NOTE(review): current+(isDict ? 0 : 1) compensates for having
        // stepped one character past the run start; confirm against the
        // iterator semantics of utext_previous32 (it both moves and returns).
        UBool   isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        // Scan forwards to find the end of the run of dictionary characters.
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    // Only segment if this engine supports the requested break type.
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }

    return result;
}

// Sets (copies) the set of characters handled by this engine.
void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}

/*
 ******************************************************************
 * PossibleWord
 */

// Helper class for improving readability of the Thai/Lao/Khmer word break
// algorithm. The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
1.102 +#define POSSIBLE_WORD_LIST_MAX 20 1.103 + 1.104 +class PossibleWord { 1.105 +private: 1.106 + // list of word candidate lengths, in increasing length order 1.107 + int32_t lengths[POSSIBLE_WORD_LIST_MAX]; 1.108 + int32_t count; // Count of candidates 1.109 + int32_t prefix; // The longest match with a dictionary word 1.110 + int32_t offset; // Offset in the text of these candidates 1.111 + int mark; // The preferred candidate's offset 1.112 + int current; // The candidate we're currently looking at 1.113 + 1.114 +public: 1.115 + PossibleWord(); 1.116 + ~PossibleWord(); 1.117 + 1.118 + // Fill the list of candidates if needed, select the longest, and return the number found 1.119 + int candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); 1.120 + 1.121 + // Select the currently marked candidate, point after it in the text, and invalidate self 1.122 + int32_t acceptMarked( UText *text ); 1.123 + 1.124 + // Back up from the current candidate to the next shorter one; return TRUE if that exists 1.125 + // and point the text after it 1.126 + UBool backUp( UText *text ); 1.127 + 1.128 + // Return the longest prefix this candidate location shares with a dictionary word 1.129 + int32_t longestPrefix(); 1.130 + 1.131 + // Mark the current candidate as the one we like 1.132 + void markCurrent(); 1.133 +}; 1.134 + 1.135 +inline 1.136 +PossibleWord::PossibleWord() { 1.137 + offset = -1; 1.138 +} 1.139 + 1.140 +inline 1.141 +PossibleWord::~PossibleWord() { 1.142 +} 1.143 + 1.144 +inline int 1.145 +PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { 1.146 + // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 1.147 + int32_t start = (int32_t)utext_getNativeIndex(text); 1.148 + if (start != offset) { 1.149 + offset = start; 1.150 + prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); 1.151 + // Dictionary leaves text after longest prefix, not longest word. Back up. 
1.152 + if (count <= 0) { 1.153 + utext_setNativeIndex(text, start); 1.154 + } 1.155 + } 1.156 + if (count > 0) { 1.157 + utext_setNativeIndex(text, start+lengths[count-1]); 1.158 + } 1.159 + current = count-1; 1.160 + mark = current; 1.161 + return count; 1.162 +} 1.163 + 1.164 +inline int32_t 1.165 +PossibleWord::acceptMarked( UText *text ) { 1.166 + utext_setNativeIndex(text, offset + lengths[mark]); 1.167 + return lengths[mark]; 1.168 +} 1.169 + 1.170 +inline UBool 1.171 +PossibleWord::backUp( UText *text ) { 1.172 + if (current > 0) { 1.173 + utext_setNativeIndex(text, offset + lengths[--current]); 1.174 + return TRUE; 1.175 + } 1.176 + return FALSE; 1.177 +} 1.178 + 1.179 +inline int32_t 1.180 +PossibleWord::longestPrefix() { 1.181 + return prefix; 1.182 +} 1.183 + 1.184 +inline void 1.185 +PossibleWord::markCurrent() { 1.186 + mark = current; 1.187 +} 1.188 + 1.189 +/* 1.190 + ****************************************************************** 1.191 + * ThaiBreakEngine 1.192 + */ 1.193 + 1.194 +// How many words in a row are "good enough"? 
#define THAI_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define THAI_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define THAI_PREFIX_COMBINE_THRESHOLD 3

// Ellision character
#define THAI_PAIYANNOI 0x0E2F

// Repeat character
#define THAI_MAIYAMOK 0x0E46

// Minimum word size
#define THAI_MIN_WORD 2

// Minimum number of characters for two words
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)

// Builds the Thai engine: handles word and line breaks over Thai SA
// characters, using the adopted dictionary (owned; deleted in the dtor).
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}

ThaiBreakEngine::~ThaiBreakEngine() {
    // The dictionary was adopted in the constructor.
    delete fDictionary;
}

// Segments [rangeStart, rangeEnd) into Thai words, pushing the end position
// of each word onto foundBreaks.  Uses greedy longest-match with a
// THAI_LOOKAHEAD-deep backtracking search over PossibleWord candidates,
// plus heuristics for non-dictionary runs, combining marks, and the
// PAIYANNOI/MAIYAMOK suffix characters.  Returns the number of words found.
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                          int32_t rangeStart,
                                          int32_t rangeEnd,
                                          UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    // NOTE(review): wordsFound is unsigned but the function returns int32_t
    // and the final "wordsFound -= 1" can wrap to a large value if no word
    // was counted while a stale break sits at rangeEnd — confirm.
    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring of candidate caches, indexed modulo THAI_LOOKAHEAD.
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it withe the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                // Scan forward until a (end-char, begin-char) pair that is
                // actually followed by a dictionary word, or the range ends.
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() is called even when foundBreaks may be empty;
    // verify UStack::peeki's behavior on an empty stack before relying on this.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

/*
 ******************************************************************
 * LaoBreakEngine
 */

// How many words in a row are "good enough"?
#define LAO_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define LAO_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define LAO_PREFIX_COMBINE_THRESHOLD 3

// Minimum word size
#define LAO_MIN_WORD 2

// Minimum number of characters for two words
#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)

// Builds the Lao engine: handles word and line breaks over Lao SA
// characters, using the adopted dictionary (owned; deleted in the dtor).
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}

LaoBreakEngine::~LaoBreakEngine() {
    // The dictionary was adopted in the constructor.
    delete fDictionary;
}

// Segments [rangeStart, rangeEnd) into Lao words, pushing the end position
// of each word onto foundBreaks.  Same greedy longest-match with
// LAO_LOOKAHEAD-deep backtracking as the Thai engine, minus the suffix
// handling (not applicable to Lao).  Returns the number of words found.
int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    // NOTE(review): wordsFound is unsigned but the function returns int32_t;
    // the final "wordsFound -= 1" can wrap if no word was counted — confirm.
    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring of candidate caches, indexed modulo LAO_LOOKAHEAD.
    PossibleWord words[LAO_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it withe the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                // Scan forward until a (end-char, begin-char) pair that is
                // actually followed by a dictionary word, or the range ends.
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() is called even when foundBreaks may be empty;
    // verify UStack::peeki's behavior on an empty stack before relying on this.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

/*
 ******************************************************************
 * KhmerBreakEngine
 */

// How many words in a row are "good enough"?
#define KHMER_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define KHMER_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define KHMER_PREFIX_COMBINE_THRESHOLD 3

// Minimum word size
#define KHMER_MIN_WORD 2

// Minimum number of characters for two words
#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)

// Builds the Khmer engine: handles word and line breaks over Khmer SA
// characters, using the adopted dictionary (owned; deleted in the dtor).
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
}

KhmerBreakEngine::~KhmerBreakEngine() {
    // The dictionary was adopted in the constructor.
    delete fDictionary;
}

// Segments [rangeStart, rangeEnd) into Khmer words, pushing the end
// position of each word onto foundBreaks.  Same greedy longest-match with
// KHMER_LOOKAHEAD-deep backtracking as the Thai engine, without the
// Thai-specific suffix handling.  Returns the number of words found.
// NOTE(review): the tail of this function lies beyond the visible portion
// of the file.
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    // NOTE(review): wordsFound is unsigned but the function returns int32_t;
    // see the matching note on the Thai engine — confirm no wrap at the end.
    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring of candidate caches, indexed modulo KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                // Scan forward until a (end-char, begin-char) pair that is
                // actually followed by a dictionary word, or the range ends.
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
1.793 + int32_t currPos; 1.794 + while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 1.795 + utext_next32(text); 1.796 + wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 1.797 + } 1.798 + 1.799 + // Look ahead for possible suffixes if a dictionary word does not follow. 1.800 + // We do this in code rather than using a rule so that the heuristic 1.801 + // resynch continues to function. For example, one of the suffix characters 1.802 + // could be a typo in the middle of a word. 1.803 +// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 1.804 +// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 1.805 +// && fSuffixSet.contains(uc = utext_current32(text))) { 1.806 +// if (uc == KHMER_PAIYANNOI) { 1.807 +// if (!fSuffixSet.contains(utext_previous32(text))) { 1.808 +// // Skip over previous end and PAIYANNOI 1.809 +// utext_next32(text); 1.810 +// utext_next32(text); 1.811 +// wordLength += 1; // Add PAIYANNOI to word 1.812 +// uc = utext_current32(text); // Fetch next character 1.813 +// } 1.814 +// else { 1.815 +// // Restore prior position 1.816 +// utext_next32(text); 1.817 +// } 1.818 +// } 1.819 +// if (uc == KHMER_MAIYAMOK) { 1.820 +// if (utext_previous32(text) != KHMER_MAIYAMOK) { 1.821 +// // Skip over previous end and MAIYAMOK 1.822 +// utext_next32(text); 1.823 +// utext_next32(text); 1.824 +// wordLength += 1; // Add MAIYAMOK to word 1.825 +// } 1.826 +// else { 1.827 +// // Restore prior position 1.828 +// utext_next32(text); 1.829 +// } 1.830 +// } 1.831 +// } 1.832 +// else { 1.833 +// utext_setNativeIndex(text, current+wordLength); 1.834 +// } 1.835 +// } 1.836 + 1.837 + // Did we find a word on this iteration? 
If so, push it on the break stack 1.838 + if (wordLength > 0) { 1.839 + foundBreaks.push((current+wordLength), status); 1.840 + } 1.841 + } 1.842 + 1.843 + // Don't return a break for the end of the dictionary range if there is one there. 1.844 + if (foundBreaks.peeki() >= rangeEnd) { 1.845 + (void) foundBreaks.popi(); 1.846 + wordsFound -= 1; 1.847 + } 1.848 + 1.849 + return wordsFound; 1.850 +} 1.851 + 1.852 +#if !UCONFIG_NO_NORMALIZATION 1.853 +/* 1.854 + ****************************************************************** 1.855 + * CjkBreakEngine 1.856 + */ 1.857 +static const uint32_t kuint32max = 0xFFFFFFFF; 1.858 +CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status) 1.859 +: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) { 1.860 + // Korean dictionary only includes Hangul syllables 1.861 + fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); 1.862 + fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); 1.863 + fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); 1.864 + fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); 1.865 + 1.866 + if (U_SUCCESS(status)) { 1.867 + // handle Korean and Japanese/Chinese using different dictionaries 1.868 + if (type == kKorean) { 1.869 + setCharacters(fHangulWordSet); 1.870 + } else { //Chinese and Japanese 1.871 + UnicodeSet cjSet; 1.872 + cjSet.addAll(fHanWordSet); 1.873 + cjSet.addAll(fKatakanaWordSet); 1.874 + cjSet.addAll(fHiraganaWordSet); 1.875 + cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 1.876 + cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 1.877 + setCharacters(cjSet); 1.878 + } 1.879 + } 1.880 +} 1.881 + 1.882 +CjkBreakEngine::~CjkBreakEngine(){ 1.883 + delete fDictionary; 1.884 +} 1.885 + 1.886 +// The katakanaCost values below are based on the length frequencies of all 1.887 +// katakana 
phrases in the dictionary 1.888 +static const int kMaxKatakanaLength = 8; 1.889 +static const int kMaxKatakanaGroupLength = 20; 1.890 +static const uint32_t maxSnlp = 255; 1.891 + 1.892 +static inline uint32_t getKatakanaCost(int wordLength){ 1.893 + //TODO: fill array with actual values from dictionary! 1.894 + static const uint32_t katakanaCost[kMaxKatakanaLength + 1] 1.895 + = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; 1.896 + return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength]; 1.897 +} 1.898 + 1.899 +static inline bool isKatakana(uint16_t value) { 1.900 + return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) || 1.901 + (value >= 0xFF66u && value <= 0xFF9fu); 1.902 +} 1.903 + 1.904 +// A very simple helper class to streamline the buffer handling in 1.905 +// divideUpDictionaryRange. 1.906 +template<class T, size_t N> 1.907 +class AutoBuffer { 1.908 +public: 1.909 + AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { 1.910 + if (size > N) { 1.911 + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); 1.912 + capacity = size; 1.913 + } 1.914 + } 1.915 + ~AutoBuffer() { 1.916 + if (buffer != stackBuffer) 1.917 + uprv_free(buffer); 1.918 + } 1.919 + 1.920 + T* elems() { 1.921 + return buffer; 1.922 + } 1.923 + 1.924 + const T& operator[] (size_t i) const { 1.925 + return buffer[i]; 1.926 + } 1.927 + 1.928 + T& operator[] (size_t i) { 1.929 + return buffer[i]; 1.930 + } 1.931 + 1.932 + // resize without copy 1.933 + void resize(size_t size) { 1.934 + if (size <= capacity) 1.935 + return; 1.936 + if (buffer != stackBuffer) 1.937 + uprv_free(buffer); 1.938 + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); 1.939 + capacity = size; 1.940 + } 1.941 + 1.942 +private: 1.943 + T stackBuffer[N]; 1.944 + T* buffer; 1.945 + AutoBuffer(); 1.946 + size_t capacity; 1.947 +}; 1.948 + 1.949 + 1.950 +/* 1.951 + * @param text A UText representing the text 1.952 + * @param rangeStart The start of the range of 
dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    // Treat the input as already normalized if either the quick check or the
    // full check says so; this skips the normalization copy below.
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // charPositions[i] = UTF-16 offset in the raw input corresponding to the
    // i-th character of the (possibly normalized) text.
    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Identity mapping: walk the input one code point at a time.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        // Iterate the normalizer over the raw input so getIndex() yields the
        // raw-input offset after each normalized character.
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.
    // NOTE(review): `numChars` counts code points, but utext_setNativeIndex
    // below is given these counts as native (UTF-16) indices — the two differ
    // for supplementary characters. Looks like this assumes BMP-only input;
    // confirm (see the matching TODO in the Khmer engine above).

    // bestSnlp[i] is the snlp (scaled negative log probability) of the best
    // segmentation of the first i characters in the range to be matched.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Skip positions not reachable from any earlier segmentation.
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this charcter, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax each candidate word: update the best cost of the position the
        // word ends at.
        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Walk the prev[] chain backwards from the end of the range.
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
#endif

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */