intl/icu/source/common/dictbe.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/dictbe.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1145 @@
     1.4 +/**
     1.5 + *******************************************************************************
     1.6 + * Copyright (C) 2006-2013, International Business Machines Corporation
     1.7 + * and others. All Rights Reserved.
     1.8 + *******************************************************************************
     1.9 + */
    1.10 +
    1.11 +#include "unicode/utypes.h"
    1.12 +
    1.13 +#if !UCONFIG_NO_BREAK_ITERATION
    1.14 +
    1.15 +#include "brkeng.h"
    1.16 +#include "dictbe.h"
    1.17 +#include "unicode/uniset.h"
    1.18 +#include "unicode/chariter.h"
    1.19 +#include "unicode/ubrk.h"
    1.20 +#include "uvector.h"
    1.21 +#include "uassert.h"
    1.22 +#include "unicode/normlzr.h"
    1.23 +#include "cmemory.h"
    1.24 +#include "dictionarydata.h"
    1.25 +
    1.26 +U_NAMESPACE_BEGIN
    1.27 +
    1.28 +/*
    1.29 + ******************************************************************
    1.30 + */
    1.31 +
    1.32 +DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
    1.33 +    fTypes = breakTypes;
    1.34 +}
    1.35 +
// Destructor. The base engine holds only value members (fSet, fTypes), so
// there is nothing to release explicitly.
DictionaryBreakEngine::~DictionaryBreakEngine() {
}
    1.38 +
    1.39 +UBool
    1.40 +DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    1.41 +    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
    1.42 +            && fSet.contains(c));
    1.43 +}
    1.44 +
/**
 * Find breaks within a run of dictionary characters.
 *
 * Starting at the text's current index, scan (backwards if reverse is TRUE,
 * otherwise forwards) while characters belong to this engine's set, then hand
 * the resulting range to divideUpDictionaryRange() if this engine is
 * configured for the requested break type.
 *
 * @param text        The text to break; its current index is the scan origin
 *                    and it is repositioned during the scan.
 * @param startPos    Lower bound when scanning in reverse.
 * @param endPos      Upper bound when scanning forward.
 * @param reverse     If TRUE, scan backwards from the current position.
 * @param breakType   The requested break type (word, line, ...).
 * @param foundBreaks Output stack onto which break positions are pushed.
 * @return The number of breaks found, or 0 if the break type is not handled.
 */
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        // Walk backwards while characters remain in this engine's set.
        UBool   isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        // If the loop stopped on a non-dictionary character, the range begins
        // one position past it; otherwise at the stop position. Clamp to
        // startPos when the scan ran off the front.
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        // Walk forwards while characters remain in this engine's set.
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO:  recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    // Only divide the range if this engine handles the requested break type.
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }
    
    return result;
}
    1.84 +
/**
 * Define the set of characters this engine handles. A copy of the set is
 * stored and compacted, since handles()/findBreaks() query it repeatedly.
 */
void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}
    1.91 +
    1.92 +/*
    1.93 + ******************************************************************
    1.94 + * PossibleWord
    1.95 + */
    1.96 +
    1.97 +// Helper class for improving readability of the Thai/Lao/Khmer word break
    1.98 +// algorithm. The implementation is completely inline.
    1.99 +
   1.100 +// List size, limited by the maximum number of words in the dictionary
   1.101 +// that form a nested sequence.
   1.102 +#define POSSIBLE_WORD_LIST_MAX 20
   1.103 +
// Represents one position in the text together with the set of dictionary
// words that start there (shortest to longest), supporting backtracking
// through the shorter candidates during the lookahead search.
class PossibleWord {
private:
    // list of word candidate lengths, in increasing length order
    int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
    int32_t   count;      // Count of candidates
    int32_t   prefix;     // The longest match with a dictionary word
    int32_t   offset;     // Offset in the text of these candidates
    int       mark;       // The preferred candidate's offset
    int       current;    // The candidate we're currently looking at

public:
    PossibleWord();
    ~PossibleWord();
  
    // Fill the list of candidates if needed, select the longest, and return the number found
    int       candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
  
    // Select the currently marked candidate, point after it in the text, and invalidate self
    int32_t   acceptMarked( UText *text );
  
    // Back up from the current candidate to the next shorter one; return TRUE if that exists
    // and point the text after it
    UBool     backUp( UText *text );
  
    // Return the longest prefix this candidate location shares with a dictionary word
    int32_t   longestPrefix();
  
    // Mark the current candidate as the one we like
    void      markCurrent();
};
   1.134 +
   1.135 +inline
   1.136 +PossibleWord::PossibleWord() {
   1.137 +    offset = -1;
   1.138 +}
   1.139 +
// Destructor. No owned resources; nothing to do.
inline
PossibleWord::~PossibleWord() {
}
   1.143 +
// Query the dictionary for all words starting at the text's current position
// (re-querying only when the position differs from the cached offset), leave
// the text iterator positioned after the longest candidate, and select that
// longest candidate as both current and marked. Returns the candidate count.
inline int
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        // New position: refresh the cached candidate list from the dictionary.
        offset = start;
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        // Position after the longest candidate word.
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}
   1.163 +
   1.164 +inline int32_t
   1.165 +PossibleWord::acceptMarked( UText *text ) {
   1.166 +    utext_setNativeIndex(text, offset + lengths[mark]);
   1.167 +    return lengths[mark];
   1.168 +}
   1.169 +
   1.170 +inline UBool
   1.171 +PossibleWord::backUp( UText *text ) {
   1.172 +    if (current > 0) {
   1.173 +        utext_setNativeIndex(text, offset + lengths[--current]);
   1.174 +        return TRUE;
   1.175 +    }
   1.176 +    return FALSE;
   1.177 +}
   1.178 +
// Accessor: length of the longest dictionary prefix found at this position
// by the most recent call to candidates().
inline int32_t
PossibleWord::longestPrefix() {
    return prefix;
}
   1.183 +
// Remember the current candidate as the preferred one; a later call to
// acceptMarked() commits to it.
inline void
PossibleWord::markCurrent() {
    mark = current;
}
   1.188 +
   1.189 +/*
   1.190 + ******************************************************************
   1.191 + * ThaiBreakEngine
   1.192 + */
   1.193 +
   1.194 +// How many words in a row are "good enough"?
   1.195 +#define THAI_LOOKAHEAD 3
   1.196 +
   1.197 +// Will not combine a non-word with a preceding dictionary word longer than this
   1.198 +#define THAI_ROOT_COMBINE_THRESHOLD 3
   1.199 +
   1.200 +// Will not combine a non-word that shares at least this much prefix with a
   1.201 +// dictionary word, with a preceding word
   1.202 +#define THAI_PREFIX_COMBINE_THRESHOLD 3
   1.203 +
    1.204 +// Elision character
   1.205 +#define THAI_PAIYANNOI 0x0E2F
   1.206 +
   1.207 +// Repeat character
   1.208 +#define THAI_MAIYAMOK 0x0E46
   1.209 +
   1.210 +// Minimum word size
   1.211 +#define THAI_MIN_WORD 2
   1.212 +
   1.213 +// Minimum number of characters for two words
   1.214 +#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
   1.215 +
/**
 * Construct a break engine for Thai text, handling word and line breaks.
 *
 * @param adoptDictionary The dictionary to use; ownership is adopted and the
 *                        dictionary is deleted by the destructor.
 * @param status          ICU error code; set on failure to compile the
 *                        UnicodeSet patterns.
 */
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine claims: Thai script with line-break class SA.
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    // Space is treated like a mark: divideUpDictionaryRange() skips over
    // fMarkSet members rather than breaking before them.
    fMarkSet.add(0x0020);
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}
   1.240 +
// Destructor. The dictionary was adopted at construction, so delete it here.
ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}
   1.244 +
/**
 * Divide a run of Thai dictionary characters into words, pushing break
 * positions onto foundBreaks.
 *
 * Uses a greedy match with up to THAI_LOOKAHEAD words of lookahead and
 * backtracking through shorter candidates, plus heuristics for runs that are
 * not in the dictionary, for combining marks, and for the PAIYANNOI and
 * MAIYAMOK suffix characters.
 *
 * @param text        The text; its iterator position is modified freely.
 * @param rangeStart  Start (inclusive) of the dictionary-character range.
 * @param rangeEnd    End (exclusive) of the dictionary-character range.
 * @param foundBreaks Output stack; break positions are pushed in order.
 * @return The number of words found.
 */
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate sets, indexed by wordsFound % THAI_LOOKAHEAD.
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;
    
    utext_setNativeIndex(text, rangeStart);
    
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
        
        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                // NOTE(review): wordsMatched is re-initialized to 1 on every
                // iteration, so the "wordsMatched < 2" test below is always
                // true; the variable is effectively dead code.
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }
                    
                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }
                    
                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        
        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A plausible boundary: a word-final character followed by
                    // a word-initial one.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }
                
                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }
                
                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }
        
        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }
        
        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() is called without checking that foundBreaks is
    // non-empty; presumably callers guarantee a non-empty stack here — confirm.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
   1.425 +
   1.426 +/*
   1.427 + ******************************************************************
   1.428 + * LaoBreakEngine
   1.429 + */
   1.430 +
   1.431 +// How many words in a row are "good enough"?
   1.432 +#define LAO_LOOKAHEAD 3
   1.433 +
   1.434 +// Will not combine a non-word with a preceding dictionary word longer than this
   1.435 +#define LAO_ROOT_COMBINE_THRESHOLD 3
   1.436 +
   1.437 +// Will not combine a non-word that shares at least this much prefix with a
   1.438 +// dictionary word, with a preceding word
   1.439 +#define LAO_PREFIX_COMBINE_THRESHOLD 3
   1.440 +
   1.441 +// Minimum word size
   1.442 +#define LAO_MIN_WORD 2
   1.443 +
   1.444 +// Minimum number of characters for two words
   1.445 +#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)
   1.446 +
/**
 * Construct a break engine for Lao text, handling word and line breaks.
 *
 * @param adoptDictionary The dictionary to use; ownership is adopted and the
 *                        dictionary is deleted by the destructor.
 * @param status          ICU error code; set on failure to compile the
 *                        UnicodeSet patterns.
 */
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine claims: Lao script with line-break class SA.
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    // Space is treated like a mark: divideUpDictionaryRange() skips over
    // fMarkSet members rather than breaking before them.
    fMarkSet.add(0x0020);
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}
   1.468 +
// Destructor. The dictionary was adopted at construction, so delete it here.
LaoBreakEngine::~LaoBreakEngine() {
    delete fDictionary;
}
   1.472 +
/**
 * Divide a run of Lao dictionary characters into words, pushing break
 * positions onto foundBreaks.
 *
 * Same algorithm as the Thai engine — greedy match with up to LAO_LOOKAHEAD
 * words of lookahead, backtracking through shorter candidates, and a
 * resynchronization heuristic for non-dictionary runs — but without the
 * Thai-specific suffix handling.
 *
 * @param text        The text; its iterator position is modified freely.
 * @param rangeStart  Start (inclusive) of the dictionary-character range.
 * @param rangeEnd    End (exclusive) of the dictionary-character range.
 * @param foundBreaks Output stack; break positions are pushed in order.
 * @return The number of words found.
 */
int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate sets, indexed by wordsFound % LAO_LOOKAHEAD.
    PossibleWord words[LAO_LOOKAHEAD];
    UChar32 uc;
    
    utext_setNativeIndex(text, rangeStart);
    
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
        
        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                // NOTE(review): wordsMatched is re-initialized to 1 on every
                // iteration, so the "wordsMatched < 2" test below is always
                // true; the variable is effectively dead code.
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }
                    
                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }
                    
                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        
        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A plausible boundary: a word-final character followed by
                    // a word-initial one.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }
                
                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }
                
                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }
        
        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }
        
        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() is called without checking that foundBreaks is
    // non-empty; presumably callers guarantee a non-empty stack here — confirm.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
   1.617 +
   1.618 +/*
   1.619 + ******************************************************************
   1.620 + * KhmerBreakEngine
   1.621 + */
   1.622 +
   1.623 +// How many words in a row are "good enough"?
   1.624 +#define KHMER_LOOKAHEAD 3
   1.625 +
   1.626 +// Will not combine a non-word with a preceding dictionary word longer than this
   1.627 +#define KHMER_ROOT_COMBINE_THRESHOLD 3
   1.628 +
   1.629 +// Will not combine a non-word that shares at least this much prefix with a
   1.630 +// dictionary word, with a preceding word
   1.631 +#define KHMER_PREFIX_COMBINE_THRESHOLD 3
   1.632 +
   1.633 +// Minimum word size
   1.634 +#define KHMER_MIN_WORD 2
   1.635 +
   1.636 +// Minimum number of characters for two words
   1.637 +#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
   1.638 +
/**
 * Construct a break engine for Khmer text, handling word and line breaks.
 *
 * @param adoptDictionary The dictionary to use; ownership is adopted and the
 *                        dictionary is deleted by the destructor.
 * @param status          ICU error code; set on failure to compile the
 *                        UnicodeSet patterns.
 */
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine claims: Khmer script with line-break class SA.
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    // Space is treated like a mark: divideUpDictionaryRange() skips over
    // fMarkSet members rather than breaking before them.
    fMarkSet.add(0x0020);
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
}
   1.669 +
// The dictionary matcher was adopted in the constructor; release it here.
KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;
}
   1.673 +
/*
 * Divide up a run of Khmer dictionary characters, pushing word-end positions
 * onto foundBreaks.
 *
 * @param text       UText positioned over the text to segment.
 * @param rangeStart Native index of the first character of the range.
 * @param rangeEnd   Native index just past the last character of the range.
 * @param foundBreaks In/out stack of break positions found so far.
 * @return The number of words found in this range.
 */
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate words at the current position and up to two
    // lookahead positions; indexed with wordsFound % KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                // NOTE(review): wordsMatched is re-initialized on every outer
                // iteration, so the "wordsMatched < 2" guard below is always
                // true and markCurrent() runs for every candidate that has a
                // following word (the last such candidate stays marked).
                // Possibly this was meant to be declared outside the do-loop;
                // verify against upstream ICU before changing.
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Accept the marked candidate; the iterator ends up just past it.
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A plausible boundary: an end-of-word character followed
                    // by a begin-of-word character.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() is called without checking foundBreaks.size();
    // assumes peeki() on an empty stack returns a value < rangeEnd — TODO
    // confirm (the CJK engine below guards with size() == 0 first). Also note
    // wordsFound is unsigned, so the decrement would wrap if it were 0 here.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
   1.851 +
   1.852 +#if !UCONFIG_NO_NORMALIZATION
   1.853 +/*
   1.854 + ******************************************************************
   1.855 + * CjkBreakEngine
   1.856 + */
// Sentinel used in the dynamic-programming table below meaning
// "no segmentation reaching this position has been found yet".
static const uint32_t kuint32max = 0xFFFFFFFF;
/*
 * Construct a CJK break engine for word breaking (UBRK_WORD only).
 *
 * @param adoptDictionary Dictionary matcher; ownership is adopted here and
 *        the matcher is deleted in the destructor.
 * @param type kKorean selects the Hangul-only character set; any other value
 *        selects the combined Han/Katakana/Hiragana set.
 * @param status ICU in/out error code; each applyPattern is a no-op if status
 *        already indicates failure.
 */
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    // Includes the halfwidth voicing marks U+FF9E/U+FF9F.
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);
        }
    }
}
   1.881 +
// The dictionary matcher was adopted in the constructor; release it here.
CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}
   1.885 +
   1.886 +// The katakanaCost values below are based on the length frequencies of all
   1.887 +// katakana phrases in the dictionary
   1.888 +static const int kMaxKatakanaLength = 8;
   1.889 +static const int kMaxKatakanaGroupLength = 20;
   1.890 +static const uint32_t maxSnlp = 255;
   1.891 +
   1.892 +static inline uint32_t getKatakanaCost(int wordLength){
   1.893 +    //TODO: fill array with actual values from dictionary!
   1.894 +    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
   1.895 +                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
   1.896 +    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
   1.897 +}
   1.898 +
   1.899 +static inline bool isKatakana(uint16_t value) {
   1.900 +    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
   1.901 +            (value >= 0xFF66u && value <= 0xFF9fu);
   1.902 +}
   1.903 +
   1.904 +// A very simple helper class to streamline the buffer handling in
   1.905 +// divideUpDictionaryRange. 
   1.906 +template<class T, size_t N>
   1.907 +class AutoBuffer {
   1.908 +public:
   1.909 +    AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
   1.910 +        if (size > N) {
   1.911 +            buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
   1.912 +            capacity = size;
   1.913 +        }
   1.914 +    }
   1.915 +    ~AutoBuffer() {
   1.916 +        if (buffer != stackBuffer)
   1.917 +            uprv_free(buffer);
   1.918 +    }
   1.919 +
   1.920 +    T* elems() {
   1.921 +        return buffer;
   1.922 +    }
   1.923 +
   1.924 +    const T& operator[] (size_t i) const {
   1.925 +        return buffer[i];
   1.926 +    }
   1.927 +
   1.928 +    T& operator[] (size_t i) {
   1.929 +        return buffer[i];
   1.930 +    }
   1.931 +
   1.932 +    // resize without copy
   1.933 +    void resize(size_t size) {
   1.934 +        if (size <= capacity)
   1.935 +            return;
   1.936 +        if (buffer != stackBuffer)
   1.937 +            uprv_free(buffer);
   1.938 +        buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
   1.939 +        capacity = size;
   1.940 +    }
   1.941 +
   1.942 +private:
   1.943 +    T stackBuffer[N];
   1.944 +    T* buffer;
   1.945 +    AutoBuffer();
   1.946 +    size_t capacity;
   1.947 +};
   1.948 +
   1.949 +
   1.950 +/*
   1.951 + * @param text A UText representing the text
   1.952 + * @param rangeStart The start of the range of dictionary characters
   1.953 + * @param rangeEnd The end of the range of dictionary characters
   1.954 + * @param foundBreaks Output of C array of int32_t break positions, or 0
   1.955 + * @return The number of breaks found
   1.956 + */
   1.957 +int32_t 
   1.958 +CjkBreakEngine::divideUpDictionaryRange( UText *text,
   1.959 +        int32_t rangeStart,
   1.960 +        int32_t rangeEnd,
   1.961 +        UStack &foundBreaks ) const {
   1.962 +    if (rangeStart >= rangeEnd) {
   1.963 +        return 0;
   1.964 +    }
   1.965 +
   1.966 +    const size_t defaultInputLength = 80;
   1.967 +    size_t inputLength = rangeEnd - rangeStart;
   1.968 +    // TODO: Replace by UnicodeString.
   1.969 +    AutoBuffer<UChar, defaultInputLength> charString(inputLength);
   1.970 +
   1.971 +    // Normalize the input string and put it in normalizedText.
   1.972 +    // The map from the indices of the normalized input to the raw
   1.973 +    // input is kept in charPositions.
   1.974 +    UErrorCode status = U_ZERO_ERROR;
   1.975 +    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
   1.976 +    if (U_FAILURE(status)) {
   1.977 +        return 0;
   1.978 +    }
   1.979 +
   1.980 +    UnicodeString inputString(charString.elems(), inputLength);
   1.981 +    // TODO: Use Normalizer2.
   1.982 +    UNormalizationMode norm_mode = UNORM_NFKC;
   1.983 +    UBool isNormalized =
   1.984 +        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
   1.985 +        Normalizer::isNormalized(inputString, norm_mode, status);
   1.986 +
   1.987 +    // TODO: Replace by UVector32.
   1.988 +    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
   1.989 +    int numChars = 0;
   1.990 +    UText normalizedText = UTEXT_INITIALIZER;
   1.991 +    // Needs to be declared here because normalizedText holds onto its buffer.
   1.992 +    UnicodeString normalizedString;
   1.993 +    if (isNormalized) {
   1.994 +        int32_t index = 0;
   1.995 +        charPositions[0] = 0;
   1.996 +        while(index < inputString.length()) {
   1.997 +            index = inputString.moveIndex32(index, 1);
   1.998 +            charPositions[++numChars] = index;
   1.999 +        }
  1.1000 +        utext_openUnicodeString(&normalizedText, &inputString, &status);
  1.1001 +    }
  1.1002 +    else {
  1.1003 +        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
  1.1004 +        if (U_FAILURE(status)) {
  1.1005 +            return 0;
  1.1006 +        }
  1.1007 +        charPositions.resize(normalizedString.length() + 1);
  1.1008 +        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
  1.1009 +        int32_t index = 0;
  1.1010 +        charPositions[0] = 0;
  1.1011 +        while(index < normalizer.endIndex()){
  1.1012 +            /* UChar32 uc = */ normalizer.next();
  1.1013 +            charPositions[++numChars] = index = normalizer.getIndex();
  1.1014 +        }
  1.1015 +        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
  1.1016 +    }
  1.1017 +
  1.1018 +    if (U_FAILURE(status)) {
  1.1019 +        return 0;
  1.1020 +    }
  1.1021 +
  1.1022 +    // From this point on, all the indices refer to the indices of
  1.1023 +    // the normalized input string.
  1.1024 +
  1.1025 +    // bestSnlp[i] is the snlp of the best segmentation of the first i
  1.1026 +    // characters in the range to be matched.
  1.1027 +    // TODO: Replace by UVector32.
  1.1028 +    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
  1.1029 +    bestSnlp[0] = 0;
  1.1030 +    for(int i = 1; i <= numChars; i++) {
  1.1031 +        bestSnlp[i] = kuint32max;
  1.1032 +    }
  1.1033 +
  1.1034 +    // prev[i] is the index of the last CJK character in the previous word in 
  1.1035 +    // the best segmentation of the first i characters.
  1.1036 +    // TODO: Replace by UVector32.
  1.1037 +    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
  1.1038 +    for(int i = 0; i <= numChars; i++){
  1.1039 +        prev[i] = -1;
  1.1040 +    }
  1.1041 +
  1.1042 +    const size_t maxWordSize = 20;
  1.1043 +    // TODO: Replace both with UVector32.
  1.1044 +    AutoBuffer<int32_t, maxWordSize> values(numChars);
  1.1045 +    AutoBuffer<int32_t, maxWordSize> lengths(numChars);
  1.1046 +
  1.1047 +    // Dynamic programming to find the best segmentation.
  1.1048 +    bool is_prev_katakana = false;
  1.1049 +    for (int32_t i = 0; i < numChars; ++i) {
  1.1050 +        //utext_setNativeIndex(text, rangeStart + i);
  1.1051 +        utext_setNativeIndex(&normalizedText, i);
  1.1052 +        if (bestSnlp[i] == kuint32max)
  1.1053 +            continue;
  1.1054 +
  1.1055 +        int32_t count;
  1.1056 +        // limit maximum word length matched to size of current substring
  1.1057 +        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);
  1.1058 +
  1.1059 +        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
  1.1060 +
  1.1061 +        // if there are no single character matches found in the dictionary 
  1.1062 +        // starting with this charcter, treat character as a 1-character word 
  1.1063 +        // with the highest value possible, i.e. the least likely to occur.
  1.1064 +        // Exclude Korean characters from this treatment, as they should be left
  1.1065 +        // together by default.
  1.1066 +        if((count == 0 || lengths[0] != 1) &&
  1.1067 +                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
  1.1068 +            values[count] = maxSnlp;
  1.1069 +            lengths[count++] = 1;
  1.1070 +        }
  1.1071 +
  1.1072 +        for (int j = 0; j < count; j++) {
  1.1073 +            uint32_t newSnlp = bestSnlp[i] + values[j];
  1.1074 +            if (newSnlp < bestSnlp[lengths[j] + i]) {
  1.1075 +                bestSnlp[lengths[j] + i] = newSnlp;
  1.1076 +                prev[lengths[j] + i] = i;
  1.1077 +            }
  1.1078 +        }
  1.1079 +
  1.1080 +        // In Japanese,
  1.1081 +        // Katakana word in single character is pretty rare. So we apply
  1.1082 +        // the following heuristic to Katakana: any continuous run of Katakana
  1.1083 +        // characters is considered a candidate word with a default cost
  1.1084 +        // specified in the katakanaCost table according to its length.
  1.1085 +        //utext_setNativeIndex(text, rangeStart + i);
  1.1086 +        utext_setNativeIndex(&normalizedText, i);
  1.1087 +        bool is_katakana = isKatakana(utext_current32(&normalizedText));
  1.1088 +        if (!is_prev_katakana && is_katakana) {
  1.1089 +            int j = i + 1;
  1.1090 +            utext_next32(&normalizedText);
  1.1091 +            // Find the end of the continuous run of Katakana characters
  1.1092 +            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
  1.1093 +                    isKatakana(utext_current32(&normalizedText))) {
  1.1094 +                utext_next32(&normalizedText);
  1.1095 +                ++j;
  1.1096 +            }
  1.1097 +            if ((j - i) < kMaxKatakanaGroupLength) {
  1.1098 +                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
  1.1099 +                if (newSnlp < bestSnlp[j]) {
  1.1100 +                    bestSnlp[j] = newSnlp;
  1.1101 +                    prev[j] = i;
  1.1102 +                }
  1.1103 +            }
  1.1104 +        }
  1.1105 +        is_prev_katakana = is_katakana;
  1.1106 +    }
  1.1107 +
  1.1108 +    // Start pushing the optimal offset index into t_boundary (t for tentative).
  1.1109 +    // prev[numChars] is guaranteed to be meaningful.
  1.1110 +    // We'll first push in the reverse order, i.e.,
  1.1111 +    // t_boundary[0] = numChars, and afterwards do a swap.
  1.1112 +    // TODO: Replace by UVector32.
  1.1113 +    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
  1.1114 +
  1.1115 +    int numBreaks = 0;
  1.1116 +    // No segmentation found, set boundary to end of range
  1.1117 +    if (bestSnlp[numChars] == kuint32max) {
  1.1118 +        t_boundary[numBreaks++] = numChars;
  1.1119 +    } else {
  1.1120 +        for (int i = numChars; i > 0; i = prev[i]) {
  1.1121 +            t_boundary[numBreaks++] = i;
  1.1122 +        }
  1.1123 +        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
  1.1124 +    }
  1.1125 +
  1.1126 +    // Reverse offset index in t_boundary.
  1.1127 +    // Don't add a break for the start of the dictionary range if there is one
  1.1128 +    // there already.
  1.1129 +    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
  1.1130 +        t_boundary[numBreaks++] = 0;
  1.1131 +    }
  1.1132 +
  1.1133 +    // Now that we're done, convert positions in t_bdry[] (indices in 
  1.1134 +    // the normalized input string) back to indices in the raw input string
  1.1135 +    // while reversing t_bdry and pushing values to foundBreaks.
  1.1136 +    for (int i = numBreaks-1; i >= 0; i--) {
  1.1137 +        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
  1.1138 +    }
  1.1139 +
  1.1140 +    utext_close(&normalizedText);
  1.1141 +    return numBreaks;
  1.1142 +}
  1.1143 +#endif
  1.1144 +
  1.1145 +U_NAMESPACE_END
  1.1146 +
  1.1147 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  1.1148 +

mercurial