1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/brktrans.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,188 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2008-2010, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 05/11/2008 Andy Heninger Port from Java 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION 1.17 + 1.18 +#include "unicode/unifilt.h" 1.19 +#include "unicode/uchar.h" 1.20 +#include "unicode/uniset.h" 1.21 +#include "unicode/brkiter.h" 1.22 +#include "brktrans.h" 1.23 +#include "unicode/uchar.h" 1.24 +#include "cmemory.h" 1.25 +#include "uprops.h" 1.26 +#include "uinvchar.h" 1.27 +#include "util.h" 1.28 +#include "uvectr32.h" 1.29 + 1.30 +U_NAMESPACE_BEGIN 1.31 + 1.32 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) 1.33 + 1.34 +static const UChar SPACE = 32; // ' ' 1.35 + 1.36 + 1.37 +/** 1.38 + * Constructs a transliterator with the default delimiters '{' and 1.39 + * '}'. 1.40 + */ 1.41 +BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : 1.42 + Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), 1.43 + fInsertion(SPACE) { 1.44 + bi = NULL; 1.45 + UErrorCode status = U_ZERO_ERROR; 1.46 + boundaries = new UVector32(status); 1.47 + } 1.48 + 1.49 + 1.50 +/** 1.51 + * Destructor. 1.52 + */ 1.53 +BreakTransliterator::~BreakTransliterator() { 1.54 + delete bi; 1.55 + bi = NULL; 1.56 + delete boundaries; 1.57 + boundaries = NULL; 1.58 +} 1.59 + 1.60 +/** 1.61 + * Copy constructor. 1.62 + */ 1.63 +BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : 1.64 + Transliterator(o) { 1.65 + bi = NULL; 1.66 + if (o.bi != NULL) { 1.67 + bi = o.bi->clone(); 1.68 + } 1.69 + fInsertion = o.fInsertion; 1.70 + UErrorCode status = U_ZERO_ERROR; 1.71 + boundaries = new UVector32(status); 1.72 + } 1.73 + 1.74 + 1.75 +/** 1.76 + * Transliterator API. 1.77 + */ 1.78 +Transliterator* BreakTransliterator::clone(void) const { 1.79 + return new BreakTransliterator(*this); 1.80 +} 1.81 + 1.82 +/** 1.83 + * Implements {@link Transliterator#handleTransliterate}. 1.84 + */ 1.85 +void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 1.86 + UBool isIncremental ) const { 1.87 + 1.88 + UErrorCode status = U_ZERO_ERROR; 1.89 + boundaries->removeAllElements(); 1.90 + BreakTransliterator *nonConstThis = (BreakTransliterator *)this; 1.91 + nonConstThis->getBreakIterator(); // Lazy-create it if necessary 1.92 + UnicodeString sText = replaceableAsString(text); 1.93 + bi->setText(sText); 1.94 + bi->preceding(offsets.start); 1.95 + 1.96 + // To make things much easier, we will stack the boundaries, and then insert at the end. 1.97 + // generally, we won't need too many, since we will be filtered. 1.98 + 1.99 + int32_t boundary; 1.100 + for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { 1.101 + if (boundary == 0) continue; 1.102 + // HACK: Check to see that preceeding item was a letter 1.103 + 1.104 + UChar32 cp = sText.char32At(boundary-1); 1.105 + int type = u_charType(cp); 1.106 + //System.out.println(Integer.toString(cp,16) + " (before): " + type); 1.107 + if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 1.108 + 1.109 + cp = sText.char32At(boundary); 1.110 + type = u_charType(cp); 1.111 + //System.out.println(Integer.toString(cp,16) + " (after): " + type); 1.112 + if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 1.113 + 1.114 + boundaries->addElement(boundary, status); 1.115 + // printf("Boundary at %d\n", boundary); 1.116 + } 1.117 + 1.118 + int delta = 0; 1.119 + int lastBoundary = 0; 1.120 + 1.121 + if (boundaries->size() != 0) { // if we found something, adjust 1.122 + delta = boundaries->size() * fInsertion.length(); 1.123 + lastBoundary = boundaries->lastElementi(); 1.124 + 1.125 + // we do this from the end backwards, so that we don't have to keep updating. 1.126 + 1.127 + while (boundaries->size() > 0) { 1.128 + boundary = boundaries->popi(); 1.129 + text.handleReplaceBetween(boundary, boundary, fInsertion); 1.130 + } 1.131 + } 1.132 + 1.133 + // Now fix up the return values 1.134 + offsets.contextLimit += delta; 1.135 + offsets.limit += delta; 1.136 + offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; 1.137 + 1.138 + // TODO: do something with U_FAILURE(status); 1.139 + // (need to look at transliterators overall, not just here.) 1.140 +} 1.141 + 1.142 +// 1.143 +// getInsertion() 1.144 +// 1.145 +const UnicodeString &BreakTransliterator::getInsertion() const { 1.146 + return fInsertion; 1.147 +} 1.148 + 1.149 +// 1.150 +// setInsertion() 1.151 +// 1.152 +void BreakTransliterator::setInsertion(const UnicodeString &insertion) { 1.153 + this->fInsertion = insertion; 1.154 +} 1.155 + 1.156 +// 1.157 +// getBreakIterator Lazily create the break iterator if it does 1.158 +// not already exist. Copied from Java, probably 1.159 +// better to just create it in the constructor. 1.160 +// 1.161 +BreakIterator *BreakTransliterator::getBreakIterator() { 1.162 + UErrorCode status = U_ZERO_ERROR; 1.163 + if (bi == NULL) { 1.164 + // Note: Thai breaking behavior is universal, it is not 1.165 + // tied to the Thai locale. 1.166 + bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); 1.167 + } 1.168 + return bi; 1.169 +} 1.170 + 1.171 +// 1.172 +// replaceableAsString Hack to let break iterators work 1.173 +// on the replaceable text from transliterators. 1.174 +// In practice, the only real Replaceable type that we 1.175 +// will be seeing is UnicodeString, so this function 1.176 +// will normally be efficient. 1.177 +// 1.178 +UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { 1.179 + UnicodeString s; 1.180 + UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); 1.181 + if (rs != NULL) { 1.182 + s = *rs; 1.183 + } else { 1.184 + r.extractBetween(0, r.length(), s); 1.185 + } 1.186 + return s; 1.187 +} 1.188 + 1.189 +U_NAMESPACE_END 1.190 + 1.191 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */