Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2008-2010, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * Date Name Description |
michael@0 | 7 | * 05/11/2008 Andy Heninger Port from Java |
michael@0 | 8 | ********************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "unicode/utypes.h" |
michael@0 | 12 | |
michael@0 | 13 | #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/unifilt.h" |
michael@0 | 16 | #include "unicode/uchar.h" |
michael@0 | 17 | #include "unicode/uniset.h" |
michael@0 | 18 | #include "unicode/brkiter.h" |
michael@0 | 19 | #include "brktrans.h" |
michael@0 | 20 | #include "unicode/uchar.h" |
michael@0 | 21 | #include "cmemory.h" |
michael@0 | 22 | #include "uprops.h" |
michael@0 | 23 | #include "uinvchar.h" |
michael@0 | 24 | #include "util.h" |
michael@0 | 25 | #include "uvectr32.h" |
michael@0 | 26 | |
michael@0 | 27 | U_NAMESPACE_BEGIN |
michael@0 | 28 | |
michael@0 | 29 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) |
michael@0 | 30 | |
michael@0 | 31 | static const UChar SPACE = 32; // U+0020 SPACE, the initial value of fInsertion |
michael@0 | 32 | |
michael@0 | 33 | |
michael@0 | 34 | /** |
michael@0 | 35 |  * Constructs a transliterator with the ID "Any-BreakInternal" whose |
michael@0 | 36 |  * insertion string starts out as a single space. |
michael@0 | 37 |  */ |
michael@0 | 38 | BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) |
michael@0 | 39 |     : Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), |
michael@0 | 40 |       fInsertion(SPACE) { |
michael@0 | 41 |     UErrorCode ec = U_ZERO_ERROR;  // not reported: this constructor has no status parameter |
michael@0 | 42 |     boundaries = new UVector32(ec); |
michael@0 | 43 |     bi = NULL;  // created on first use by getBreakIterator() |
michael@0 | 44 | } |
michael@0 | 45 | |
michael@0 | 46 | |
michael@0 | 47 | /** |
michael@0 | 48 |  * Destructor. Frees the boundary vector and the break iterator. |
michael@0 | 49 |  */ |
michael@0 | 50 | BreakTransliterator::~BreakTransliterator() { |
michael@0 | 51 |     delete boundaries; |
michael@0 | 52 |     boundaries = NULL; |
michael@0 | 53 |     delete bi; |
michael@0 | 54 |     bi = NULL; |
michael@0 | 55 | } |
michael@0 | 56 | |
michael@0 | 57 | /** |
michael@0 | 58 |  * Copy constructor. Clones o's break iterator when it has one and copies |
michael@0 | 59 |  * its insertion string; the boundary vector always starts out empty. |
michael@0 | 60 |  */ |
michael@0 | 61 | BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) |
michael@0 | 62 |     : Transliterator(o) { |
michael@0 | 63 |     UErrorCode ec = U_ZERO_ERROR;  // not reported: no status parameter here |
michael@0 | 64 |     boundaries = new UVector32(ec); |
michael@0 | 65 |     fInsertion = o.fInsertion; |
michael@0 | 66 | |
michael@0 | 67 |     // Each instance deletes its own iterator, so take a clone, not the pointer. |
michael@0 | 68 |     bi = (o.bi != NULL) ? o.bi->clone() : NULL; |
michael@0 | 69 | } |
michael@0 | 70 | |
michael@0 | 71 | |
michael@0 | 72 | /** Transliterator API. |
michael@0 | 73 |  *  Returns a new object made by the copy constructor. */ |
michael@0 | 74 | Transliterator* BreakTransliterator::clone(void) const { |
michael@0 | 75 |     BreakTransliterator* duplicate = new BreakTransliterator(*this); |
michael@0 | 76 |     return duplicate; |
michael@0 | 77 | } |
michael@0 | 78 | |
michael@0 | 79 | /** |
michael@0 | 80 |  * Implements {@link Transliterator#handleTransliterate}. |
michael@0 | 81 |  * |
michael@0 | 82 |  * Runs the word break iterator over the whole text and inserts fInsertion at |
michael@0 | 83 |  * every boundary before offsets.limit that has a letter or mark (general |
michael@0 | 84 |  * category L* or M*) on both sides. offsets.limit and offsets.contextLimit grow |
michael@0 | 85 |  * by the total inserted length. offsets.start becomes offsets.limit, or, when |
michael@0 | 86 |  * isIncremental is set, the last boundary plus that length (0 if none was found). |
michael@0 | 87 |  * |
michael@0 | 88 |  * If the break iterator or the boundary vector could not be created, the text |
michael@0 | 89 |  * and offsets are left untouched rather than dereferencing a NULL pointer. |
michael@0 | 90 |  */ |
michael@0 | 91 | void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, |
michael@0 | 92 |                                               UBool isIncremental ) const { |
michael@0 | 93 |     // The iterator is created lazily and cached, which needs a non-const this. |
michael@0 | 94 |     BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); |
michael@0 | 95 |     BreakIterator *wordBreaks = nonConstThis->getBreakIterator(); |
michael@0 | 96 |     if (wordBreaks == NULL || boundaries == NULL) return; |
michael@0 | 97 | |
michael@0 | 98 |     UErrorCode status = U_ZERO_ERROR; |
michael@0 | 99 |     boundaries->removeAllElements(); |
michael@0 | 100 |     UnicodeString sText = replaceableAsString(text); |
michael@0 | 101 |     wordBreaks->setText(sText); |
michael@0 | 102 |     wordBreaks->preceding(offsets.start); |
michael@0 | 103 | |
michael@0 | 104 |     // To make things much easier, we will stack the boundaries, and then insert at the end. |
michael@0 | 105 |     // Generally, we won't need too many, since we will be filtered. |
michael@0 | 106 |     const uint32_t letterOrMark = U_GC_L_MASK | U_GC_M_MASK; |
michael@0 | 107 |     int32_t boundary; |
michael@0 | 108 |     for (boundary = wordBreaks->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = wordBreaks->next()) { |
michael@0 | 109 |         if (boundary == 0) continue; |
michael@0 | 110 |         // HACK: keep a boundary only if the code points before and after it are letters or marks. |
michael@0 | 111 |         UChar32 cp = sText.char32At(boundary-1); |
michael@0 | 112 |         if ((U_MASK(u_charType(cp)) & letterOrMark) == 0) continue; |
michael@0 | 113 |         cp = sText.char32At(boundary); |
michael@0 | 114 |         if ((U_MASK(u_charType(cp)) & letterOrMark) == 0) continue; |
michael@0 | 115 |         boundaries->addElement(boundary, status); |
michael@0 | 116 |     } |
michael@0 | 117 | |
michael@0 | 118 |     int32_t delta = 0; |
michael@0 | 119 |     int32_t lastBoundary = 0; |
michael@0 | 120 |     if (boundaries->size() != 0) { // if we found something, adjust |
michael@0 | 121 |         delta = boundaries->size() * fInsertion.length(); |
michael@0 | 122 |         lastBoundary = boundaries->lastElementi(); |
michael@0 | 123 |         // we do this from the end backwards, so that we don't have to keep updating. |
michael@0 | 124 |         while (boundaries->size() > 0) { |
michael@0 | 125 |             boundary = boundaries->popi(); |
michael@0 | 126 |             text.handleReplaceBetween(boundary, boundary, fInsertion); |
michael@0 | 127 |         } |
michael@0 | 128 |     } |
michael@0 | 129 | |
michael@0 | 130 |     // Now fix up the return values |
michael@0 | 131 |     offsets.contextLimit += delta; |
michael@0 | 132 |     offsets.limit += delta; |
michael@0 | 133 |     offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; |
michael@0 | 134 | |
michael@0 | 135 |     // TODO: do something with U_FAILURE(status); |
michael@0 | 136 |     // (need to look at transliterators overall, not just here.) |
michael@0 | 137 | } |
michael@0 | 138 | |
michael@0 | 139 | // |
michael@0 | 140 | //  getInsertion()   Returns a reference to the string inserted at each break. |
michael@0 | 141 | // |
michael@0 | 142 | const UnicodeString &BreakTransliterator::getInsertion() const { |
michael@0 | 143 |     return this->fInsertion; |
michael@0 | 144 | } |
michael@0 | 145 | |
michael@0 | 146 | // |
michael@0 | 147 | // setInsertion() |
michael@0 | 148 | // |
michael@0 | 149 | void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
michael@0 | 150 | this->fInsertion = insertion; |
michael@0 | 151 | } |
michael@0 | 152 | |
michael@0 | 153 | // |
michael@0 | 154 | //  getBreakIterator   Returns the cached word break iterator, creating it |
michael@0 | 155 | //                     on first use. Copied from Java, probably |
michael@0 | 156 | //                     better to just create it in the constructor. |
michael@0 | 157 | // |
michael@0 | 158 | BreakIterator *BreakTransliterator::getBreakIterator() { |
michael@0 | 159 |     if (bi != NULL) { |
michael@0 | 160 |         return bi;  // already created by an earlier call |
michael@0 | 161 |     } |
michael@0 | 162 |     // Note: Thai breaking behavior is universal, it is not tied to the Thai locale. |
michael@0 | 163 |     UErrorCode ec = U_ZERO_ERROR;  // the outcome is not reported to the caller |
michael@0 | 164 |     bi = BreakIterator::createWordInstance(Locale::getEnglish(), ec); |
michael@0 | 165 |     return bi; |
michael@0 | 166 | } |
michael@0 | 167 | |
michael@0 | 168 | // |
michael@0 | 169 | //  replaceableAsString   Hack to let break iterators work |
michael@0 | 170 | //                        on the replaceable text from transliterators. |
michael@0 | 171 | //                        Returns r's contents as a UnicodeString: a direct |
michael@0 | 172 | //                        copy when r is itself a UnicodeString, otherwise |
michael@0 | 173 | //                        whatever extractBetween(0, r.length()) yields. |
michael@0 | 174 | // |
michael@0 | 175 | UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { |
michael@0 | 176 |     const UnicodeString *asString = dynamic_cast<const UnicodeString *>(&r); |
michael@0 | 177 |     if (asString != NULL) { |
michael@0 | 178 |         return *asString; |
michael@0 | 179 |     } |
michael@0 | 180 |     // Any other Replaceable: pull the whole content out through its API. |
michael@0 | 181 |     UnicodeString extracted; |
michael@0 | 182 |     r.extractBetween(0, r.length(), extracted); |
michael@0 | 183 |     return extracted; |
michael@0 | 184 | } |
michael@0 | 185 | |
michael@0 | 186 | U_NAMESPACE_END |
michael@0 | 187 | |
michael@0 | 188 | #endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */ |