|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2008-2010, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * Date Name Description |
|
7 * 05/11/2008 Andy Heninger Port from Java |
|
8 ********************************************************************** |
|
9 */ |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 |
|
13 #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION |
|
14 |
|
15 #include "unicode/unifilt.h" |
|
16 #include "unicode/uchar.h" |
|
17 #include "unicode/uniset.h" |
|
18 #include "unicode/brkiter.h" |
|
19 #include "brktrans.h" |
|
20 #include "unicode/uchar.h" |
|
21 #include "cmemory.h" |
|
22 #include "uprops.h" |
|
23 #include "uinvchar.h" |
|
24 #include "util.h" |
|
25 #include "uvectr32.h" |
|
26 |
|
27 U_NAMESPACE_BEGIN |
|
28 |
|
29 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) |
|
30 |
|
31 static const UChar SPACE = 32; // ' ' |
|
32 |
|
33 |
|
34 /** |
|
35 * Constructs a transliterator with the default delimiters '{' and |
|
36 * '}'. |
|
37 */ |
|
38 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : |
|
39 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), |
|
40 fInsertion(SPACE) { |
|
41 bi = NULL; |
|
42 UErrorCode status = U_ZERO_ERROR; |
|
43 boundaries = new UVector32(status); |
|
44 } |
|
45 |
|
46 |
|
47 /** |
|
48 * Destructor. |
|
49 */ |
|
50 BreakTransliterator::~BreakTransliterator() { |
|
51 delete bi; |
|
52 bi = NULL; |
|
53 delete boundaries; |
|
54 boundaries = NULL; |
|
55 } |
|
56 |
|
57 /** |
|
58 * Copy constructor. |
|
59 */ |
|
60 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : |
|
61 Transliterator(o) { |
|
62 bi = NULL; |
|
63 if (o.bi != NULL) { |
|
64 bi = o.bi->clone(); |
|
65 } |
|
66 fInsertion = o.fInsertion; |
|
67 UErrorCode status = U_ZERO_ERROR; |
|
68 boundaries = new UVector32(status); |
|
69 } |
|
70 |
|
71 |
|
72 /** |
|
73 * Transliterator API. |
|
74 */ |
|
75 Transliterator* BreakTransliterator::clone(void) const { |
|
76 return new BreakTransliterator(*this); |
|
77 } |
|
78 |
|
79 /** |
|
80 * Implements {@link Transliterator#handleTransliterate}. |
|
81 */ |
|
82 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, |
|
83 UBool isIncremental ) const { |
|
84 |
|
85 UErrorCode status = U_ZERO_ERROR; |
|
86 boundaries->removeAllElements(); |
|
87 BreakTransliterator *nonConstThis = (BreakTransliterator *)this; |
|
88 nonConstThis->getBreakIterator(); // Lazy-create it if necessary |
|
89 UnicodeString sText = replaceableAsString(text); |
|
90 bi->setText(sText); |
|
91 bi->preceding(offsets.start); |
|
92 |
|
93 // To make things much easier, we will stack the boundaries, and then insert at the end. |
|
94 // generally, we won't need too many, since we will be filtered. |
|
95 |
|
96 int32_t boundary; |
|
97 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { |
|
98 if (boundary == 0) continue; |
|
99 // HACK: Check to see that preceeding item was a letter |
|
100 |
|
101 UChar32 cp = sText.char32At(boundary-1); |
|
102 int type = u_charType(cp); |
|
103 //System.out.println(Integer.toString(cp,16) + " (before): " + type); |
|
104 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
|
105 |
|
106 cp = sText.char32At(boundary); |
|
107 type = u_charType(cp); |
|
108 //System.out.println(Integer.toString(cp,16) + " (after): " + type); |
|
109 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
|
110 |
|
111 boundaries->addElement(boundary, status); |
|
112 // printf("Boundary at %d\n", boundary); |
|
113 } |
|
114 |
|
115 int delta = 0; |
|
116 int lastBoundary = 0; |
|
117 |
|
118 if (boundaries->size() != 0) { // if we found something, adjust |
|
119 delta = boundaries->size() * fInsertion.length(); |
|
120 lastBoundary = boundaries->lastElementi(); |
|
121 |
|
122 // we do this from the end backwards, so that we don't have to keep updating. |
|
123 |
|
124 while (boundaries->size() > 0) { |
|
125 boundary = boundaries->popi(); |
|
126 text.handleReplaceBetween(boundary, boundary, fInsertion); |
|
127 } |
|
128 } |
|
129 |
|
130 // Now fix up the return values |
|
131 offsets.contextLimit += delta; |
|
132 offsets.limit += delta; |
|
133 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; |
|
134 |
|
135 // TODO: do something with U_FAILURE(status); |
|
136 // (need to look at transliterators overall, not just here.) |
|
137 } |
|
138 |
|
139 // |
|
140 // getInsertion() |
|
141 // |
|
142 const UnicodeString &BreakTransliterator::getInsertion() const { |
|
143 return fInsertion; |
|
144 } |
|
145 |
|
146 // |
|
147 // setInsertion() |
|
148 // |
|
149 void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
|
150 this->fInsertion = insertion; |
|
151 } |
|
152 |
|
153 // |
|
154 // getBreakIterator Lazily create the break iterator if it does |
|
155 // not already exist. Copied from Java, probably |
|
156 // better to just create it in the constructor. |
|
157 // |
|
158 BreakIterator *BreakTransliterator::getBreakIterator() { |
|
159 UErrorCode status = U_ZERO_ERROR; |
|
160 if (bi == NULL) { |
|
161 // Note: Thai breaking behavior is universal, it is not |
|
162 // tied to the Thai locale. |
|
163 bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); |
|
164 } |
|
165 return bi; |
|
166 } |
|
167 |
|
168 // |
|
169 // replaceableAsString Hack to let break iterators work |
|
170 // on the replaceable text from transliterators. |
|
171 // In practice, the only real Replaceable type that we |
|
172 // will be seeing is UnicodeString, so this function |
|
173 // will normally be efficient. |
|
174 // |
|
175 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { |
|
176 UnicodeString s; |
|
177 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); |
|
178 if (rs != NULL) { |
|
179 s = *rs; |
|
180 } else { |
|
181 r.extractBetween(0, r.length(), s); |
|
182 } |
|
183 return s; |
|
184 } |
|
185 |
|
186 U_NAMESPACE_END |
|
187 |
|
188 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |