|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 1999-2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * Date Name Description |
|
7 * 11/17/99 aliu Creation. |
|
8 ********************************************************************** |
|
9 */ |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 |
|
13 #if !UCONFIG_NO_TRANSLITERATION |
|
14 |
|
15 #include "unicode/rep.h" |
|
16 #include "unicode/uniset.h" |
|
17 #include "rbt_pars.h" |
|
18 #include "rbt_data.h" |
|
19 #include "rbt_rule.h" |
|
20 #include "rbt.h" |
|
21 #include "umutex.h" |
|
22 |
|
23 U_NAMESPACE_BEGIN |
|
24 |
|
25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) |
|
26 |
|
27 static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER; |
|
28 static Replaceable *gLockedText = NULL; |
|
29 |
|
30 void RuleBasedTransliterator::_construct(const UnicodeString& rules, |
|
31 UTransDirection direction, |
|
32 UParseError& parseError, |
|
33 UErrorCode& status) { |
|
34 fData = 0; |
|
35 isDataOwned = TRUE; |
|
36 if (U_FAILURE(status)) { |
|
37 return; |
|
38 } |
|
39 |
|
40 TransliteratorParser parser(status); |
|
41 parser.parse(rules, direction, parseError, status); |
|
42 if (U_FAILURE(status)) { |
|
43 return; |
|
44 } |
|
45 |
|
46 if (parser.idBlockVector.size() != 0 || |
|
47 parser.compoundFilter != NULL || |
|
48 parser.dataVector.size() == 0) { |
|
49 status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT |
|
50 return; |
|
51 } |
|
52 |
|
53 fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); |
|
54 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
|
55 } |
|
56 |
|
57 /** |
|
58 * Constructs a new transliterator from the given rules. |
|
59 * @param id the id for the transliterator. |
|
60 * @param rules rules, separated by ';' |
|
61 * @param direction either FORWARD or REVERSE. |
|
62 * @param adoptedFilter the filter for this transliterator. |
|
63 * @param parseError Struct to recieve information on position |
|
64 * of error if an error is encountered |
|
65 * @param status Output param set to success/failure code. |
|
66 * @exception IllegalArgumentException if rules are malformed |
|
67 * or direction is invalid. |
|
68 */ |
|
69 RuleBasedTransliterator::RuleBasedTransliterator( |
|
70 const UnicodeString& id, |
|
71 const UnicodeString& rules, |
|
72 UTransDirection direction, |
|
73 UnicodeFilter* adoptedFilter, |
|
74 UParseError& parseError, |
|
75 UErrorCode& status) : |
|
76 Transliterator(id, adoptedFilter) { |
|
77 _construct(rules, direction,parseError,status); |
|
78 } |
|
79 |
|
80 /** |
|
81 * Constructs a new transliterator from the given rules. |
|
82 * @param id the id for the transliterator. |
|
83 * @param rules rules, separated by ';' |
|
84 * @param direction either FORWARD or REVERSE. |
|
85 * @param adoptedFilter the filter for this transliterator. |
|
86 * @param status Output param set to success/failure code. |
|
87 * @exception IllegalArgumentException if rules are malformed |
|
88 * or direction is invalid. |
|
89 */ |
|
90 /*RuleBasedTransliterator::RuleBasedTransliterator( |
|
91 const UnicodeString& id, |
|
92 const UnicodeString& rules, |
|
93 UTransDirection direction, |
|
94 UnicodeFilter* adoptedFilter, |
|
95 UErrorCode& status) : |
|
96 Transliterator(id, adoptedFilter) { |
|
97 UParseError parseError; |
|
98 _construct(rules, direction,parseError, status); |
|
99 }*/ |
|
100 |
|
101 /** |
|
102 * Convenience constructor with no filter. |
|
103 */ |
|
104 /*RuleBasedTransliterator::RuleBasedTransliterator( |
|
105 const UnicodeString& id, |
|
106 const UnicodeString& rules, |
|
107 UTransDirection direction, |
|
108 UErrorCode& status) : |
|
109 Transliterator(id, 0) { |
|
110 UParseError parseError; |
|
111 _construct(rules, direction,parseError, status); |
|
112 }*/ |
|
113 |
|
114 /** |
|
115 * Convenience constructor with no filter and FORWARD direction. |
|
116 */ |
|
117 /*RuleBasedTransliterator::RuleBasedTransliterator( |
|
118 const UnicodeString& id, |
|
119 const UnicodeString& rules, |
|
120 UErrorCode& status) : |
|
121 Transliterator(id, 0) { |
|
122 UParseError parseError; |
|
123 _construct(rules, UTRANS_FORWARD, parseError, status); |
|
124 }*/ |
|
125 |
|
126 /** |
|
127 * Convenience constructor with FORWARD direction. |
|
128 */ |
|
129 /*RuleBasedTransliterator::RuleBasedTransliterator( |
|
130 const UnicodeString& id, |
|
131 const UnicodeString& rules, |
|
132 UnicodeFilter* adoptedFilter, |
|
133 UErrorCode& status) : |
|
134 Transliterator(id, adoptedFilter) { |
|
135 UParseError parseError; |
|
136 _construct(rules, UTRANS_FORWARD,parseError, status); |
|
137 }*/ |
|
138 |
|
139 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
|
140 const TransliterationRuleData* theData, |
|
141 UnicodeFilter* adoptedFilter) : |
|
142 Transliterator(id, adoptedFilter), |
|
143 fData((TransliterationRuleData*)theData), // cast away const |
|
144 isDataOwned(FALSE) { |
|
145 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
|
146 } |
|
147 |
|
148 /** |
|
149 * Internal constructor. |
|
150 */ |
|
151 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
|
152 TransliterationRuleData* theData, |
|
153 UBool isDataAdopted) : |
|
154 Transliterator(id, 0), |
|
155 fData(theData), |
|
156 isDataOwned(isDataAdopted) { |
|
157 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
|
158 } |
|
159 |
|
160 /** |
|
161 * Copy constructor. |
|
162 */ |
|
163 RuleBasedTransliterator::RuleBasedTransliterator( |
|
164 const RuleBasedTransliterator& other) : |
|
165 Transliterator(other), fData(other.fData), |
|
166 isDataOwned(other.isDataOwned) { |
|
167 |
|
168 // The data object may or may not be owned. If it is not owned we |
|
169 // share it; it is invariant. If it is owned, it's still |
|
170 // invariant, but we need to copy it to prevent double-deletion. |
|
171 // If this becomes a performance issue (if people do a lot of RBT |
|
172 // copying -- unlikely) we can reference count the data object. |
|
173 |
|
174 // Only do a deep copy if this is owned data, that is, data that |
|
175 // will be later deleted. System transliterators contain |
|
176 // non-owned data. |
|
177 if (isDataOwned) { |
|
178 fData = new TransliterationRuleData(*other.fData); |
|
179 } |
|
180 } |
|
181 |
|
182 /** |
|
183 * Destructor. |
|
184 */ |
|
185 RuleBasedTransliterator::~RuleBasedTransliterator() { |
|
186 // Delete the data object only if we own it. |
|
187 if (isDataOwned) { |
|
188 delete fData; |
|
189 } |
|
190 } |
|
191 |
|
192 Transliterator* // Covariant return NOT ALLOWED (for portability) |
|
193 RuleBasedTransliterator::clone(void) const { |
|
194 return new RuleBasedTransliterator(*this); |
|
195 } |
|
196 |
|
197 /** |
|
198 * Implements {@link Transliterator#handleTransliterate}. |
|
199 */ |
|
200 void |
|
201 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
|
202 UBool isIncremental) const { |
|
203 /* We keep contextStart and contextLimit fixed the entire time, |
|
204 * relative to the text -- contextLimit may move numerically if |
|
205 * text is inserted or removed. The start offset moves toward |
|
206 * limit, with replacements happening under it. |
|
207 * |
|
208 * Example: rules 1. ab>x|y |
|
209 * 2. yc>z |
|
210 * |
|
211 * |eabcd begin - no match, advance start |
|
212 * e|abcd match rule 1 - change text & adjust start |
|
213 * ex|ycd match rule 2 - change text & adjust start |
|
214 * exz|d no match, advance start |
|
215 * exzd| done |
|
216 */ |
|
217 |
|
218 /* A rule like |
|
219 * a>b|a |
|
220 * creates an infinite loop. To prevent that, we put an arbitrary |
|
221 * limit on the number of iterations that we take, one that is |
|
222 * high enough that any reasonable rules are ok, but low enough to |
|
223 * prevent a server from hanging. The limit is 16 times the |
|
224 * number of characters n, unless n is so large that 16n exceeds a |
|
225 * uint32_t. |
|
226 */ |
|
227 uint32_t loopCount = 0; |
|
228 uint32_t loopLimit = index.limit - index.start; |
|
229 if (loopLimit >= 0x10000000) { |
|
230 loopLimit = 0xFFFFFFFF; |
|
231 } else { |
|
232 loopLimit <<= 4; |
|
233 } |
|
234 |
|
235 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent |
|
236 // operations must be prevented. |
|
237 // A Complication: compound transliterators can result in recursive entries to this |
|
238 // function, sometimes with different "This" objects, always with the same text. |
|
239 // Double-locking must be prevented in these cases. |
|
240 // |
|
241 |
|
242 // If the transliteration data is exclusively owned by this transliterator object, |
|
243 // we don't need to do any locking. No sharing between transliterators is possible, |
|
244 // so no concurrent access from multiple threads is possible. |
|
245 UBool lockedMutexAtThisLevel = FALSE; |
|
246 if (isDataOwned == FALSE) { |
|
247 // Test whether this request is operating on the same text string as some |
|
248 // some other transliteration that is still in progress and holding the |
|
249 // transliteration mutex. If so, do not lock the transliteration |
|
250 // mutex again. |
|
251 // TODO(andy): Need a better scheme for handling this. |
|
252 UBool needToLock; |
|
253 umtx_lock(NULL); |
|
254 needToLock = (&text != gLockedText); |
|
255 umtx_unlock(NULL); |
|
256 if (needToLock) { |
|
257 umtx_lock(&transliteratorDataMutex); |
|
258 gLockedText = &text; |
|
259 lockedMutexAtThisLevel = TRUE; |
|
260 } |
|
261 } |
|
262 |
|
263 // Check to make sure we don't dereference a null pointer. |
|
264 if (fData != NULL) { |
|
265 while (index.start < index.limit && |
|
266 loopCount <= loopLimit && |
|
267 fData->ruleSet.transliterate(text, index, isIncremental)) { |
|
268 ++loopCount; |
|
269 } |
|
270 } |
|
271 if (lockedMutexAtThisLevel) { |
|
272 gLockedText = NULL; |
|
273 umtx_unlock(&transliteratorDataMutex); |
|
274 } |
|
275 } |
|
276 |
|
277 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, |
|
278 UBool escapeUnprintable) const { |
|
279 return fData->ruleSet.toRules(rulesSource, escapeUnprintable); |
|
280 } |
|
281 |
|
282 /** |
|
283 * Implement Transliterator framework |
|
284 */ |
|
285 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
|
286 fData->ruleSet.getSourceTargetSet(result, FALSE); |
|
287 } |
|
288 |
|
289 /** |
|
290 * Override Transliterator framework |
|
291 */ |
|
292 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { |
|
293 return fData->ruleSet.getSourceTargetSet(result, TRUE); |
|
294 } |
|
295 |
|
296 U_NAMESPACE_END |
|
297 |
|
298 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |