Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 1999-2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * Date Name Description |
michael@0 | 7 | * 11/17/99 aliu Creation. |
michael@0 | 8 | ********************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "unicode/utypes.h" |
michael@0 | 12 | |
michael@0 | 13 | #if !UCONFIG_NO_TRANSLITERATION |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/rep.h" |
michael@0 | 16 | #include "unicode/uniset.h" |
michael@0 | 17 | #include "rbt_pars.h" |
michael@0 | 18 | #include "rbt_data.h" |
michael@0 | 19 | #include "rbt_rule.h" |
michael@0 | 20 | #include "rbt.h" |
michael@0 | 21 | #include "umutex.h" |
michael@0 | 22 | |
michael@0 | 23 | U_NAMESPACE_BEGIN |
michael@0 | 24 | |
michael@0 | 25 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) |
michael@0 | 26 | |
michael@0 | 27 | static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER; |
michael@0 | 28 | static Replaceable *gLockedText = NULL; |
michael@0 | 29 | |
michael@0 | 30 | void RuleBasedTransliterator::_construct(const UnicodeString& rules, |
michael@0 | 31 | UTransDirection direction, |
michael@0 | 32 | UParseError& parseError, |
michael@0 | 33 | UErrorCode& status) { |
michael@0 | 34 | fData = 0; |
michael@0 | 35 | isDataOwned = TRUE; |
michael@0 | 36 | if (U_FAILURE(status)) { |
michael@0 | 37 | return; |
michael@0 | 38 | } |
michael@0 | 39 | |
michael@0 | 40 | TransliteratorParser parser(status); |
michael@0 | 41 | parser.parse(rules, direction, parseError, status); |
michael@0 | 42 | if (U_FAILURE(status)) { |
michael@0 | 43 | return; |
michael@0 | 44 | } |
michael@0 | 45 | |
michael@0 | 46 | if (parser.idBlockVector.size() != 0 || |
michael@0 | 47 | parser.compoundFilter != NULL || |
michael@0 | 48 | parser.dataVector.size() == 0) { |
michael@0 | 49 | status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT |
michael@0 | 50 | return; |
michael@0 | 51 | } |
michael@0 | 52 | |
michael@0 | 53 | fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); |
michael@0 | 54 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
michael@0 | 55 | } |
michael@0 | 56 | |
michael@0 | 57 | /** |
michael@0 | 58 | * Constructs a new transliterator from the given rules. |
michael@0 | 59 | * @param id the id for the transliterator. |
michael@0 | 60 | * @param rules rules, separated by ';' |
michael@0 | 61 | * @param direction either FORWARD or REVERSE. |
michael@0 | 62 | * @param adoptedFilter the filter for this transliterator. |
michael@0 | 63 | * @param parseError Struct to recieve information on position |
michael@0 | 64 | * of error if an error is encountered |
michael@0 | 65 | * @param status Output param set to success/failure code. |
michael@0 | 66 | * @exception IllegalArgumentException if rules are malformed |
michael@0 | 67 | * or direction is invalid. |
michael@0 | 68 | */ |
michael@0 | 69 | RuleBasedTransliterator::RuleBasedTransliterator( |
michael@0 | 70 | const UnicodeString& id, |
michael@0 | 71 | const UnicodeString& rules, |
michael@0 | 72 | UTransDirection direction, |
michael@0 | 73 | UnicodeFilter* adoptedFilter, |
michael@0 | 74 | UParseError& parseError, |
michael@0 | 75 | UErrorCode& status) : |
michael@0 | 76 | Transliterator(id, adoptedFilter) { |
michael@0 | 77 | _construct(rules, direction,parseError,status); |
michael@0 | 78 | } |
michael@0 | 79 | |
michael@0 | 80 | /** |
michael@0 | 81 | * Constructs a new transliterator from the given rules. |
michael@0 | 82 | * @param id the id for the transliterator. |
michael@0 | 83 | * @param rules rules, separated by ';' |
michael@0 | 84 | * @param direction either FORWARD or REVERSE. |
michael@0 | 85 | * @param adoptedFilter the filter for this transliterator. |
michael@0 | 86 | * @param status Output param set to success/failure code. |
michael@0 | 87 | * @exception IllegalArgumentException if rules are malformed |
michael@0 | 88 | * or direction is invalid. |
michael@0 | 89 | */ |
michael@0 | 90 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
michael@0 | 91 | const UnicodeString& id, |
michael@0 | 92 | const UnicodeString& rules, |
michael@0 | 93 | UTransDirection direction, |
michael@0 | 94 | UnicodeFilter* adoptedFilter, |
michael@0 | 95 | UErrorCode& status) : |
michael@0 | 96 | Transliterator(id, adoptedFilter) { |
michael@0 | 97 | UParseError parseError; |
michael@0 | 98 | _construct(rules, direction,parseError, status); |
michael@0 | 99 | }*/ |
michael@0 | 100 | |
michael@0 | 101 | /** |
michael@0 | 102 | * Convenience constructor with no filter. |
michael@0 | 103 | */ |
michael@0 | 104 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
michael@0 | 105 | const UnicodeString& id, |
michael@0 | 106 | const UnicodeString& rules, |
michael@0 | 107 | UTransDirection direction, |
michael@0 | 108 | UErrorCode& status) : |
michael@0 | 109 | Transliterator(id, 0) { |
michael@0 | 110 | UParseError parseError; |
michael@0 | 111 | _construct(rules, direction,parseError, status); |
michael@0 | 112 | }*/ |
michael@0 | 113 | |
michael@0 | 114 | /** |
michael@0 | 115 | * Convenience constructor with no filter and FORWARD direction. |
michael@0 | 116 | */ |
michael@0 | 117 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
michael@0 | 118 | const UnicodeString& id, |
michael@0 | 119 | const UnicodeString& rules, |
michael@0 | 120 | UErrorCode& status) : |
michael@0 | 121 | Transliterator(id, 0) { |
michael@0 | 122 | UParseError parseError; |
michael@0 | 123 | _construct(rules, UTRANS_FORWARD, parseError, status); |
michael@0 | 124 | }*/ |
michael@0 | 125 | |
michael@0 | 126 | /** |
michael@0 | 127 | * Convenience constructor with FORWARD direction. |
michael@0 | 128 | */ |
michael@0 | 129 | /*RuleBasedTransliterator::RuleBasedTransliterator( |
michael@0 | 130 | const UnicodeString& id, |
michael@0 | 131 | const UnicodeString& rules, |
michael@0 | 132 | UnicodeFilter* adoptedFilter, |
michael@0 | 133 | UErrorCode& status) : |
michael@0 | 134 | Transliterator(id, adoptedFilter) { |
michael@0 | 135 | UParseError parseError; |
michael@0 | 136 | _construct(rules, UTRANS_FORWARD,parseError, status); |
michael@0 | 137 | }*/ |
michael@0 | 138 | |
michael@0 | 139 | RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
michael@0 | 140 | const TransliterationRuleData* theData, |
michael@0 | 141 | UnicodeFilter* adoptedFilter) : |
michael@0 | 142 | Transliterator(id, adoptedFilter), |
michael@0 | 143 | fData((TransliterationRuleData*)theData), // cast away const |
michael@0 | 144 | isDataOwned(FALSE) { |
michael@0 | 145 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
michael@0 | 146 | } |
michael@0 | 147 | |
michael@0 | 148 | /** |
michael@0 | 149 | * Internal constructor. |
michael@0 | 150 | */ |
michael@0 | 151 | RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, |
michael@0 | 152 | TransliterationRuleData* theData, |
michael@0 | 153 | UBool isDataAdopted) : |
michael@0 | 154 | Transliterator(id, 0), |
michael@0 | 155 | fData(theData), |
michael@0 | 156 | isDataOwned(isDataAdopted) { |
michael@0 | 157 | setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); |
michael@0 | 158 | } |
michael@0 | 159 | |
michael@0 | 160 | /** |
michael@0 | 161 | * Copy constructor. |
michael@0 | 162 | */ |
michael@0 | 163 | RuleBasedTransliterator::RuleBasedTransliterator( |
michael@0 | 164 | const RuleBasedTransliterator& other) : |
michael@0 | 165 | Transliterator(other), fData(other.fData), |
michael@0 | 166 | isDataOwned(other.isDataOwned) { |
michael@0 | 167 | |
michael@0 | 168 | // The data object may or may not be owned. If it is not owned we |
michael@0 | 169 | // share it; it is invariant. If it is owned, it's still |
michael@0 | 170 | // invariant, but we need to copy it to prevent double-deletion. |
michael@0 | 171 | // If this becomes a performance issue (if people do a lot of RBT |
michael@0 | 172 | // copying -- unlikely) we can reference count the data object. |
michael@0 | 173 | |
michael@0 | 174 | // Only do a deep copy if this is owned data, that is, data that |
michael@0 | 175 | // will be later deleted. System transliterators contain |
michael@0 | 176 | // non-owned data. |
michael@0 | 177 | if (isDataOwned) { |
michael@0 | 178 | fData = new TransliterationRuleData(*other.fData); |
michael@0 | 179 | } |
michael@0 | 180 | } |
michael@0 | 181 | |
michael@0 | 182 | /** |
michael@0 | 183 | * Destructor. |
michael@0 | 184 | */ |
michael@0 | 185 | RuleBasedTransliterator::~RuleBasedTransliterator() { |
michael@0 | 186 | // Delete the data object only if we own it. |
michael@0 | 187 | if (isDataOwned) { |
michael@0 | 188 | delete fData; |
michael@0 | 189 | } |
michael@0 | 190 | } |
michael@0 | 191 | |
michael@0 | 192 | Transliterator* // Covariant return NOT ALLOWED (for portability) |
michael@0 | 193 | RuleBasedTransliterator::clone(void) const { |
michael@0 | 194 | return new RuleBasedTransliterator(*this); |
michael@0 | 195 | } |
michael@0 | 196 | |
michael@0 | 197 | /** |
michael@0 | 198 | * Implements {@link Transliterator#handleTransliterate}. |
michael@0 | 199 | */ |
michael@0 | 200 | void |
michael@0 | 201 | RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
michael@0 | 202 | UBool isIncremental) const { |
michael@0 | 203 | /* We keep contextStart and contextLimit fixed the entire time, |
michael@0 | 204 | * relative to the text -- contextLimit may move numerically if |
michael@0 | 205 | * text is inserted or removed. The start offset moves toward |
michael@0 | 206 | * limit, with replacements happening under it. |
michael@0 | 207 | * |
michael@0 | 208 | * Example: rules 1. ab>x|y |
michael@0 | 209 | * 2. yc>z |
michael@0 | 210 | * |
michael@0 | 211 | * |eabcd begin - no match, advance start |
michael@0 | 212 | * e|abcd match rule 1 - change text & adjust start |
michael@0 | 213 | * ex|ycd match rule 2 - change text & adjust start |
michael@0 | 214 | * exz|d no match, advance start |
michael@0 | 215 | * exzd| done |
michael@0 | 216 | */ |
michael@0 | 217 | |
michael@0 | 218 | /* A rule like |
michael@0 | 219 | * a>b|a |
michael@0 | 220 | * creates an infinite loop. To prevent that, we put an arbitrary |
michael@0 | 221 | * limit on the number of iterations that we take, one that is |
michael@0 | 222 | * high enough that any reasonable rules are ok, but low enough to |
michael@0 | 223 | * prevent a server from hanging. The limit is 16 times the |
michael@0 | 224 | * number of characters n, unless n is so large that 16n exceeds a |
michael@0 | 225 | * uint32_t. |
michael@0 | 226 | */ |
michael@0 | 227 | uint32_t loopCount = 0; |
michael@0 | 228 | uint32_t loopLimit = index.limit - index.start; |
michael@0 | 229 | if (loopLimit >= 0x10000000) { |
michael@0 | 230 | loopLimit = 0xFFFFFFFF; |
michael@0 | 231 | } else { |
michael@0 | 232 | loopLimit <<= 4; |
michael@0 | 233 | } |
michael@0 | 234 | |
michael@0 | 235 | // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent |
michael@0 | 236 | // operations must be prevented. |
michael@0 | 237 | // A Complication: compound transliterators can result in recursive entries to this |
michael@0 | 238 | // function, sometimes with different "This" objects, always with the same text. |
michael@0 | 239 | // Double-locking must be prevented in these cases. |
michael@0 | 240 | // |
michael@0 | 241 | |
michael@0 | 242 | // If the transliteration data is exclusively owned by this transliterator object, |
michael@0 | 243 | // we don't need to do any locking. No sharing between transliterators is possible, |
michael@0 | 244 | // so no concurrent access from multiple threads is possible. |
michael@0 | 245 | UBool lockedMutexAtThisLevel = FALSE; |
michael@0 | 246 | if (isDataOwned == FALSE) { |
michael@0 | 247 | // Test whether this request is operating on the same text string as some |
michael@0 | 248 | // some other transliteration that is still in progress and holding the |
michael@0 | 249 | // transliteration mutex. If so, do not lock the transliteration |
michael@0 | 250 | // mutex again. |
michael@0 | 251 | // TODO(andy): Need a better scheme for handling this. |
michael@0 | 252 | UBool needToLock; |
michael@0 | 253 | umtx_lock(NULL); |
michael@0 | 254 | needToLock = (&text != gLockedText); |
michael@0 | 255 | umtx_unlock(NULL); |
michael@0 | 256 | if (needToLock) { |
michael@0 | 257 | umtx_lock(&transliteratorDataMutex); |
michael@0 | 258 | gLockedText = &text; |
michael@0 | 259 | lockedMutexAtThisLevel = TRUE; |
michael@0 | 260 | } |
michael@0 | 261 | } |
michael@0 | 262 | |
michael@0 | 263 | // Check to make sure we don't dereference a null pointer. |
michael@0 | 264 | if (fData != NULL) { |
michael@0 | 265 | while (index.start < index.limit && |
michael@0 | 266 | loopCount <= loopLimit && |
michael@0 | 267 | fData->ruleSet.transliterate(text, index, isIncremental)) { |
michael@0 | 268 | ++loopCount; |
michael@0 | 269 | } |
michael@0 | 270 | } |
michael@0 | 271 | if (lockedMutexAtThisLevel) { |
michael@0 | 272 | gLockedText = NULL; |
michael@0 | 273 | umtx_unlock(&transliteratorDataMutex); |
michael@0 | 274 | } |
michael@0 | 275 | } |
michael@0 | 276 | |
michael@0 | 277 | UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, |
michael@0 | 278 | UBool escapeUnprintable) const { |
michael@0 | 279 | return fData->ruleSet.toRules(rulesSource, escapeUnprintable); |
michael@0 | 280 | } |
michael@0 | 281 | |
michael@0 | 282 | /** |
michael@0 | 283 | * Implement Transliterator framework |
michael@0 | 284 | */ |
michael@0 | 285 | void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
michael@0 | 286 | fData->ruleSet.getSourceTargetSet(result, FALSE); |
michael@0 | 287 | } |
michael@0 | 288 | |
michael@0 | 289 | /** |
michael@0 | 290 | * Override Transliterator framework |
michael@0 | 291 | */ |
michael@0 | 292 | UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { |
michael@0 | 293 | return fData->ruleSet.getSourceTargetSet(result, TRUE); |
michael@0 | 294 | } |
michael@0 | 295 | |
michael@0 | 296 | U_NAMESPACE_END |
michael@0 | 297 | |
michael@0 | 298 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |