intl/icu/source/i18n/rbt.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 1999-2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 11/17/99 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_TRANSLITERATION
michael@0 14
michael@0 15 #include "unicode/rep.h"
michael@0 16 #include "unicode/uniset.h"
michael@0 17 #include "rbt_pars.h"
michael@0 18 #include "rbt_data.h"
michael@0 19 #include "rbt_rule.h"
michael@0 20 #include "rbt.h"
michael@0 21 #include "umutex.h"
michael@0 22
michael@0 23 U_NAMESPACE_BEGIN
michael@0 24
michael@0 25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
michael@0 26
michael@0 27 static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER;
michael@0 28 static Replaceable *gLockedText = NULL;
michael@0 29
michael@0 30 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
michael@0 31 UTransDirection direction,
michael@0 32 UParseError& parseError,
michael@0 33 UErrorCode& status) {
michael@0 34 fData = 0;
michael@0 35 isDataOwned = TRUE;
michael@0 36 if (U_FAILURE(status)) {
michael@0 37 return;
michael@0 38 }
michael@0 39
michael@0 40 TransliteratorParser parser(status);
michael@0 41 parser.parse(rules, direction, parseError, status);
michael@0 42 if (U_FAILURE(status)) {
michael@0 43 return;
michael@0 44 }
michael@0 45
michael@0 46 if (parser.idBlockVector.size() != 0 ||
michael@0 47 parser.compoundFilter != NULL ||
michael@0 48 parser.dataVector.size() == 0) {
michael@0 49 status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
michael@0 50 return;
michael@0 51 }
michael@0 52
michael@0 53 fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
michael@0 54 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
michael@0 55 }
michael@0 56
michael@0 57 /**
michael@0 58 * Constructs a new transliterator from the given rules.
michael@0 59 * @param id the id for the transliterator.
michael@0 60 * @param rules rules, separated by ';'
michael@0 61 * @param direction either FORWARD or REVERSE.
michael@0 62 * @param adoptedFilter the filter for this transliterator.
michael@0 63 * @param parseError Struct to recieve information on position
michael@0 64 * of error if an error is encountered
michael@0 65 * @param status Output param set to success/failure code.
michael@0 66 * @exception IllegalArgumentException if rules are malformed
michael@0 67 * or direction is invalid.
michael@0 68 */
michael@0 69 RuleBasedTransliterator::RuleBasedTransliterator(
michael@0 70 const UnicodeString& id,
michael@0 71 const UnicodeString& rules,
michael@0 72 UTransDirection direction,
michael@0 73 UnicodeFilter* adoptedFilter,
michael@0 74 UParseError& parseError,
michael@0 75 UErrorCode& status) :
michael@0 76 Transliterator(id, adoptedFilter) {
michael@0 77 _construct(rules, direction,parseError,status);
michael@0 78 }
michael@0 79
michael@0 80 /**
michael@0 81 * Constructs a new transliterator from the given rules.
michael@0 82 * @param id the id for the transliterator.
michael@0 83 * @param rules rules, separated by ';'
michael@0 84 * @param direction either FORWARD or REVERSE.
michael@0 85 * @param adoptedFilter the filter for this transliterator.
michael@0 86 * @param status Output param set to success/failure code.
michael@0 87 * @exception IllegalArgumentException if rules are malformed
michael@0 88 * or direction is invalid.
michael@0 89 */
michael@0 90 /*RuleBasedTransliterator::RuleBasedTransliterator(
michael@0 91 const UnicodeString& id,
michael@0 92 const UnicodeString& rules,
michael@0 93 UTransDirection direction,
michael@0 94 UnicodeFilter* adoptedFilter,
michael@0 95 UErrorCode& status) :
michael@0 96 Transliterator(id, adoptedFilter) {
michael@0 97 UParseError parseError;
michael@0 98 _construct(rules, direction,parseError, status);
michael@0 99 }*/
michael@0 100
michael@0 101 /**
michael@0 102 * Convenience constructor with no filter.
michael@0 103 */
michael@0 104 /*RuleBasedTransliterator::RuleBasedTransliterator(
michael@0 105 const UnicodeString& id,
michael@0 106 const UnicodeString& rules,
michael@0 107 UTransDirection direction,
michael@0 108 UErrorCode& status) :
michael@0 109 Transliterator(id, 0) {
michael@0 110 UParseError parseError;
michael@0 111 _construct(rules, direction,parseError, status);
michael@0 112 }*/
michael@0 113
michael@0 114 /**
michael@0 115 * Convenience constructor with no filter and FORWARD direction.
michael@0 116 */
michael@0 117 /*RuleBasedTransliterator::RuleBasedTransliterator(
michael@0 118 const UnicodeString& id,
michael@0 119 const UnicodeString& rules,
michael@0 120 UErrorCode& status) :
michael@0 121 Transliterator(id, 0) {
michael@0 122 UParseError parseError;
michael@0 123 _construct(rules, UTRANS_FORWARD, parseError, status);
michael@0 124 }*/
michael@0 125
michael@0 126 /**
michael@0 127 * Convenience constructor with FORWARD direction.
michael@0 128 */
michael@0 129 /*RuleBasedTransliterator::RuleBasedTransliterator(
michael@0 130 const UnicodeString& id,
michael@0 131 const UnicodeString& rules,
michael@0 132 UnicodeFilter* adoptedFilter,
michael@0 133 UErrorCode& status) :
michael@0 134 Transliterator(id, adoptedFilter) {
michael@0 135 UParseError parseError;
michael@0 136 _construct(rules, UTRANS_FORWARD,parseError, status);
michael@0 137 }*/
michael@0 138
michael@0 139 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
michael@0 140 const TransliterationRuleData* theData,
michael@0 141 UnicodeFilter* adoptedFilter) :
michael@0 142 Transliterator(id, adoptedFilter),
michael@0 143 fData((TransliterationRuleData*)theData), // cast away const
michael@0 144 isDataOwned(FALSE) {
michael@0 145 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
michael@0 146 }
michael@0 147
michael@0 148 /**
michael@0 149 * Internal constructor.
michael@0 150 */
michael@0 151 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
michael@0 152 TransliterationRuleData* theData,
michael@0 153 UBool isDataAdopted) :
michael@0 154 Transliterator(id, 0),
michael@0 155 fData(theData),
michael@0 156 isDataOwned(isDataAdopted) {
michael@0 157 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
michael@0 158 }
michael@0 159
michael@0 160 /**
michael@0 161 * Copy constructor.
michael@0 162 */
michael@0 163 RuleBasedTransliterator::RuleBasedTransliterator(
michael@0 164 const RuleBasedTransliterator& other) :
michael@0 165 Transliterator(other), fData(other.fData),
michael@0 166 isDataOwned(other.isDataOwned) {
michael@0 167
michael@0 168 // The data object may or may not be owned. If it is not owned we
michael@0 169 // share it; it is invariant. If it is owned, it's still
michael@0 170 // invariant, but we need to copy it to prevent double-deletion.
michael@0 171 // If this becomes a performance issue (if people do a lot of RBT
michael@0 172 // copying -- unlikely) we can reference count the data object.
michael@0 173
michael@0 174 // Only do a deep copy if this is owned data, that is, data that
michael@0 175 // will be later deleted. System transliterators contain
michael@0 176 // non-owned data.
michael@0 177 if (isDataOwned) {
michael@0 178 fData = new TransliterationRuleData(*other.fData);
michael@0 179 }
michael@0 180 }
michael@0 181
michael@0 182 /**
michael@0 183 * Destructor.
michael@0 184 */
michael@0 185 RuleBasedTransliterator::~RuleBasedTransliterator() {
michael@0 186 // Delete the data object only if we own it.
michael@0 187 if (isDataOwned) {
michael@0 188 delete fData;
michael@0 189 }
michael@0 190 }
michael@0 191
michael@0 192 Transliterator* // Covariant return NOT ALLOWED (for portability)
michael@0 193 RuleBasedTransliterator::clone(void) const {
michael@0 194 return new RuleBasedTransliterator(*this);
michael@0 195 }
michael@0 196
michael@0 197 /**
michael@0 198 * Implements {@link Transliterator#handleTransliterate}.
michael@0 199 */
michael@0 200 void
michael@0 201 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
michael@0 202 UBool isIncremental) const {
michael@0 203 /* We keep contextStart and contextLimit fixed the entire time,
michael@0 204 * relative to the text -- contextLimit may move numerically if
michael@0 205 * text is inserted or removed. The start offset moves toward
michael@0 206 * limit, with replacements happening under it.
michael@0 207 *
michael@0 208 * Example: rules 1. ab>x|y
michael@0 209 * 2. yc>z
michael@0 210 *
michael@0 211 * |eabcd begin - no match, advance start
michael@0 212 * e|abcd match rule 1 - change text & adjust start
michael@0 213 * ex|ycd match rule 2 - change text & adjust start
michael@0 214 * exz|d no match, advance start
michael@0 215 * exzd| done
michael@0 216 */
michael@0 217
michael@0 218 /* A rule like
michael@0 219 * a>b|a
michael@0 220 * creates an infinite loop. To prevent that, we put an arbitrary
michael@0 221 * limit on the number of iterations that we take, one that is
michael@0 222 * high enough that any reasonable rules are ok, but low enough to
michael@0 223 * prevent a server from hanging. The limit is 16 times the
michael@0 224 * number of characters n, unless n is so large that 16n exceeds a
michael@0 225 * uint32_t.
michael@0 226 */
michael@0 227 uint32_t loopCount = 0;
michael@0 228 uint32_t loopLimit = index.limit - index.start;
michael@0 229 if (loopLimit >= 0x10000000) {
michael@0 230 loopLimit = 0xFFFFFFFF;
michael@0 231 } else {
michael@0 232 loopLimit <<= 4;
michael@0 233 }
michael@0 234
michael@0 235 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
michael@0 236 // operations must be prevented.
michael@0 237 // A Complication: compound transliterators can result in recursive entries to this
michael@0 238 // function, sometimes with different "This" objects, always with the same text.
michael@0 239 // Double-locking must be prevented in these cases.
michael@0 240 //
michael@0 241
michael@0 242 // If the transliteration data is exclusively owned by this transliterator object,
michael@0 243 // we don't need to do any locking. No sharing between transliterators is possible,
michael@0 244 // so no concurrent access from multiple threads is possible.
michael@0 245 UBool lockedMutexAtThisLevel = FALSE;
michael@0 246 if (isDataOwned == FALSE) {
michael@0 247 // Test whether this request is operating on the same text string as some
michael@0 248 // some other transliteration that is still in progress and holding the
michael@0 249 // transliteration mutex. If so, do not lock the transliteration
michael@0 250 // mutex again.
michael@0 251 // TODO(andy): Need a better scheme for handling this.
michael@0 252 UBool needToLock;
michael@0 253 umtx_lock(NULL);
michael@0 254 needToLock = (&text != gLockedText);
michael@0 255 umtx_unlock(NULL);
michael@0 256 if (needToLock) {
michael@0 257 umtx_lock(&transliteratorDataMutex);
michael@0 258 gLockedText = &text;
michael@0 259 lockedMutexAtThisLevel = TRUE;
michael@0 260 }
michael@0 261 }
michael@0 262
michael@0 263 // Check to make sure we don't dereference a null pointer.
michael@0 264 if (fData != NULL) {
michael@0 265 while (index.start < index.limit &&
michael@0 266 loopCount <= loopLimit &&
michael@0 267 fData->ruleSet.transliterate(text, index, isIncremental)) {
michael@0 268 ++loopCount;
michael@0 269 }
michael@0 270 }
michael@0 271 if (lockedMutexAtThisLevel) {
michael@0 272 gLockedText = NULL;
michael@0 273 umtx_unlock(&transliteratorDataMutex);
michael@0 274 }
michael@0 275 }
michael@0 276
michael@0 277 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
michael@0 278 UBool escapeUnprintable) const {
michael@0 279 return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
michael@0 280 }
michael@0 281
michael@0 282 /**
michael@0 283 * Implement Transliterator framework
michael@0 284 */
michael@0 285 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
michael@0 286 fData->ruleSet.getSourceTargetSet(result, FALSE);
michael@0 287 }
michael@0 288
michael@0 289 /**
michael@0 290 * Override Transliterator framework
michael@0 291 */
michael@0 292 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
michael@0 293 return fData->ruleSet.getSourceTargetSet(result, TRUE);
michael@0 294 }
michael@0 295
michael@0 296 U_NAMESPACE_END
michael@0 297
michael@0 298 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial