1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/rbt.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,298 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 1999-2013, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 11/17/99 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "unicode/rep.h" 1.19 +#include "unicode/uniset.h" 1.20 +#include "rbt_pars.h" 1.21 +#include "rbt_data.h" 1.22 +#include "rbt_rule.h" 1.23 +#include "rbt.h" 1.24 +#include "umutex.h" 1.25 + 1.26 +U_NAMESPACE_BEGIN 1.27 + 1.28 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) 1.29 + 1.30 +static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER; 1.31 +static Replaceable *gLockedText = NULL; 1.32 + 1.33 +void RuleBasedTransliterator::_construct(const UnicodeString& rules, 1.34 + UTransDirection direction, 1.35 + UParseError& parseError, 1.36 + UErrorCode& status) { 1.37 + fData = 0; 1.38 + isDataOwned = TRUE; 1.39 + if (U_FAILURE(status)) { 1.40 + return; 1.41 + } 1.42 + 1.43 + TransliteratorParser parser(status); 1.44 + parser.parse(rules, direction, parseError, status); 1.45 + if (U_FAILURE(status)) { 1.46 + return; 1.47 + } 1.48 + 1.49 + if (parser.idBlockVector.size() != 0 || 1.50 + parser.compoundFilter != NULL || 1.51 + parser.dataVector.size() == 0) { 1.52 + status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT 1.53 + return; 1.54 + } 1.55 + 1.56 + fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); 1.57 + setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 1.58 +} 1.59 + 1.60 +/** 1.61 + * Constructs a new transliterator from the given rules. 1.62 + * @param id the id for the transliterator. 1.63 + * @param rules rules, separated by ';' 1.64 + * @param direction either FORWARD or REVERSE. 1.65 + * @param adoptedFilter the filter for this transliterator. 1.66 + * @param parseError Struct to recieve information on position 1.67 + * of error if an error is encountered 1.68 + * @param status Output param set to success/failure code. 1.69 + * @exception IllegalArgumentException if rules are malformed 1.70 + * or direction is invalid. 1.71 + */ 1.72 +RuleBasedTransliterator::RuleBasedTransliterator( 1.73 + const UnicodeString& id, 1.74 + const UnicodeString& rules, 1.75 + UTransDirection direction, 1.76 + UnicodeFilter* adoptedFilter, 1.77 + UParseError& parseError, 1.78 + UErrorCode& status) : 1.79 + Transliterator(id, adoptedFilter) { 1.80 + _construct(rules, direction,parseError,status); 1.81 +} 1.82 + 1.83 +/** 1.84 + * Constructs a new transliterator from the given rules. 1.85 + * @param id the id for the transliterator. 1.86 + * @param rules rules, separated by ';' 1.87 + * @param direction either FORWARD or REVERSE. 1.88 + * @param adoptedFilter the filter for this transliterator. 1.89 + * @param status Output param set to success/failure code. 1.90 + * @exception IllegalArgumentException if rules are malformed 1.91 + * or direction is invalid. 1.92 + */ 1.93 +/*RuleBasedTransliterator::RuleBasedTransliterator( 1.94 + const UnicodeString& id, 1.95 + const UnicodeString& rules, 1.96 + UTransDirection direction, 1.97 + UnicodeFilter* adoptedFilter, 1.98 + UErrorCode& status) : 1.99 + Transliterator(id, adoptedFilter) { 1.100 + UParseError parseError; 1.101 + _construct(rules, direction,parseError, status); 1.102 +}*/ 1.103 + 1.104 +/** 1.105 + * Covenience constructor with no filter. 1.106 + */ 1.107 +/*RuleBasedTransliterator::RuleBasedTransliterator( 1.108 + const UnicodeString& id, 1.109 + const UnicodeString& rules, 1.110 + UTransDirection direction, 1.111 + UErrorCode& status) : 1.112 + Transliterator(id, 0) { 1.113 + UParseError parseError; 1.114 + _construct(rules, direction,parseError, status); 1.115 +}*/ 1.116 + 1.117 +/** 1.118 + * Covenience constructor with no filter and FORWARD direction. 1.119 + */ 1.120 +/*RuleBasedTransliterator::RuleBasedTransliterator( 1.121 + const UnicodeString& id, 1.122 + const UnicodeString& rules, 1.123 + UErrorCode& status) : 1.124 + Transliterator(id, 0) { 1.125 + UParseError parseError; 1.126 + _construct(rules, UTRANS_FORWARD, parseError, status); 1.127 +}*/ 1.128 + 1.129 +/** 1.130 + * Covenience constructor with FORWARD direction. 1.131 + */ 1.132 +/*RuleBasedTransliterator::RuleBasedTransliterator( 1.133 + const UnicodeString& id, 1.134 + const UnicodeString& rules, 1.135 + UnicodeFilter* adoptedFilter, 1.136 + UErrorCode& status) : 1.137 + Transliterator(id, adoptedFilter) { 1.138 + UParseError parseError; 1.139 + _construct(rules, UTRANS_FORWARD,parseError, status); 1.140 +}*/ 1.141 + 1.142 +RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, 1.143 + const TransliterationRuleData* theData, 1.144 + UnicodeFilter* adoptedFilter) : 1.145 + Transliterator(id, adoptedFilter), 1.146 + fData((TransliterationRuleData*)theData), // cast away const 1.147 + isDataOwned(FALSE) { 1.148 + setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 1.149 +} 1.150 + 1.151 +/** 1.152 + * Internal constructor. 1.153 + */ 1.154 +RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, 1.155 + TransliterationRuleData* theData, 1.156 + UBool isDataAdopted) : 1.157 + Transliterator(id, 0), 1.158 + fData(theData), 1.159 + isDataOwned(isDataAdopted) { 1.160 + setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 1.161 +} 1.162 + 1.163 +/** 1.164 + * Copy constructor. 1.165 + */ 1.166 +RuleBasedTransliterator::RuleBasedTransliterator( 1.167 + const RuleBasedTransliterator& other) : 1.168 + Transliterator(other), fData(other.fData), 1.169 + isDataOwned(other.isDataOwned) { 1.170 + 1.171 + // The data object may or may not be owned. If it is not owned we 1.172 + // share it; it is invariant. If it is owned, it's still 1.173 + // invariant, but we need to copy it to prevent double-deletion. 1.174 + // If this becomes a performance issue (if people do a lot of RBT 1.175 + // copying -- unlikely) we can reference count the data object. 1.176 + 1.177 + // Only do a deep copy if this is owned data, that is, data that 1.178 + // will be later deleted. System transliterators contain 1.179 + // non-owned data. 1.180 + if (isDataOwned) { 1.181 + fData = new TransliterationRuleData(*other.fData); 1.182 + } 1.183 +} 1.184 + 1.185 +/** 1.186 + * Destructor. 1.187 + */ 1.188 +RuleBasedTransliterator::~RuleBasedTransliterator() { 1.189 + // Delete the data object only if we own it. 1.190 + if (isDataOwned) { 1.191 + delete fData; 1.192 + } 1.193 +} 1.194 + 1.195 +Transliterator* // Covariant return NOT ALLOWED (for portability) 1.196 +RuleBasedTransliterator::clone(void) const { 1.197 + return new RuleBasedTransliterator(*this); 1.198 +} 1.199 + 1.200 +/** 1.201 + * Implements {@link Transliterator#handleTransliterate}. 1.202 + */ 1.203 +void 1.204 +RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, 1.205 + UBool isIncremental) const { 1.206 + /* We keep contextStart and contextLimit fixed the entire time, 1.207 + * relative to the text -- contextLimit may move numerically if 1.208 + * text is inserted or removed. The start offset moves toward 1.209 + * limit, with replacements happening under it. 1.210 + * 1.211 + * Example: rules 1. ab>x|y 1.212 + * 2. yc>z 1.213 + * 1.214 + * |eabcd begin - no match, advance start 1.215 + * e|abcd match rule 1 - change text & adjust start 1.216 + * ex|ycd match rule 2 - change text & adjust start 1.217 + * exz|d no match, advance start 1.218 + * exzd| done 1.219 + */ 1.220 + 1.221 + /* A rule like 1.222 + * a>b|a 1.223 + * creates an infinite loop. To prevent that, we put an arbitrary 1.224 + * limit on the number of iterations that we take, one that is 1.225 + * high enough that any reasonable rules are ok, but low enough to 1.226 + * prevent a server from hanging. The limit is 16 times the 1.227 + * number of characters n, unless n is so large that 16n exceeds a 1.228 + * uint32_t. 1.229 + */ 1.230 + uint32_t loopCount = 0; 1.231 + uint32_t loopLimit = index.limit - index.start; 1.232 + if (loopLimit >= 0x10000000) { 1.233 + loopLimit = 0xFFFFFFFF; 1.234 + } else { 1.235 + loopLimit <<= 4; 1.236 + } 1.237 + 1.238 + // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent 1.239 + // operations must be prevented. 1.240 + // A Complication: compound transliterators can result in recursive entries to this 1.241 + // function, sometimes with different "This" objects, always with the same text. 1.242 + // Double-locking must be prevented in these cases. 1.243 + // 1.244 + 1.245 + // If the transliteration data is exclusively owned by this transliterator object, 1.246 + // we don't need to do any locking. No sharing between transliterators is possible, 1.247 + // so no concurrent access from multiple threads is possible. 1.248 + UBool lockedMutexAtThisLevel = FALSE; 1.249 + if (isDataOwned == FALSE) { 1.250 + // Test whether this request is operating on the same text string as some 1.251 + // some other transliteration that is still in progress and holding the 1.252 + // transliteration mutex. If so, do not lock the transliteration 1.253 + // mutex again. 1.254 + // TODO(andy): Need a better scheme for handling this. 1.255 + UBool needToLock; 1.256 + umtx_lock(NULL); 1.257 + needToLock = (&text != gLockedText); 1.258 + umtx_unlock(NULL); 1.259 + if (needToLock) { 1.260 + umtx_lock(&transliteratorDataMutex); 1.261 + gLockedText = &text; 1.262 + lockedMutexAtThisLevel = TRUE; 1.263 + } 1.264 + } 1.265 + 1.266 + // Check to make sure we don't dereference a null pointer. 1.267 + if (fData != NULL) { 1.268 + while (index.start < index.limit && 1.269 + loopCount <= loopLimit && 1.270 + fData->ruleSet.transliterate(text, index, isIncremental)) { 1.271 + ++loopCount; 1.272 + } 1.273 + } 1.274 + if (lockedMutexAtThisLevel) { 1.275 + gLockedText = NULL; 1.276 + umtx_unlock(&transliteratorDataMutex); 1.277 + } 1.278 +} 1.279 + 1.280 +UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, 1.281 + UBool escapeUnprintable) const { 1.282 + return fData->ruleSet.toRules(rulesSource, escapeUnprintable); 1.283 +} 1.284 + 1.285 +/** 1.286 + * Implement Transliterator framework 1.287 + */ 1.288 +void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { 1.289 + fData->ruleSet.getSourceTargetSet(result, FALSE); 1.290 +} 1.291 + 1.292 +/** 1.293 + * Override Transliterator framework 1.294 + */ 1.295 +UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { 1.296 + return fData->ruleSet.getSourceTargetSet(result, TRUE); 1.297 +} 1.298 + 1.299 +U_NAMESPACE_END 1.300 + 1.301 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */