intl/icu/source/i18n/rbt.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/rbt.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,298 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 1999-2013, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   11/17/99    aliu        Creation.
    1.11 +**********************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "unicode/rep.h"
    1.19 +#include "unicode/uniset.h"
    1.20 +#include "rbt_pars.h"
    1.21 +#include "rbt_data.h"
    1.22 +#include "rbt_rule.h"
    1.23 +#include "rbt.h"
    1.24 +#include "umutex.h"
    1.25 +
    1.26 +U_NAMESPACE_BEGIN
    1.27 +
    1.28 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
    1.29 +
    1.30 +static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER; // held by handleTransliterate() while it uses non-owned rule data
    1.31 +static Replaceable *gLockedText = NULL; // text being processed by the holder of transliteratorDataMutex, else NULL
    1.32 +
    1.33 +// Common construction logic: parses the rules and adopts the resulting data.
    1.34 +// On failure fData is left NULL and the reason is reported through status.
    1.35 +void RuleBasedTransliterator::_construct(const UnicodeString& rules,
    1.36 +                                         UTransDirection direction,
    1.37 +                                         UParseError& parseError,
    1.38 +                                         UErrorCode& status) {
    1.39 +    isDataOwned = TRUE;
    1.40 +    fData = NULL;
    1.41 +    if (U_FAILURE(status)) { return; }
    1.42 +    TransliteratorParser parser(status);
    1.43 +    parser.parse(rules, direction, parseError, status);
    1.44 +    if (U_FAILURE(status)) { return; }
    1.45 +
    1.46 +    // Only a single block of plain rules is acceptable: ::ID blocks and
    1.47 +    // compound filters are disallowed in RBT, and rule data must exist.
    1.48 +    const UBool hasIdBlocks = (parser.idBlockVector.size() != 0);
    1.49 +    const UBool hasCompoundFilter = (parser.compoundFilter != NULL);
    1.50 +    const UBool hasNoData = (parser.dataVector.size() == 0);
    1.51 +    if (hasIdBlocks || hasCompoundFilter || hasNoData) {
    1.52 +        status = U_INVALID_RBT_SYNTAX;
    1.53 +        return;
    1.54 +    }
    1.55 +
    1.56 +    fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
    1.57 +    setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
    1.58 +}
    1.59 +
    1.60 +/**
    1.61 + * Builds a rule-based transliterator by parsing a rule string.
    1.62 + *
    1.63 + * @param id            the id for the transliterator.
    1.64 + * @param rules         rules, separated by ';'
    1.65 + * @param direction     either FORWARD or REVERSE.
    1.66 + * @param adoptedFilter the filter for this transliterator; it is handed
    1.67 + *                      to the Transliterator base class.
    1.68 + * @param parseError    Struct to receive information on the position
    1.69 + *                      of the error if one is encountered.
    1.70 + * @param status        Output param set to success/failure code; malformed
    1.71 + *                      rules are reported here rather than by exception.
    1.72 + */
    1.73 +RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
    1.74 +                                                 const UnicodeString& rules,
    1.75 +                                                 UTransDirection direction,
    1.76 +                                                 UnicodeFilter* adoptedFilter,
    1.77 +                                                 UParseError& parseError,
    1.78 +                                                 UErrorCode& status)
    1.79 +    : Transliterator(id, adoptedFilter) {
    1.80 +    _construct(rules, direction, parseError, status);
    1.81 +}
    1.82 +
    1.83 +/**
    1.84 + * Constructs a new transliterator from the given rules.
    1.85 + * @param id            the id for the transliterator.
    1.86 + * @param rules         rules, separated by ';'
    1.87 + * @param direction     either FORWARD or REVERSE.
    1.88 + * @param adoptedFilter the filter for this transliterator.
    1.89 + * @param status        Output param set to success/failure code.
    1.90 + * @exception IllegalArgumentException if rules are malformed
    1.91 + * or direction is invalid.
    1.92 + */
    1.93 +/*RuleBasedTransliterator::RuleBasedTransliterator(
    1.94 +                            const UnicodeString& id,
    1.95 +                            const UnicodeString& rules,
    1.96 +                            UTransDirection direction,
    1.97 +                            UnicodeFilter* adoptedFilter,
    1.98 +                            UErrorCode& status) :
    1.99 +    Transliterator(id, adoptedFilter) {
   1.100 +    UParseError parseError;
   1.101 +    _construct(rules, direction,parseError, status);
   1.102 +}*/
   1.103 +
   1.104 +/**
   1.105 + * Convenience constructor with no filter.
   1.106 + */
   1.107 +/*RuleBasedTransliterator::RuleBasedTransliterator(
   1.108 +                            const UnicodeString& id,
   1.109 +                            const UnicodeString& rules,
   1.110 +                            UTransDirection direction,
   1.111 +                            UErrorCode& status) :
   1.112 +    Transliterator(id, 0) {
   1.113 +    UParseError parseError;
   1.114 +    _construct(rules, direction,parseError, status);
   1.115 +}*/
   1.116 +
   1.117 +/**
   1.118 + * Convenience constructor with no filter and FORWARD direction.
   1.119 + */
   1.120 +/*RuleBasedTransliterator::RuleBasedTransliterator(
   1.121 +                            const UnicodeString& id,
   1.122 +                            const UnicodeString& rules,
   1.123 +                            UErrorCode& status) :
   1.124 +    Transliterator(id, 0) {
   1.125 +    UParseError parseError;
   1.126 +    _construct(rules, UTRANS_FORWARD, parseError, status);
   1.127 +}*/
   1.128 +
   1.129 +/**
   1.130 + * Convenience constructor with FORWARD direction.
   1.131 + */
   1.132 +/*RuleBasedTransliterator::RuleBasedTransliterator(
   1.133 +                            const UnicodeString& id,
   1.134 +                            const UnicodeString& rules,
   1.135 +                            UnicodeFilter* adoptedFilter,
   1.136 +                            UErrorCode& status) :
   1.137 +    Transliterator(id, adoptedFilter) {
   1.138 +    UParseError parseError;
   1.139 +    _construct(rules, UTRANS_FORWARD,parseError, status);
   1.140 +}*/
   1.141 +
   1.142 +// Wraps caller-provided rule data without taking ownership of it (isDataOwned is FALSE).
   1.143 +RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
   1.144 +        const TransliterationRuleData* theData, UnicodeFilter* adoptedFilter)
   1.145 +    : Transliterator(id, adoptedFilter),
   1.146 +      fData(const_cast<TransliterationRuleData*>(theData)),
   1.147 +      isDataOwned(FALSE) {
   1.148 +    setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
   1.149 +}
   1.150 +
   1.151 +/**
   1.152 + * Internal constructor: no filter; ownership of theData follows isDataAdopted.
   1.153 + */
   1.154 +RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
   1.155 +                                                 TransliterationRuleData* theData,
   1.156 +                                                 UBool isDataAdopted)
   1.157 +    : Transliterator(id, NULL),
   1.158 +      fData(theData),
   1.159 +      isDataOwned(isDataAdopted) {
   1.160 +    setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
   1.161 +}
   1.162 +
   1.163 +/**
   1.164 + * Copy constructor.
   1.165 + *
   1.166 + * Data that is not owned (system transliterators contain such data) is
   1.167 + * invariant and is simply shared with the source object.  Owned data is
   1.168 + * invariant too, but it will later be deleted by its owner, so it is
   1.169 + * deep-copied here to prevent double-deletion.  If RBT copying ever
   1.170 + * becomes a performance issue (unlikely), the data object could be
   1.171 + * reference counted instead.
   1.172 + */
   1.173 +RuleBasedTransliterator::RuleBasedTransliterator(
   1.174 +        const RuleBasedTransliterator& other) :
   1.175 +    Transliterator(other), fData(other.fData),
   1.176 +    isDataOwned(other.isDataOwned) {
   1.177 +    // A source whose construction failed owns nothing: its fData is NULL.
   1.178 +    // There is nothing to deep-copy in that case, and dereferencing the
   1.179 +    // pointer would crash, so the NULL is simply carried over.
   1.180 +    if (isDataOwned && other.fData != NULL) {
   1.181 +        fData = new TransliterationRuleData(*other.fData);
   1.182 +    }
   1.183 +}
   1.184 +
   1.185 +/**
   1.186 + * Destructor.  Rule data that is merely shared is left alone.
   1.187 + */
   1.188 +RuleBasedTransliterator::~RuleBasedTransliterator() {
   1.189 +    if (!isDataOwned) {
   1.190 +        return;
   1.191 +    }
   1.192 +    delete fData;
   1.193 +}
   1.194 +
   1.195 +// Returns a heap-allocated copy.  Covariant return NOT ALLOWED (for portability).
   1.196 +Transliterator* RuleBasedTransliterator::clone(void) const {
   1.197 +    return new RuleBasedTransliterator(*this);
   1.198 +}
   1.199 +
   1.200 +/**
   1.201 + * Implements {@link Transliterator#handleTransliterate}: applies the rule set
   1.202 + * between index.start and index.limit, serializing use of shared rule data.
   1.203 + */
   1.204 +void
   1.205 +RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
   1.206 +                                             UBool isIncremental) const {
   1.207 +    /* We keep contextStart and contextLimit fixed the entire time,
   1.208 +     * relative to the text -- contextLimit may move numerically if
   1.209 +     * text is inserted or removed.  The start offset moves toward
   1.210 +     * limit, with replacements happening under it.
   1.211 +     *
   1.212 +     * Example: rules 1. ab>x|y
   1.213 +     *                2. yc>z
   1.214 +     *
   1.215 +     * |eabcd   begin - no match, advance start
   1.216 +     * e|abcd   match rule 1 - change text & adjust start
   1.217 +     * ex|ycd   match rule 2 - change text & adjust start
   1.218 +     * exz|d    no match, advance start
   1.219 +     * exzd|    done
   1.220 +     */
   1.221 +
   1.222 +    /* A rule like
   1.223 +     *   a>b|a
   1.224 +     * creates an infinite loop. To prevent that, we put an arbitrary
   1.225 +     * limit on the number of iterations that we take, one that is
   1.226 +     * high enough that any reasonable rules are ok, but low enough to
   1.227 +     * prevent a server from hanging.  The limit is 16 times the
   1.228 +     * number of characters n, unless n is so large that 16n exceeds a
   1.229 +     * uint32_t.
   1.230 +     */
   1.231 +    uint32_t loopCount = 0;
   1.232 +    uint32_t loopLimit = index.limit - index.start;
   1.233 +    if (loopLimit >= 0x10000000) {
   1.234 +        loopLimit = 0xFFFFFFFF;
   1.235 +    } else {
   1.236 +        loopLimit <<= 4;
   1.237 +    }
   1.238 +
   1.239 +    // Transliterator locking.  Rule-based Transliterators are not thread safe;
   1.240 +    //   concurrent operations on shared rule data must be prevented.
   1.241 +    // A complication: compound transliterators can re-enter this function
   1.242 +    //   recursively, sometimes with a different "this" but always with the same
   1.243 +    //   text, and must not lock twice.  gLockedText records the text of the
   1.244 +    //   transliteration that holds transliteratorDataMutex; every access to it,
   1.245 +    //   read or write, is made under the global ICU mutex (umtx_lock(NULL)).
   1.246 +    // Data exclusively owned by this object cannot be shared between
   1.247 +    //   transliterators, so in that case no locking is needed at all.
   1.248 +    // TODO(andy): Need a better scheme for handling this.
   1.249 +    UBool lockedMutexAtThisLevel = FALSE;
   1.250 +    if (isDataOwned == FALSE) {
   1.251 +        umtx_lock(NULL);
   1.252 +        const UBool needToLock = (&text != gLockedText);
   1.253 +        umtx_unlock(NULL);
   1.254 +        if (needToLock) {
   1.255 +            // Lock order: transliteratorDataMutex first, then the global mutex.
   1.256 +            umtx_lock(&transliteratorDataMutex);
   1.257 +            umtx_lock(NULL);
   1.258 +            gLockedText = &text;
   1.259 +            umtx_unlock(NULL);
   1.260 +            lockedMutexAtThisLevel = TRUE;
   1.261 +        }
   1.262 +    }
   1.263 +
   1.264 +    // Check to make sure we don't dereference a null pointer.
   1.265 +    if (fData != NULL) {
   1.266 +        while (index.start < index.limit &&
   1.267 +               loopCount <= loopLimit &&
   1.268 +               fData->ruleSet.transliterate(text, index, isIncremental)) {
   1.269 +            ++loopCount;
   1.270 +        }
   1.271 +    }
   1.272 +    if (lockedMutexAtThisLevel) {
   1.273 +        umtx_lock(NULL);
   1.274 +        gLockedText = NULL;
   1.275 +        umtx_unlock(NULL);
   1.276 +        umtx_unlock(&transliteratorDataMutex);
   1.277 +    }
   1.278 +}
   1.279 +
   1.280 +UnicodeString&
   1.281 +RuleBasedTransliterator::toRules(UnicodeString& rulesSource, UBool escapeUnprintable) const {
   1.282 +    return fData->ruleSet.toRules(rulesSource, escapeUnprintable); // the rule set does the work
   1.283 +}
   1.284 +
   1.285 +// Implements the Transliterator framework hook: the rule set fills in
   1.286 +// result; its own return value is not needed here.
   1.287 +void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
   1.288 +    const UBool wantTarget = FALSE;
   1.289 +    fData->ruleSet.getSourceTargetSet(result, wantTarget);
   1.290 +}
   1.291 +
   1.292 +// Overrides the Transliterator framework method: the rule set fills in
   1.293 +// result, and the reference it returns is passed back to the caller.
   1.294 +UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
   1.295 +    const UBool wantTarget = TRUE;
   1.296 +    return fData->ruleSet.getSourceTargetSet(result, wantTarget);
   1.297 +}
   1.298 +
   1.299 +U_NAMESPACE_END
   1.300 +
   1.301 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial