intl/icu/source/i18n/rbt.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 1999-2013, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   Date        Name        Description
     7 *   11/17/99    aliu        Creation.
     8 **********************************************************************
     9 */
    11 #include "unicode/utypes.h"
    13 #if !UCONFIG_NO_TRANSLITERATION
    15 #include "unicode/rep.h"
    16 #include "unicode/uniset.h"
    17 #include "rbt_pars.h"
    18 #include "rbt_data.h"
    19 #include "rbt_rule.h"
    20 #include "rbt.h"
    21 #include "umutex.h"
    23 U_NAMESPACE_BEGIN
    25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
    27 static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER;
    28 static Replaceable *gLockedText = NULL;
    30 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
    31                                          UTransDirection direction,
    32                                          UParseError& parseError,
    33                                          UErrorCode& status) {
    34     fData = 0;
    35     isDataOwned = TRUE;
    36     if (U_FAILURE(status)) {
    37         return;
    38     }
    40     TransliteratorParser parser(status);
    41     parser.parse(rules, direction, parseError, status);
    42     if (U_FAILURE(status)) {
    43         return;
    44     }
    46     if (parser.idBlockVector.size() != 0 ||
    47         parser.compoundFilter != NULL ||
    48         parser.dataVector.size() == 0) {
    49         status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
    50         return;
    51     }
    53     fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
    54     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
    55 }
    57 /**
    58  * Constructs a new transliterator from the given rules.
    59  * @param id            the id for the transliterator.
    60  * @param rules         rules, separated by ';'
    61  * @param direction     either FORWARD or REVERSE.
    62  * @param adoptedFilter the filter for this transliterator.
    63  * @param parseError    Struct to recieve information on position 
    64  *                      of error if an error is encountered
    65  * @param status        Output param set to success/failure code.
    66  * @exception IllegalArgumentException if rules are malformed
    67  * or direction is invalid.
    68  */
    69 RuleBasedTransliterator::RuleBasedTransliterator(
    70                             const UnicodeString& id,
    71                             const UnicodeString& rules,
    72                             UTransDirection direction,
    73                             UnicodeFilter* adoptedFilter,
    74                             UParseError& parseError,
    75                             UErrorCode& status) :
    76     Transliterator(id, adoptedFilter) {
    77     _construct(rules, direction,parseError,status);
    78 }
    80 /**
    81  * Constructs a new transliterator from the given rules.
    82  * @param id            the id for the transliterator.
    83  * @param rules         rules, separated by ';'
    84  * @param direction     either FORWARD or REVERSE.
    85  * @param adoptedFilter the filter for this transliterator.
    86  * @param status        Output param set to success/failure code.
    87  * @exception IllegalArgumentException if rules are malformed
    88  * or direction is invalid.
    89  */
    90 /*RuleBasedTransliterator::RuleBasedTransliterator(
    91                             const UnicodeString& id,
    92                             const UnicodeString& rules,
    93                             UTransDirection direction,
    94                             UnicodeFilter* adoptedFilter,
    95                             UErrorCode& status) :
    96     Transliterator(id, adoptedFilter) {
    97     UParseError parseError;
    98     _construct(rules, direction,parseError, status);
    99 }*/
   101 /**
   102  * Convenience constructor with no filter.
   103  */
   104 /*RuleBasedTransliterator::RuleBasedTransliterator(
   105                             const UnicodeString& id,
   106                             const UnicodeString& rules,
   107                             UTransDirection direction,
   108                             UErrorCode& status) :
   109     Transliterator(id, 0) {
   110     UParseError parseError;
   111     _construct(rules, direction,parseError, status);
   112 }*/
   114 /**
   115  * Convenience constructor with no filter and FORWARD direction.
   116  */
   117 /*RuleBasedTransliterator::RuleBasedTransliterator(
   118                             const UnicodeString& id,
   119                             const UnicodeString& rules,
   120                             UErrorCode& status) :
   121     Transliterator(id, 0) {
   122     UParseError parseError;
   123     _construct(rules, UTRANS_FORWARD, parseError, status);
   124 }*/
   126 /**
   127  * Convenience constructor with FORWARD direction.
   128  */
   129 /*RuleBasedTransliterator::RuleBasedTransliterator(
   130                             const UnicodeString& id,
   131                             const UnicodeString& rules,
   132                             UnicodeFilter* adoptedFilter,
   133                             UErrorCode& status) :
   134     Transliterator(id, adoptedFilter) {
   135     UParseError parseError;
   136     _construct(rules, UTRANS_FORWARD,parseError, status);
   137 }*/
   139 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
   140                                  const TransliterationRuleData* theData,
   141                                  UnicodeFilter* adoptedFilter) :
   142     Transliterator(id, adoptedFilter),
   143     fData((TransliterationRuleData*)theData), // cast away const
   144     isDataOwned(FALSE) {
   145     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
   146 }
   148 /**
   149  * Internal constructor.
   150  */
   151 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
   152                                                  TransliterationRuleData* theData,
   153                                                  UBool isDataAdopted) :
   154     Transliterator(id, 0),
   155     fData(theData),
   156     isDataOwned(isDataAdopted) {
   157     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
   158 }
   160 /**
   161  * Copy constructor.
   162  */
   163 RuleBasedTransliterator::RuleBasedTransliterator(
   164         const RuleBasedTransliterator& other) :
   165     Transliterator(other), fData(other.fData),
   166     isDataOwned(other.isDataOwned) {
   168     // The data object may or may not be owned.  If it is not owned we
   169     // share it; it is invariant.  If it is owned, it's still
   170     // invariant, but we need to copy it to prevent double-deletion.
   171     // If this becomes a performance issue (if people do a lot of RBT
   172     // copying -- unlikely) we can reference count the data object.
   174     // Only do a deep copy if this is owned data, that is, data that
   175     // will be later deleted.  System transliterators contain
   176     // non-owned data.
   177     if (isDataOwned) {
   178         fData = new TransliterationRuleData(*other.fData);
   179     }
   180 }
   182 /**
   183  * Destructor.
   184  */
   185 RuleBasedTransliterator::~RuleBasedTransliterator() {
   186     // Delete the data object only if we own it.
   187     if (isDataOwned) {
   188         delete fData;
   189     }
   190 }
   192 Transliterator* // Covariant return NOT ALLOWED (for portability)
   193 RuleBasedTransliterator::clone(void) const {
   194     return new RuleBasedTransliterator(*this);
   195 }
   197 /**
   198  * Implements {@link Transliterator#handleTransliterate}.
   199  */
   200 void
   201 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
   202                                              UBool isIncremental) const {
   203     /* We keep contextStart and contextLimit fixed the entire time,
   204      * relative to the text -- contextLimit may move numerically if
   205      * text is inserted or removed.  The start offset moves toward
   206      * limit, with replacements happening under it.
   207      *
   208      * Example: rules 1. ab>x|y
   209      *                2. yc>z
   210      *
   211      * |eabcd   begin - no match, advance start
   212      * e|abcd   match rule 1 - change text & adjust start
   213      * ex|ycd   match rule 2 - change text & adjust start
   214      * exz|d    no match, advance start
   215      * exzd|    done
   216      */
   218     /* A rule like
   219      *   a>b|a
   220      * creates an infinite loop. To prevent that, we put an arbitrary
   221      * limit on the number of iterations that we take, one that is
   222      * high enough that any reasonable rules are ok, but low enough to
   223      * prevent a server from hanging.  The limit is 16 times the
   224      * number of characters n, unless n is so large that 16n exceeds a
   225      * uint32_t.
   226      */
   227     uint32_t loopCount = 0;
   228     uint32_t loopLimit = index.limit - index.start;
   229     if (loopLimit >= 0x10000000) {
   230         loopLimit = 0xFFFFFFFF;
   231     } else {
   232         loopLimit <<= 4;
   233     }
   235     // Transliterator locking.  Rule-based Transliterators are not thread safe; concurrent
   236     //   operations must be prevented.  
   237     // A Complication: compound transliterators can result in recursive entries to this
   238     //   function, sometimes with different "This" objects, always with the same text. 
   239     //   Double-locking must be prevented in these cases.
   240     //   
   242     // If the transliteration data is exclusively owned by this transliterator object,
   243     //   we don't need to do any locking.  No sharing between transliterators is possible,
   244     //   so no concurrent access from multiple threads is possible.
   245     UBool    lockedMutexAtThisLevel = FALSE;
   246     if (isDataOwned == FALSE) {
   247         // Test whether this request is operating on the same text string as some
   248         //   some other transliteration that is still in progress and holding the 
   249         //   transliteration mutex.  If so, do not lock the transliteration
   250         //    mutex again.
   251         // TODO(andy): Need a better scheme for handling this.
   252         UBool needToLock;
   253         umtx_lock(NULL);
   254         needToLock = (&text != gLockedText);
   255         umtx_unlock(NULL);
   256         if (needToLock) {
   257             umtx_lock(&transliteratorDataMutex);
   258             gLockedText = &text;
   259             lockedMutexAtThisLevel = TRUE;
   260         }
   261     }
   263     // Check to make sure we don't dereference a null pointer.
   264     if (fData != NULL) {
   265 	    while (index.start < index.limit &&
   266 	           loopCount <= loopLimit &&
   267 	           fData->ruleSet.transliterate(text, index, isIncremental)) {
   268 	        ++loopCount;
   269 	    }
   270     }
   271     if (lockedMutexAtThisLevel) {
   272         gLockedText = NULL;
   273         umtx_unlock(&transliteratorDataMutex);
   274     }
   275 }
   277 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
   278                                                 UBool escapeUnprintable) const {
   279     return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
   280 }
   282 /**
   283  * Implement Transliterator framework
   284  */
   285 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
   286     fData->ruleSet.getSourceTargetSet(result, FALSE);
   287 }
   289 /**
   290  * Override Transliterator framework
   291  */
   292 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
   293     return fData->ruleSet.getSourceTargetSet(result, TRUE);
   294 }
   296 U_NAMESPACE_END
   298 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial