intl/icu/source/i18n/rbt_set.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2  **********************************************************************
     3  *   Copyright (C) 1999-2011, International Business Machines
     4  *   Corporation and others.  All Rights Reserved.
     5  **********************************************************************
     6  *   Date        Name        Description
     7  *   11/17/99    aliu        Creation.
     8  **********************************************************************
     9  */
    11 #include "unicode/utypes.h"
    13 #if !UCONFIG_NO_TRANSLITERATION
    15 #include "unicode/unistr.h"
    16 #include "unicode/uniset.h"
    17 #include "unicode/utf16.h"
    18 #include "rbt_set.h"
    19 #include "rbt_rule.h"
    20 #include "cmemory.h"
    21 #include "putilimp.h"
    23 U_CDECL_BEGIN
    24 static void U_CALLCONV _deleteRule(void *rule) {
    25     delete (icu::TransliterationRule *)rule;
    26 }
    27 U_CDECL_END
    29 //----------------------------------------------------------------------
    30 // BEGIN Debugging support
    31 //----------------------------------------------------------------------
    33 // #define DEBUG_RBT
    35 #ifdef DEBUG_RBT
    36 #include <stdio.h>
    37 #include "charstr.h"
    39 /**
    40  * @param appendTo result is appended to this param.
    41  * @param input the string being transliterated
    42  * @param pos the index struct
    43  */
    44 static UnicodeString& _formatInput(UnicodeString &appendTo,
    45                                    const UnicodeString& input,
    46                                    const UTransPosition& pos) {
    47     // Output a string of the form aaa{bbb|ccc|ddd}eee, where
    48     // the {} indicate the context start and limit, and the ||
    49     // indicate the start and limit.
    50     if (0 <= pos.contextStart &&
    51         pos.contextStart <= pos.start &&
    52         pos.start <= pos.limit &&
    53         pos.limit <= pos.contextLimit &&
    54         pos.contextLimit <= input.length()) {
    56         UnicodeString a, b, c, d, e;
    57         input.extractBetween(0, pos.contextStart, a);
    58         input.extractBetween(pos.contextStart, pos.start, b);
    59         input.extractBetween(pos.start, pos.limit, c);
    60         input.extractBetween(pos.limit, pos.contextLimit, d);
    61         input.extractBetween(pos.contextLimit, input.length(), e);
    62         appendTo.append(a).append((UChar)123/*{*/).append(b).
    63             append((UChar)124/*|*/).append(c).append((UChar)124/*|*/).append(d).
    64             append((UChar)125/*}*/).append(e);
    65     } else {
    66         appendTo.append("INVALID UTransPosition");
    67         //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
    68         //                pos.contextStart + ", s=" + pos.start + ", l=" +
    69         //                pos.limit + ", cl=" + pos.contextLimit + "} on " +
    70         //                input);
    71     }
    72     return appendTo;
    73 }
    75 // Append a hex string to the target
    76 UnicodeString& _appendHex(uint32_t number,
    77                           int32_t digits,
    78                           UnicodeString& target) {
    79     static const UChar digitString[] = {
    80         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
    81         0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0
    82     };
    83     while (digits--) {
    84         target += digitString[(number >> (digits*4)) & 0xF];
    85     }
    86     return target;
    87 }
    89 // Replace nonprintable characters with unicode escapes
    90 UnicodeString& _escape(const UnicodeString &source,
    91                        UnicodeString &target) {
    92     for (int32_t i = 0; i < source.length(); ) {
    93         UChar32 ch = source.char32At(i);
    94         i += U16_LENGTH(ch);
    95         if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) {
    96             if (ch <= 0xFFFF) {
    97                 target += "\\u";
    98                 _appendHex(ch, 4, target);
    99             } else {
   100                 target += "\\U";
   101                 _appendHex(ch, 8, target);
   102             }
   103         } else {
   104             target += ch;
   105         }
   106     }
   107     return target;
   108 }
   110 inline void _debugOut(const char* msg, TransliterationRule* rule,
   111                       const Replaceable& theText, UTransPosition& pos) {
   112     UnicodeString buf(msg, "");
   113     if (rule) {
   114         UnicodeString r;
   115         rule->toRule(r, TRUE);
   116         buf.append((UChar)32).append(r);
   117     }
   118     buf.append(UnicodeString(" => ", ""));
   119     UnicodeString* text = (UnicodeString*)&theText;
   120     _formatInput(buf, *text, pos);
   121     UnicodeString esc;
   122     _escape(buf, esc);
   123     CharString cbuf(esc);
   124     printf("%s\n", (const char*) cbuf);
   125 }
   127 #else
   128 #define _debugOut(msg, rule, theText, pos)
   129 #endif
   131 //----------------------------------------------------------------------
   132 // END Debugging support
   133 //----------------------------------------------------------------------
   135 // Fill the precontext and postcontext with the patterns of the rules
   136 // that are masking one another.
   137 static void maskingError(const icu::TransliterationRule& rule1,
   138                          const icu::TransliterationRule& rule2,
   139                          UParseError& parseError) {
   140     icu::UnicodeString r;
   141     int32_t len;
   143     parseError.line = parseError.offset = -1;
   145     // for pre-context
   146     rule1.toRule(r, FALSE);
   147     len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
   148     r.extract(0, len, parseError.preContext);
   149     parseError.preContext[len] = 0;   
   151     //for post-context
   152     r.truncate(0);
   153     rule2.toRule(r, FALSE);
   154     len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
   155     r.extract(0, len, parseError.postContext);
   156     parseError.postContext[len] = 0;   
   157 }
   159 U_NAMESPACE_BEGIN
   161 /**
   162  * Construct a new empty rule set.
   163  */
   164 TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : UMemory() {
   165     ruleVector = new UVector(&_deleteRule, NULL, status);
   166     if (U_FAILURE(status)) {
   167         return;
   168     }
   169     if (ruleVector == NULL) {
   170         status = U_MEMORY_ALLOCATION_ERROR;
   171     }
   172     rules = NULL;
   173     maxContextLength = 0;
   174 }
   176 /**
   177  * Copy constructor.
   178  */
   179 TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) :
   180     UMemory(other),
   181     ruleVector(0),
   182     rules(0),
   183     maxContextLength(other.maxContextLength) {
   185     int32_t i, len;
   186     uprv_memcpy(index, other.index, sizeof(index));
   187     UErrorCode status = U_ZERO_ERROR;
   188     ruleVector = new UVector(&_deleteRule, NULL, status);
   189     if (other.ruleVector != 0 && ruleVector != 0 && U_SUCCESS(status)) {
   190         len = other.ruleVector->size();
   191         for (i=0; i<len && U_SUCCESS(status); ++i) {
   192             TransliterationRule *tempTranslitRule = new TransliterationRule(*(TransliterationRule*)other.ruleVector->elementAt(i));
   193             // Null pointer test
   194             if (tempTranslitRule == NULL) {
   195                 status = U_MEMORY_ALLOCATION_ERROR;
   196                 break;
   197             }
   198             ruleVector->addElement(tempTranslitRule, status);
   199             if (U_FAILURE(status)) {
   200                 break;
   201             }
   202         }
   203     }
   204     if (other.rules != 0 && U_SUCCESS(status)) {
   205         UParseError p;
   206         freeze(p, status);
   207     }
   208 }
   210 /**
   211  * Destructor.
   212  */
   213 TransliterationRuleSet::~TransliterationRuleSet() {
   214     delete ruleVector; // This deletes the contained rules
   215     uprv_free(rules);
   216 }
   218 void TransliterationRuleSet::setData(const TransliterationRuleData* d) {
   219     /**
   220      * We assume that the ruleset has already been frozen.
   221      */
   222     int32_t len = index[256]; // see freeze()
   223     for (int32_t i=0; i<len; ++i) {
   224         rules[i]->setData(d);
   225     }
   226 }
   228 /**
   229  * Return the maximum context length.
   230  * @return the length of the longest preceding context.
   231  */
   232 int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
   233     return maxContextLength;
   234 }
   236 /**
   237  * Add a rule to this set.  Rules are added in order, and order is
   238  * significant.  The last call to this method must be followed by
   239  * a call to <code>freeze()</code> before the rule set is used.
   240  *
   241  * <p>If freeze() has already been called, calling addRule()
   242  * unfreezes the rules, and freeze() must be called again.
   243  *
   244  * @param adoptedRule the rule to add
   245  */
   246 void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
   247                                      UErrorCode& status) {
   248     if (U_FAILURE(status)) {
   249         delete adoptedRule;
   250         return;
   251     }
   252     ruleVector->addElement(adoptedRule, status);
   254     int32_t len;
   255     if ((len = adoptedRule->getContextLength()) > maxContextLength) {
   256         maxContextLength = len;
   257     }
   259     uprv_free(rules);
   260     rules = 0;
   261 }
   263 /**
   264  * Check this for masked rules and index it to optimize performance.
   265  * The sequence of operations is: (1) add rules to a set using
   266  * <code>addRule()</code>; (2) freeze the set using
   267  * <code>freeze()</code>; (3) use the rule set.  If
   268  * <code>addRule()</code> is called after calling this method, it
   269  * invalidates this object, and this method must be called again.
   270  * That is, <code>freeze()</code> may be called multiple times,
   271  * although for optimal performance it shouldn't be.
   272  */
   273 void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) {
   274     /* Construct the rule array and index table.  We reorder the
   275      * rules by sorting them into 256 bins.  Each bin contains all
   276      * rules matching the index value for that bin.  A rule
   277      * matches an index value if string whose first key character
   278      * has a low byte equal to the index value can match the rule.
   279      *
   280      * Each bin contains zero or more rules, in the same order
   281      * they were found originally.  However, the total rules in
   282      * the bins may exceed the number in the original vector,
   283      * since rules that have a variable as their first key
   284      * character will generally fall into more than one bin.
   285      *
   286      * That is, each bin contains all rules that either have that
   287      * first index value as their first key character, or have
   288      * a set containing the index value as their first character.
   289      */
   290     int32_t n = ruleVector->size();
   291     int32_t j;
   292     int16_t x;
   293     UVector v(2*n, status); // heuristic; adjust as needed
   295     if (U_FAILURE(status)) {
   296         return;
   297     }
   299     /* Precompute the index values.  This saves a LOT of time.
   300      * Be careful not to call malloc(0).
   301      */
   302     int16_t* indexValue = (int16_t*) uprv_malloc( sizeof(int16_t) * (n > 0 ? n : 1) );
   303     /* test for NULL */
   304     if (indexValue == 0) {
   305         status = U_MEMORY_ALLOCATION_ERROR;
   306         return;
   307     }
   308     for (j=0; j<n; ++j) {
   309         TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
   310         indexValue[j] = r->getIndexValue();
   311     }
   312     for (x=0; x<256; ++x) {
   313         index[x] = v.size();
   314         for (j=0; j<n; ++j) {
   315             if (indexValue[j] >= 0) {
   316                 if (indexValue[j] == x) {
   317                     v.addElement(ruleVector->elementAt(j), status);
   318                 }
   319             } else {
   320                 // If the indexValue is < 0, then the first key character is
   321                 // a set, and we must use the more time-consuming
   322                 // matchesIndexValue check.  In practice this happens
   323                 // rarely, so we seldom tread this code path.
   324                 TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
   325                 if (r->matchesIndexValue((uint8_t)x)) {
   326                     v.addElement(r, status);
   327                 }
   328             }
   329         }
   330     }
   331     uprv_free(indexValue);
   332     index[256] = v.size();
   334     /* Freeze things into an array.
   335      */
   336     uprv_free(rules); // Contains alias pointers
   338     /* You can't do malloc(0)! */
   339     if (v.size() == 0) {
   340         rules = NULL;
   341         return;
   342     }
   343     rules = (TransliterationRule **)uprv_malloc(v.size() * sizeof(TransliterationRule *));
   344     /* test for NULL */
   345     if (rules == 0) {
   346         status = U_MEMORY_ALLOCATION_ERROR;
   347         return;
   348     }
   349     for (j=0; j<v.size(); ++j) {
   350         rules[j] = (TransliterationRule*) v.elementAt(j);
   351     }
   353     // TODO Add error reporting that indicates the rules that
   354     //      are being masked.
   355     //UnicodeString errors;
   357     /* Check for masking.  This is MUCH faster than our old check,
   358      * which was each rule against each following rule, since we
   359      * only have to check for masking within each bin now.  It's
   360      * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
   361      * count, and n2 is the per-bin rule count.  But n2<<n1, so
   362      * it's a big win.
   363      */
   364     for (x=0; x<256; ++x) {
   365         for (j=index[x]; j<index[x+1]-1; ++j) {
   366             TransliterationRule* r1 = rules[j];
   367             for (int32_t k=j+1; k<index[x+1]; ++k) {
   368                 TransliterationRule* r2 = rules[k];
   369                 if (r1->masks(*r2)) {
   370 //|                 if (errors == null) {
   371 //|                     errors = new StringBuffer();
   372 //|                 } else {
   373 //|                     errors.append("\n");
   374 //|                 }
   375 //|                 errors.append("Rule " + r1 + " masks " + r2);
   376                     status = U_RULE_MASK_ERROR;
   377                     maskingError(*r1, *r2, parseError);
   378                     return;
   379                 }
   380             }
   381         }
   382     }
   384     //if (errors != null) {
   385     //    throw new IllegalArgumentException(errors.toString());
   386     //}
   387 }
   389 /**
   390  * Transliterate the given text with the given UTransPosition
   391  * indices.  Return TRUE if the transliteration should continue
   392  * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
   393  * Note that FALSE is only ever returned if isIncremental is TRUE.
   394  * @param text the text to be transliterated
   395  * @param pos the position indices, which will be updated
   396  * @param incremental if TRUE, assume new text may be inserted
   397  * at index.limit, and return FALSE if thre is a partial match.
   398  * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
   399  * indicating that transliteration should stop until more text
   400  * arrives.
   401  */
   402 UBool TransliterationRuleSet::transliterate(Replaceable& text,
   403                                             UTransPosition& pos,
   404                                             UBool incremental) {
   405     int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);
   406     for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {
   407         UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
   408         switch (m) {
   409         case U_MATCH:
   410             _debugOut("match", rules[i], text, pos);
   411             return TRUE;
   412         case U_PARTIAL_MATCH:
   413             _debugOut("partial match", rules[i], text, pos);
   414             return FALSE;
   415         default: /* Ram: added default to make GCC happy */
   416             break;
   417         }
   418     }
   419     // No match or partial match from any rule
   420     pos.start += U16_LENGTH(text.char32At(pos.start));
   421     _debugOut("no match", NULL, text, pos);
   422     return TRUE;
   423 }
   425 /**
   426  * Create rule strings that represents this rule set.
   427  */
   428 UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
   429                                                UBool escapeUnprintable) const {
   430     int32_t i;
   431     int32_t count = ruleVector->size();
   432     ruleSource.truncate(0);
   433     for (i=0; i<count; ++i) {
   434         if (i != 0) {
   435             ruleSource.append((UChar) 0x000A /*\n*/);
   436         }
   437         TransliterationRule *r =
   438             (TransliterationRule*) ruleVector->elementAt(i);
   439         r->toRule(ruleSource, escapeUnprintable);
   440     }
   441     return ruleSource;
   442 }
   444 /**
   445  * Return the set of all characters that may be modified
   446  * (getTarget=false) or emitted (getTarget=true) by this set.
   447  */
   448 UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,
   449                                UBool getTarget) const
   450 {
   451     result.clear();
   452     int32_t count = ruleVector->size();
   453     for (int32_t i=0; i<count; ++i) {
   454         TransliterationRule* r =
   455             (TransliterationRule*) ruleVector->elementAt(i);
   456         if (getTarget) {
   457             r->addTargetSetTo(result);
   458         } else {
   459             r->addSourceSetTo(result);
   460         }
   461     }
   462     return result;
   463 }
   465 U_NAMESPACE_END
   467 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial