The Tor Browser: intl/icu/source/i18n/rbt

     1 /*

     2  **********************************************************************

     3  *   Copyright (C) 1999-2011, International Business Machines

     4  *   Corporation and others.  All Rights Reserved.

     5  **********************************************************************

     6  *   Date        Name        Description

     7  *   11/17/99    aliu        Creation.

     8  **********************************************************************

     9  */

    11 #include "unicode/utypes.h"

    13 #if !UCONFIG_NO_TRANSLITERATION

    15 #include "unicode/unistr.h"

    16 #include "unicode/uniset.h"

    17 #include "unicode/utf16.h"

    18 #include "rbt_set.h"

    19 #include "rbt_rule.h"

    20 #include "cmemory.h"

    21 #include "putilimp.h"

    23 U_CDECL_BEGIN

    24 static void U_CALLCONV _deleteRule(void *rule) {

    25     delete (icu::TransliterationRule *)rule;

    26 }

    27 U_CDECL_END

    29 //----------------------------------------------------------------------

    30 // BEGIN Debugging support

    31 //----------------------------------------------------------------------

    33 // #define DEBUG_RBT

    35 #ifdef DEBUG_RBT

    36 #include <stdio.h>

    37 #include "charstr.h"

    39 /**

    40  * @param appendTo result is appended to this param.

    41  * @param input the string being transliterated

    42  * @param pos the index struct

    43  */

    44 static UnicodeString& _formatInput(UnicodeString &appendTo,

    45                                    const UnicodeString& input,

    46                                    const UTransPosition& pos) {

    47     // Output a string of the form aaa{bbb|ccc|ddd}eee, where

    48     // the {} indicate the context start and limit, and the ||

    49     // indicate the start and limit.

    50     if (0 <= pos.contextStart &&

    51         pos.contextStart <= pos.start &&

    52         pos.start <= pos.limit &&

    53         pos.limit <= pos.contextLimit &&

    54         pos.contextLimit <= input.length()) {

    56         UnicodeString a, b, c, d, e;

    57         input.extractBetween(0, pos.contextStart, a);

    58         input.extractBetween(pos.contextStart, pos.start, b);

    59         input.extractBetween(pos.start, pos.limit, c);

    60         input.extractBetween(pos.limit, pos.contextLimit, d);

    61         input.extractBetween(pos.contextLimit, input.length(), e);

    62         appendTo.append(a).append((UChar)123/*{*/).append(b).

    63             append((UChar)124/*|*/).append(c).append((UChar)124/*|*/).append(d).

    64             append((UChar)125/*}*/).append(e);

    65     } else {

    66         appendTo.append("INVALID UTransPosition");

    67         //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +

    68         //                pos.contextStart + ", s=" + pos.start + ", l=" +

    69         //                pos.limit + ", cl=" + pos.contextLimit + "} on " +

    70         //                input);

    71     }

    72     return appendTo;

    73 }

    75 // Append a hex string to the target

    76 UnicodeString& _appendHex(uint32_t number,

    77                           int32_t digits,

    78                           UnicodeString& target) {

    79     static const UChar digitString[] = {

    80         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,

    81         0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0

    82     };

    83     while (digits--) {

    84         target += digitString[(number >> (digits*4)) & 0xF];

    85     }

    86     return target;

    87 }

    89 // Replace nonprintable characters with unicode escapes

    90 UnicodeString& _escape(const UnicodeString &source,

    91                        UnicodeString &target) {

    92     for (int32_t i = 0; i < source.length(); ) {

    93         UChar32 ch = source.char32At(i);

    94         i += U16_LENGTH(ch);

    95         if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) {

    96             if (ch <= 0xFFFF) {

    97                 target += "\\u";

    98                 _appendHex(ch, 4, target);

    99             } else {

   100                 target += "\\U";

   101                 _appendHex(ch, 8, target);

   102             }

   103         } else {

   104             target += ch;

   105         }

   106     }

   107     return target;

   108 }

   110 inline void _debugOut(const char* msg, TransliterationRule* rule,

   111                       const Replaceable& theText, UTransPosition& pos) {

   112     UnicodeString buf(msg, "");

   113     if (rule) {

   114         UnicodeString r;

   115         rule->toRule(r, TRUE);

   116         buf.append((UChar)32).append(r);

   117     }

   118     buf.append(UnicodeString(" => ", ""));

   119     UnicodeString* text = (UnicodeString*)&theText;

   120     _formatInput(buf, *text, pos);

   121     UnicodeString esc;

   122     _escape(buf, esc);

   123     CharString cbuf(esc);

   124     printf("%s\n", (const char*) cbuf);

   125 }

   127 #else

   128 #define _debugOut(msg, rule, theText, pos)

   129 #endif

   131 //----------------------------------------------------------------------

   132 // END Debugging support

   133 //----------------------------------------------------------------------

   135 // Fill the precontext and postcontext with the patterns of the rules

   136 // that are masking one another.

   137 static void maskingError(const icu::TransliterationRule& rule1,

   138                          const icu::TransliterationRule& rule2,

   139                          UParseError& parseError) {

   140     icu::UnicodeString r;

   141     int32_t len;

   143     parseError.line = parseError.offset = -1;

   145     // for pre-context

   146     rule1.toRule(r, FALSE);

   147     len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);

   148     r.extract(0, len, parseError.preContext);

   149     parseError.preContext[len] = 0;

   151     //for post-context

   152     r.truncate(0);

   153     rule2.toRule(r, FALSE);

   154     len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);

   155     r.extract(0, len, parseError.postContext);

   156     parseError.postContext[len] = 0;

   157 }

   159 U_NAMESPACE_BEGIN

   161 /**

   162  * Construct a new empty rule set.

   163  */

   164 TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : UMemory() {

   165     ruleVector = new UVector(&_deleteRule, NULL, status);

   166     if (U_FAILURE(status)) {

   167         return;

   168     }

   169     if (ruleVector == NULL) {

   170         status = U_MEMORY_ALLOCATION_ERROR;

   171     }

   172     rules = NULL;

   173     maxContextLength = 0;

   174 }

   176 /**

   177  * Copy constructor.

   178  */

   179 TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) :

   180     UMemory(other),

   181     ruleVector(0),

   182     rules(0),

   183     maxContextLength(other.maxContextLength) {

   185     int32_t i, len;

   186     uprv_memcpy(index, other.index, sizeof(index));

   187     UErrorCode status = U_ZERO_ERROR;

   188     ruleVector = new UVector(&_deleteRule, NULL, status);

   189     if (other.ruleVector != 0 && ruleVector != 0 && U_SUCCESS(status)) {

   190         len = other.ruleVector->size();

   191         for (i=0; i<len && U_SUCCESS(status); ++i) {

   192             TransliterationRule *tempTranslitRule = new TransliterationRule(*(TransliterationRule*)other.ruleVector->elementAt(i));

   193             // Null pointer test

   194             if (tempTranslitRule == NULL) {

   195                 status = U_MEMORY_ALLOCATION_ERROR;

   196                 break;

   197             }

   198             ruleVector->addElement(tempTranslitRule, status);

   199             if (U_FAILURE(status)) {

   200                 break;

   201             }

   202         }

   203     }

   204     if (other.rules != 0 && U_SUCCESS(status)) {

   205         UParseError p;

   206         freeze(p, status);

   207     }

   208 }

   210 /**

   211  * Destructor.

   212  */

   213 TransliterationRuleSet::~TransliterationRuleSet() {

   214     delete ruleVector; // This deletes the contained rules

   215     uprv_free(rules);

   216 }

   218 void TransliterationRuleSet::setData(const TransliterationRuleData* d) {

   219     /**

   220      * We assume that the ruleset has already been frozen.

   221      */

   222     int32_t len = index[256]; // see freeze()

   223     for (int32_t i=0; i<len; ++i) {

   224         rules[i]->setData(d);

   225     }

   226 }

   228 /**

   229  * Return the maximum context length.

   230  * @return the length of the longest preceding context.

   231  */

   232 int32_t TransliterationRuleSet::getMaximumContextLength(void) const {

   233     return maxContextLength;

   234 }

   236 /**

   237  * Add a rule to this set.  Rules are added in order, and order is

   238  * significant.  The last call to this method must be followed by

   239  * a call to <code>freeze()</code> before the rule set is used.

   240  *

   241  * <p>If freeze() has already been called, calling addRule()

   242  * unfreezes the rules, and freeze() must be called again.

   243  *

   244  * @param adoptedRule the rule to add

   245  */

   246 void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,

   247                                      UErrorCode& status) {

   248     if (U_FAILURE(status)) {

   249         delete adoptedRule;

   250         return;

   251     }

   252     ruleVector->addElement(adoptedRule, status);

   254     int32_t len;

   255     if ((len = adoptedRule->getContextLength()) > maxContextLength) {

   256         maxContextLength = len;

   257     }

   259     uprv_free(rules);

   260     rules = 0;

   261 }

   263 /**

   264  * Check this for masked rules and index it to optimize performance.

   265  * The sequence of operations is: (1) add rules to a set using

   266  * <code>addRule()</code>; (2) freeze the set using

   267  * <code>freeze()</code>; (3) use the rule set.  If

   268  * <code>addRule()</code> is called after calling this method, it

   269  * invalidates this object, and this method must be called again.

   270  * That is, <code>freeze()</code> may be called multiple times,

   271  * although for optimal performance it shouldn't be.

   272  */

   273 void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) {

   274     /* Construct the rule array and index table.  We reorder the

   275      * rules by sorting them into 256 bins.  Each bin contains all

   276      * rules matching the index value for that bin.  A rule

   277      * matches an index value if string whose first key character

   278      * has a low byte equal to the index value can match the rule.

   279      *

   280      * Each bin contains zero or more rules, in the same order

   281      * they were found originally.  However, the total rules in

   282      * the bins may exceed the number in the original vector,

   283      * since rules that have a variable as their first key

   284      * character will generally fall into more than one bin.

   285      *

   286      * That is, each bin contains all rules that either have that

   287      * first index value as their first key character, or have

   288      * a set containing the index value as their first character.

   289      */

   290     int32_t n = ruleVector->size();

   291     int32_t j;

   292     int16_t x;

   293     UVector v(2*n, status); // heuristic; adjust as needed

   295     if (U_FAILURE(status)) {

   296         return;

   297     }

   299     /* Precompute the index values.  This saves a LOT of time.

   300      * Be careful not to call malloc(0).

   301      */

   302     int16_t* indexValue = (int16_t*) uprv_malloc( sizeof(int16_t) * (n > 0 ? n : 1) );

   303     /* test for NULL */

   304     if (indexValue == 0) {

   305         status = U_MEMORY_ALLOCATION_ERROR;

   306         return;

   307     }

   308     for (j=0; j<n; ++j) {

   309         TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);

   310         indexValue[j] = r->getIndexValue();

   311     }

   312     for (x=0; x<256; ++x) {

   313         index[x] = v.size();

   314         for (j=0; j<n; ++j) {

   315             if (indexValue[j] >= 0) {

   316                 if (indexValue[j] == x) {

   317                     v.addElement(ruleVector->elementAt(j), status);

   318                 }

   319             } else {

   320                 // If the indexValue is < 0, then the first key character is

   321                 // a set, and we must use the more time-consuming

   322                 // matchesIndexValue check.  In practice this happens

   323                 // rarely, so we seldom tread this code path.

   324                 TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);

   325                 if (r->matchesIndexValue((uint8_t)x)) {

   326                     v.addElement(r, status);

   327                 }

   328             }

   329         }

   330     }

   331     uprv_free(indexValue);

   332     index[256] = v.size();

   334     /* Freeze things into an array.

   335      */

   336     uprv_free(rules); // Contains alias pointers

   338     /* You can't do malloc(0)! */

   339     if (v.size() == 0) {

   340         rules = NULL;

   341         return;

   342     }

   343     rules = (TransliterationRule **)uprv_malloc(v.size() * sizeof(TransliterationRule *));

   344     /* test for NULL */

   345     if (rules == 0) {

   346         status = U_MEMORY_ALLOCATION_ERROR;

   347         return;

   348     }

   349     for (j=0; j<v.size(); ++j) {

   350         rules[j] = (TransliterationRule*) v.elementAt(j);

   351     }

   353     // TODO Add error reporting that indicates the rules that

   354     //      are being masked.

   355     //UnicodeString errors;

   357     /* Check for masking.  This is MUCH faster than our old check,

   358      * which was each rule against each following rule, since we

   359      * only have to check for masking within each bin now.  It's

   360      * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule

   361      * count, and n2 is the per-bin rule count.  But n2<<n1, so

   362      * it's a big win.

   363      */

   364     for (x=0; x<256; ++x) {

   365         for (j=index[x]; j<index[x+1]-1; ++j) {

   366             TransliterationRule* r1 = rules[j];

   367             for (int32_t k=j+1; k<index[x+1]; ++k) {

   368                 TransliterationRule* r2 = rules[k];

   369                 if (r1->masks(*r2)) {

   370 //|                 if (errors == null) {

   371 //|                     errors = new StringBuffer();

   372 //|                 } else {

   373 //|                     errors.append("\n");

   374 //|                 }

   375 //|                 errors.append("Rule " + r1 + " masks " + r2);

   376                     status = U_RULE_MASK_ERROR;

   377                     maskingError(*r1, *r2, parseError);

   378                     return;

   379                 }

   380             }

   381         }

   382     }

   384     //if (errors != null) {

   385     //    throw new IllegalArgumentException(errors.toString());

   386     //}

   387 }

   389 /**

   390  * Transliterate the given text with the given UTransPosition

   391  * indices.  Return TRUE if the transliteration should continue

   392  * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).

   393  * Note that FALSE is only ever returned if isIncremental is TRUE.

   394  * @param text the text to be transliterated

   395  * @param pos the position indices, which will be updated

   396  * @param incremental if TRUE, assume new text may be inserted

   397  * at index.limit, and return FALSE if thre is a partial match.

   398  * @return TRUE unless a U_PARTIAL_MATCH has been obtained,

   399  * indicating that transliteration should stop until more text

   400  * arrives.

   401  */

   402 UBool TransliterationRuleSet::transliterate(Replaceable& text,

   403                                             UTransPosition& pos,

   404                                             UBool incremental) {

   405     int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);

   406     for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {

   407         UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);

   408         switch (m) {

   409         case U_MATCH:

   410             _debugOut("match", rules[i], text, pos);

   411             return TRUE;

   412         case U_PARTIAL_MATCH:

   413             _debugOut("partial match", rules[i], text, pos);

   414             return FALSE;

   415         default: /* Ram: added default to make GCC happy */

   416             break;

   417         }

   418     }

   419     // No match or partial match from any rule

   420     pos.start += U16_LENGTH(text.char32At(pos.start));

   421     _debugOut("no match", NULL, text, pos);

   422     return TRUE;

   423 }

   425 /**

   426  * Create rule strings that represents this rule set.

   427  */

   428 UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,

   429                                                UBool escapeUnprintable) const {

   430     int32_t i;

   431     int32_t count = ruleVector->size();

   432     ruleSource.truncate(0);

   433     for (i=0; i<count; ++i) {

   434         if (i != 0) {

   435             ruleSource.append((UChar) 0x000A /*\n*/);

   436         }

   437         TransliterationRule *r =

   438             (TransliterationRule*) ruleVector->elementAt(i);

   439         r->toRule(ruleSource, escapeUnprintable);

   440     }

   441     return ruleSource;

   442 }

   444 /**

   445  * Return the set of all characters that may be modified

   446  * (getTarget=false) or emitted (getTarget=true) by this set.

   447  */

   448 UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,

   449                                UBool getTarget) const

   450 {

   451     result.clear();

   452     int32_t count = ruleVector->size();

   453     for (int32_t i=0; i<count; ++i) {

   454         TransliterationRule* r =

   455             (TransliterationRule*) ruleVector->elementAt(i);

   456         if (getTarget) {

   457             r->addTargetSetTo(result);

   458         } else {

   459             r->addSourceSetTo(result);

   460         }

   461     }

   462     return result;

   463 }

   465 U_NAMESPACE_END

   467 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

The Tor Browser / file revision

intl/icu/source/i18n/rbt_set.cpp@fc2d59ddac77

intl/icu/source/i18n/rbt_set.cpp