The Tor Browser: intl/icu/source/i18n/unesctrn.cpp@fc2d59ddac77

     1 /*

     2  **********************************************************************

     3  *   Copyright (c) 2001-2011, International Business Machines

     4  *   Corporation and others.  All Rights Reserved.

     5  **********************************************************************

     6  *   Date        Name        Description

     7  *   11/19/2001  aliu        Creation.

     8  **********************************************************************

     9  */

    11 #include "unicode/utypes.h"

    13 #if !UCONFIG_NO_TRANSLITERATION

    15 #include "unicode/uchar.h"

    16 #include "unicode/utf16.h"

    17 #include "unesctrn.h"

    18 #include "util.h"

    20 #include "cmemory.h"

    22 U_NAMESPACE_BEGIN

    24 /**

    25  * Special character marking the end of the spec[] array.

    26  */

    27 static const UChar END = 0xFFFF;

    29 // Unicode: "U+10FFFF" hex, min=4, max=6

    30 static const UChar SPEC_Unicode[] = {

    31     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,

    32     END

    33 };

    35 // Java: "\\uFFFF" hex, min=4, max=4

    36 static const UChar SPEC_Java[] = {

    37     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,

    38     END

    39 };

    41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8

    42 static const UChar SPEC_C[] = {

    43     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,

    44     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,

    45     END

    46 };

    48 // XML: "&#x10FFFF;" hex, min=1, max=6

    49 static const UChar SPEC_XML[] = {

    50     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,

    51     END

    52 };

    54 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")

    55 static const UChar SPEC_XML10[] = {

    56     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,

    57     END

    58 };

    60 // Perl: "\\x{263A}" hex, min=1, max=6

    61 static const UChar SPEC_Perl[] = {

    62     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,

    63     END

    64 };

    66 // All: Java, C, Perl, XML, XML10, Unicode

    67 static const UChar SPEC_Any[] = {

    68     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode

    69     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java

    70     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)

    71     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML

    72     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10

    73     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl

    74     END

    75 };

    77 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)

    79 static UChar* copySpec(const UChar* spec) {

    80     int32_t len = 0;

    81     while (spec[len] != END) {

    82         ++len;

    83     }

    84     ++len;

    85     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));

    86     // Check for memory allocation error.

    87     if (result != NULL) {

    88     	uprv_memcpy(result, spec, len*sizeof(result[0]));

    89     }

    90     return result;

    91 }

    93 /**

    94  * Factory methods.  Ignore the context.

    95  */

    96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {

    97     return new UnescapeTransliterator(ID, SPEC_Unicode);

    98 }

    99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {

   100     return new UnescapeTransliterator(ID, SPEC_Java);

   101 }

   102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {

   103     return new UnescapeTransliterator(ID, SPEC_C);

   104 }

   105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {

   106     return new UnescapeTransliterator(ID, SPEC_XML);

   107 }

   108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {

   109     return new UnescapeTransliterator(ID, SPEC_XML10);

   110 }

   111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {

   112     return new UnescapeTransliterator(ID, SPEC_Perl);

   113 }

   114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {

   115     return new UnescapeTransliterator(ID, SPEC_Any);

   116 }

   118 /**

   119  * Registers standard variants with the system.  Called by

   120  * Transliterator during initialization.

   121  */

   122 void UnescapeTransliterator::registerIDs() {

   123     Token t = integerToken(0);

   125     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);

   127     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);

   129     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);

   131     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);

   133     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);

   135     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);

   137     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);

   138 }

   140 /**

   141  * Constructor.  Takes the encoded spec array.

   142  */

   143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,

   144                                                const UChar *newSpec) :

   145     Transliterator(newID, NULL)

   146 {

   147     this->spec = copySpec(newSpec);

   148 }

   150 /**

   151  * Copy constructor.

   152  */

   153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :

   154     Transliterator(o) {

   155     this->spec = copySpec(o.spec);

   156 }

   158 UnescapeTransliterator::~UnescapeTransliterator() {

   159     uprv_free(spec);

   160 }

   162 /**

   163  * Transliterator API.

   164  */

   165 Transliterator* UnescapeTransliterator::clone() const {

   166     return new UnescapeTransliterator(*this);

   167 }

   169 /**

   170  * Implements {@link Transliterator#handleTransliterate}.

   171  */

   172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,

   173                                                  UBool isIncremental) const {

   174     int32_t start = pos.start;

   175     int32_t limit = pos.limit;

   176     int32_t i, j, ipat;

   178     while (start < limit) {

   179         // Loop over the forms in spec[].  Exit this loop when we

   180         // match one of the specs.  Exit the outer loop if a

   181         // partial match is detected and isIncremental is true.

   182         for (j=0, ipat=0; spec[ipat] != END; ++j) {

   184             // Read the header

   185             int32_t prefixLen = spec[ipat++];

   186             int32_t suffixLen = spec[ipat++];

   187             int8_t  radix     = (int8_t) spec[ipat++];

   188             int32_t minDigits = spec[ipat++];

   189             int32_t maxDigits = spec[ipat++];

   191             // s is a copy of start that is advanced over the

   192             // characters as we parse them.

   193             int32_t s = start;

   194             UBool match = TRUE;

   196             for (i=0; i<prefixLen; ++i) {

   197                 if (s >= limit) {

   198                     if (i > 0) {

   199                         // We've already matched a character.  This is

   200                         // a partial match, so we return if in

   201                         // incremental mode.  In non-incremental mode,

   202                         // go to the next spec.

   203                         if (isIncremental) {

   204                             goto exit;

   205                         }

   206                         match = FALSE;

   207                         break;

   208                     }

   209                 }

   210                 UChar c = text.charAt(s++);

   211                 if (c != spec[ipat + i]) {

   212                     match = FALSE;

   213                     break;

   214                 }

   215             }

   217             if (match) {

   218                 UChar32 u = 0;

   219                 int32_t digitCount = 0;

   220                 for (;;) {

   221                     if (s >= limit) {

   222                         // Check for partial match in incremental mode.

   223                         if (s > start && isIncremental) {

   224                             goto exit;

   225                         }

   226                         break;

   227                     }

   228                     UChar32 ch = text.char32At(s);

   229                     int32_t digit = u_digit(ch, radix);

   230                     if (digit < 0) {

   231                         break;

   232                     }

   233                     s += U16_LENGTH(ch);

   234                     u = (u * radix) + digit;

   235                     if (++digitCount == maxDigits) {

   236                         break;

   237                     }

   238                 }

   240                 match = (digitCount >= minDigits);

   242                 if (match) {

   243                     for (i=0; i<suffixLen; ++i) {

   244                         if (s >= limit) {

   245                             // Check for partial match in incremental mode.

   246                             if (s > start && isIncremental) {

   247                                 goto exit;

   248                             }

   249                             match = FALSE;

   250                             break;

   251                         }

   252                         UChar c = text.charAt(s++);

   253                         if (c != spec[ipat + prefixLen + i]) {

   254                             match = FALSE;

   255                             break;

   256                         }

   257                     }

   259                     if (match) {

   260                         // At this point, we have a match

   261                         UnicodeString str(u);

   262                         text.handleReplaceBetween(start, s, str);

   263                         limit -= s - start - str.length();

   264                         // The following break statement leaves the

   265                         // loop that is traversing the forms in

   266                         // spec[].  We then parse the next input

   267                         // character.

   268                         break;

   269                     }

   270                 }

   271             }

   273             ipat += prefixLen + suffixLen;

   274         }

   276         if (start < limit) {

   277             start += U16_LENGTH(text.char32At(start));

   278         }

   279     }

   281   exit:

   282     pos.contextLimit += limit - pos.limit;

   283     pos.limit = limit;

   284     pos.start = start;

   285 }

   287 U_NAMESPACE_END

   289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

   291 //eof

The Tor Browser / file revision

intl/icu/source/i18n/unesctrn.cpp@fc2d59ddac77

intl/icu/source/i18n/unesctrn.cpp