intl/icu/source/i18n/unesctrn.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2  **********************************************************************
     3  *   Copyright (c) 2001-2011, International Business Machines
     4  *   Corporation and others.  All Rights Reserved.
     5  **********************************************************************
     6  *   Date        Name        Description
     7  *   11/19/2001  aliu        Creation.
     8  **********************************************************************
     9  */
    11 #include "unicode/utypes.h"
    13 #if !UCONFIG_NO_TRANSLITERATION
    15 #include "unicode/uchar.h"
    16 #include "unicode/utf16.h"
    17 #include "unesctrn.h"
    18 #include "util.h"
    20 #include "cmemory.h"
    22 U_NAMESPACE_BEGIN
    24 /**
    25  * Special character marking the end of the spec[] array.
    26  */
    27 static const UChar END = 0xFFFF;
    29 // Unicode: "U+10FFFF" hex, min=4, max=6
    30 static const UChar SPEC_Unicode[] = {
    31     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
    32     END
    33 };
    35 // Java: "\\uFFFF" hex, min=4, max=4
    36 static const UChar SPEC_Java[] = {
    37     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    38     END
    39 };
    41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
    42 static const UChar SPEC_C[] = {
    43     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    44     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
    45     END
    46 };
    48 // XML: "&#x10FFFF;" hex, min=1, max=6
    49 static const UChar SPEC_XML[] = {
    50     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
    51     END
    52 };
    54 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
    55 static const UChar SPEC_XML10[] = {
    56     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
    57     END
    58 };
    60 // Perl: "\\x{263A}" hex, min=1, max=6
    61 static const UChar SPEC_Perl[] = {
    62     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
    63     END
    64 };
    66 // All: Java, C, Perl, XML, XML10, Unicode
    67 static const UChar SPEC_Any[] = {
    68     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
    69     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
    70     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
    71     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
    72     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
    73     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
    74     END
    75 };
    77 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
    79 static UChar* copySpec(const UChar* spec) {
    80     int32_t len = 0;
    81     while (spec[len] != END) {
    82         ++len;
    83     }
    84     ++len;
    85     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
    86     // Check for memory allocation error. 
    87     if (result != NULL) {
    88     	uprv_memcpy(result, spec, len*sizeof(result[0]));
    89     }
    90     return result;
    91 }
    93 /**
    94  * Factory methods.  Ignore the context.
    95  */
    96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
    97     return new UnescapeTransliterator(ID, SPEC_Unicode);
    98 }
    99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
   100     return new UnescapeTransliterator(ID, SPEC_Java);
   101 }
   102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
   103     return new UnescapeTransliterator(ID, SPEC_C);
   104 }
   105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
   106     return new UnescapeTransliterator(ID, SPEC_XML);
   107 }
   108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
   109     return new UnescapeTransliterator(ID, SPEC_XML10);
   110 }
   111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
   112     return new UnescapeTransliterator(ID, SPEC_Perl);
   113 }
   114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
   115     return new UnescapeTransliterator(ID, SPEC_Any);
   116 }
   118 /**
   119  * Registers standard variants with the system.  Called by
   120  * Transliterator during initialization.
   121  */
   122 void UnescapeTransliterator::registerIDs() {
   123     Token t = integerToken(0);
   125     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
   127     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
   129     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
   131     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
   133     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
   135     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
   137     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
   138 }
   140 /**
   141  * Constructor.  Takes the encoded spec array.
   142  */
   143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
   144                                                const UChar *newSpec) :
   145     Transliterator(newID, NULL)
   146 {
   147     this->spec = copySpec(newSpec);
   148 }
   150 /**
   151  * Copy constructor.
   152  */
   153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
   154     Transliterator(o) {
   155     this->spec = copySpec(o.spec);
   156 }
   158 UnescapeTransliterator::~UnescapeTransliterator() {
   159     uprv_free(spec);
   160 }
   162 /**
   163  * Transliterator API.
   164  */
   165 Transliterator* UnescapeTransliterator::clone() const {
   166     return new UnescapeTransliterator(*this);
   167 }
   169 /**
   170  * Implements {@link Transliterator#handleTransliterate}.
   171  */
   172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
   173                                                  UBool isIncremental) const {
   174     int32_t start = pos.start;
   175     int32_t limit = pos.limit;
   176     int32_t i, j, ipat;
   178     while (start < limit) {
   179         // Loop over the forms in spec[].  Exit this loop when we
   180         // match one of the specs.  Exit the outer loop if a
   181         // partial match is detected and isIncremental is true.
   182         for (j=0, ipat=0; spec[ipat] != END; ++j) {
   184             // Read the header
   185             int32_t prefixLen = spec[ipat++];
   186             int32_t suffixLen = spec[ipat++];
   187             int8_t  radix     = (int8_t) spec[ipat++];
   188             int32_t minDigits = spec[ipat++];
   189             int32_t maxDigits = spec[ipat++];
   191             // s is a copy of start that is advanced over the
   192             // characters as we parse them.
   193             int32_t s = start;
   194             UBool match = TRUE;
   196             for (i=0; i<prefixLen; ++i) {
   197                 if (s >= limit) {
   198                     if (i > 0) {
   199                         // We've already matched a character.  This is
   200                         // a partial match, so we return if in
   201                         // incremental mode.  In non-incremental mode,
   202                         // go to the next spec.
   203                         if (isIncremental) {
   204                             goto exit;
   205                         }
   206                         match = FALSE;
   207                         break;
   208                     }
   209                 }
   210                 UChar c = text.charAt(s++);
   211                 if (c != spec[ipat + i]) {
   212                     match = FALSE;
   213                     break;
   214                 }
   215             }
   217             if (match) {
   218                 UChar32 u = 0;
   219                 int32_t digitCount = 0;
   220                 for (;;) {
   221                     if (s >= limit) {
   222                         // Check for partial match in incremental mode.
   223                         if (s > start && isIncremental) {
   224                             goto exit;
   225                         }
   226                         break;
   227                     }
   228                     UChar32 ch = text.char32At(s);
   229                     int32_t digit = u_digit(ch, radix);
   230                     if (digit < 0) {
   231                         break;
   232                     }
   233                     s += U16_LENGTH(ch);
   234                     u = (u * radix) + digit;
   235                     if (++digitCount == maxDigits) {
   236                         break;
   237                     }
   238                 }
   240                 match = (digitCount >= minDigits);
   242                 if (match) {
   243                     for (i=0; i<suffixLen; ++i) {
   244                         if (s >= limit) {
   245                             // Check for partial match in incremental mode.
   246                             if (s > start && isIncremental) {
   247                                 goto exit;
   248                             }
   249                             match = FALSE;
   250                             break;
   251                         }
   252                         UChar c = text.charAt(s++);
   253                         if (c != spec[ipat + prefixLen + i]) {
   254                             match = FALSE;
   255                             break;
   256                         }
   257                     }
   259                     if (match) {
   260                         // At this point, we have a match
   261                         UnicodeString str(u);
   262                         text.handleReplaceBetween(start, s, str);
   263                         limit -= s - start - str.length();
   264                         // The following break statement leaves the
   265                         // loop that is traversing the forms in
   266                         // spec[].  We then parse the next input
   267                         // character.
   268                         break;
   269                     }
   270                 }
   271             }
   273             ipat += prefixLen + suffixLen;
   274         }
   276         if (start < limit) {
   277             start += U16_LENGTH(text.char32At(start));
   278         }
   279     }
   281   exit:
   282     pos.contextLimit += limit - pos.limit;
   283     pos.limit = limit;
   284     pos.start = start;
   285 }
   287 U_NAMESPACE_END
   289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   291 //eof

mercurial