intl/icu/source/i18n/unesctrn.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/unesctrn.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,291 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (c) 2001-2011, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + *   Date        Name        Description
    1.10 + *   11/19/2001  aliu        Creation.
    1.11 + **********************************************************************
    1.12 + */
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "unicode/uchar.h"
    1.19 +#include "unicode/utf16.h"
    1.20 +#include "unesctrn.h"
    1.21 +#include "util.h"
    1.22 +
    1.23 +#include "cmemory.h"
    1.24 +
    1.25 +U_NAMESPACE_BEGIN
    1.26 +
    1.27 +/**
    1.28 + * Special character marking the end of the spec[] array.
    1.29 + */
    1.30 +static const UChar END = 0xFFFF;
    1.31 +
    1.32 +// Unicode: "U+10FFFF" hex, min=4, max=6
    1.33 +static const UChar SPEC_Unicode[] = {
    1.34 +    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
    1.35 +    END
    1.36 +};
    1.37 +
    1.38 +// Java: "\\uFFFF" hex, min=4, max=4
    1.39 +static const UChar SPEC_Java[] = {
    1.40 +    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    1.41 +    END
    1.42 +};
    1.43 +
    1.44 +// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
    1.45 +static const UChar SPEC_C[] = {
    1.46 +    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    1.47 +    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
    1.48 +    END
    1.49 +};
    1.50 +
    1.51 +// XML: "" hex, min=1, max=6
    1.52 +static const UChar SPEC_XML[] = {
    1.53 +    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
    1.54 +    END
    1.55 +};
    1.56 +
    1.57 +// XML10: "" dec, min=1, max=7 (not really "Hex-Any")
    1.58 +static const UChar SPEC_XML10[] = {
    1.59 +    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
    1.60 +    END
    1.61 +};
    1.62 +
    1.63 +// Perl: "\\x{263A}" hex, min=1, max=6
    1.64 +static const UChar SPEC_Perl[] = {
    1.65 +    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
    1.66 +    END
    1.67 +};
    1.68 +
    1.69 +// All: Java, C, Perl, XML, XML10, Unicode
    1.70 +static const UChar SPEC_Any[] = {
    1.71 +    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
    1.72 +    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
    1.73 +    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
    1.74 +    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
    1.75 +    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
    1.76 +    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
    1.77 +    END
    1.78 +};
    1.79 +
    1.80 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
    1.81 +
    1.82 +static UChar* copySpec(const UChar* spec) {
    1.83 +    int32_t len = 0;
    1.84 +    while (spec[len] != END) {
    1.85 +        ++len;
    1.86 +    }
    1.87 +    ++len;
    1.88 +    UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
    1.89 +    // Check for memory allocation error. 
    1.90 +    if (result != NULL) {
    1.91 +    	uprv_memcpy(result, spec, len*sizeof(result[0]));
    1.92 +    }
    1.93 +    return result;
    1.94 +}
    1.95 +
    1.96 +/**
    1.97 + * Factory methods.  Ignore the context.
    1.98 + */
    1.99 +static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
   1.100 +    return new UnescapeTransliterator(ID, SPEC_Unicode);
   1.101 +}
   1.102 +static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
   1.103 +    return new UnescapeTransliterator(ID, SPEC_Java);
   1.104 +}
   1.105 +static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
   1.106 +    return new UnescapeTransliterator(ID, SPEC_C);
   1.107 +}
   1.108 +static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
   1.109 +    return new UnescapeTransliterator(ID, SPEC_XML);
   1.110 +}
   1.111 +static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
   1.112 +    return new UnescapeTransliterator(ID, SPEC_XML10);
   1.113 +}
   1.114 +static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
   1.115 +    return new UnescapeTransliterator(ID, SPEC_Perl);
   1.116 +}
   1.117 +static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
   1.118 +    return new UnescapeTransliterator(ID, SPEC_Any);
   1.119 +}
   1.120 +
   1.121 +/**
   1.122 + * Registers standard variants with the system.  Called by
   1.123 + * Transliterator during initialization.
   1.124 + */
   1.125 +void UnescapeTransliterator::registerIDs() {
   1.126 +    Token t = integerToken(0);
   1.127 +
   1.128 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
   1.129 +
   1.130 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
   1.131 +
   1.132 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
   1.133 +
   1.134 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
   1.135 +
   1.136 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
   1.137 +
   1.138 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
   1.139 +
   1.140 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
   1.141 +}
   1.142 +
   1.143 +/**
   1.144 + * Constructor.  Takes the encoded spec array.
   1.145 + */
   1.146 +UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
   1.147 +                                               const UChar *newSpec) :
   1.148 +    Transliterator(newID, NULL)
   1.149 +{
   1.150 +    this->spec = copySpec(newSpec);
   1.151 +}
   1.152 +
   1.153 +/**
   1.154 + * Copy constructor.
   1.155 + */
   1.156 +UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
   1.157 +    Transliterator(o) {
   1.158 +    this->spec = copySpec(o.spec);
   1.159 +}
   1.160 +
   1.161 +UnescapeTransliterator::~UnescapeTransliterator() {
   1.162 +    uprv_free(spec);
   1.163 +}
   1.164 +
   1.165 +/**
   1.166 + * Transliterator API.
   1.167 + */
   1.168 +Transliterator* UnescapeTransliterator::clone() const {
   1.169 +    return new UnescapeTransliterator(*this);
   1.170 +}
   1.171 +
   1.172 +/**
   1.173 + * Implements {@link Transliterator#handleTransliterate}.
   1.174 + */
   1.175 +void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
   1.176 +                                                 UBool isIncremental) const {
   1.177 +    int32_t start = pos.start;
   1.178 +    int32_t limit = pos.limit;
   1.179 +    int32_t i, j, ipat;
   1.180 +
   1.181 +    while (start < limit) {
   1.182 +        // Loop over the forms in spec[].  Exit this loop when we
   1.183 +        // match one of the specs.  Exit the outer loop if a
   1.184 +        // partial match is detected and isIncremental is true.
   1.185 +        for (j=0, ipat=0; spec[ipat] != END; ++j) {
   1.186 +
   1.187 +            // Read the header
   1.188 +            int32_t prefixLen = spec[ipat++];
   1.189 +            int32_t suffixLen = spec[ipat++];
   1.190 +            int8_t  radix     = (int8_t) spec[ipat++];
   1.191 +            int32_t minDigits = spec[ipat++];
   1.192 +            int32_t maxDigits = spec[ipat++];
   1.193 +
   1.194 +            // s is a copy of start that is advanced over the
   1.195 +            // characters as we parse them.
   1.196 +            int32_t s = start;
   1.197 +            UBool match = TRUE;
   1.198 +
   1.199 +            for (i=0; i<prefixLen; ++i) {
   1.200 +                if (s >= limit) {
   1.201 +                    if (i > 0) {
   1.202 +                        // We've already matched a character.  This is
   1.203 +                        // a partial match, so we return if in
   1.204 +                        // incremental mode.  In non-incremental mode,
   1.205 +                        // go to the next spec.
   1.206 +                        if (isIncremental) {
   1.207 +                            goto exit;
   1.208 +                        }
   1.209 +                        match = FALSE;
   1.210 +                        break;
   1.211 +                    }
   1.212 +                }
   1.213 +                UChar c = text.charAt(s++);
   1.214 +                if (c != spec[ipat + i]) {
   1.215 +                    match = FALSE;
   1.216 +                    break;
   1.217 +                }
   1.218 +            }
   1.219 +
   1.220 +            if (match) {
   1.221 +                UChar32 u = 0;
   1.222 +                int32_t digitCount = 0;
   1.223 +                for (;;) {
   1.224 +                    if (s >= limit) {
   1.225 +                        // Check for partial match in incremental mode.
   1.226 +                        if (s > start && isIncremental) {
   1.227 +                            goto exit;
   1.228 +                        }
   1.229 +                        break;
   1.230 +                    }
   1.231 +                    UChar32 ch = text.char32At(s);
   1.232 +                    int32_t digit = u_digit(ch, radix);
   1.233 +                    if (digit < 0) {
   1.234 +                        break;
   1.235 +                    }
   1.236 +                    s += U16_LENGTH(ch);
   1.237 +                    u = (u * radix) + digit;
   1.238 +                    if (++digitCount == maxDigits) {
   1.239 +                        break;
   1.240 +                    }
   1.241 +                }
   1.242 +
   1.243 +                match = (digitCount >= minDigits);
   1.244 +
   1.245 +                if (match) {
   1.246 +                    for (i=0; i<suffixLen; ++i) {
   1.247 +                        if (s >= limit) {
   1.248 +                            // Check for partial match in incremental mode.
   1.249 +                            if (s > start && isIncremental) {
   1.250 +                                goto exit;
   1.251 +                            }
   1.252 +                            match = FALSE;
   1.253 +                            break;
   1.254 +                        }
   1.255 +                        UChar c = text.charAt(s++);
   1.256 +                        if (c != spec[ipat + prefixLen + i]) {
   1.257 +                            match = FALSE;
   1.258 +                            break;
   1.259 +                        }
   1.260 +                    }
   1.261 +
   1.262 +                    if (match) {
   1.263 +                        // At this point, we have a match
   1.264 +                        UnicodeString str(u);
   1.265 +                        text.handleReplaceBetween(start, s, str);
   1.266 +                        limit -= s - start - str.length();
   1.267 +                        // The following break statement leaves the
   1.268 +                        // loop that is traversing the forms in
   1.269 +                        // spec[].  We then parse the next input
   1.270 +                        // character.
   1.271 +                        break;
   1.272 +                    }
   1.273 +                }
   1.274 +            }
   1.275 +
   1.276 +            ipat += prefixLen + suffixLen;
   1.277 +        }
   1.278 +
   1.279 +        if (start < limit) {
   1.280 +            start += U16_LENGTH(text.char32At(start));
   1.281 +        }
   1.282 +    }
   1.283 +
   1.284 +  exit:
   1.285 +    pos.contextLimit += limit - pos.limit;
   1.286 +    pos.limit = limit;
   1.287 +    pos.start = start;
   1.288 +}
   1.289 +
   1.290 +U_NAMESPACE_END
   1.291 +
   1.292 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.293 +
   1.294 +//eof

mercurial