1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/unesctrn.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,291 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (c) 2001-2011, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + * Date Name Description 1.10 + * 11/19/2001 aliu Creation. 1.11 + ********************************************************************** 1.12 + */ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "unicode/uchar.h" 1.19 +#include "unicode/utf16.h" 1.20 +#include "unesctrn.h" 1.21 +#include "util.h" 1.22 + 1.23 +#include "cmemory.h" 1.24 + 1.25 +U_NAMESPACE_BEGIN 1.26 + 1.27 +/** 1.28 + * Special character marking the end of the spec[] array. 1.29 + */ 1.30 +static const UChar END = 0xFFFF; 1.31 + 1.32 +// Unicode: "U+10FFFF" hex, min=4, max=6 1.33 +static const UChar SPEC_Unicode[] = { 1.34 + 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, 1.35 + END 1.36 +}; 1.37 + 1.38 +// Java: "\\uFFFF" hex, min=4, max=4 1.39 +static const UChar SPEC_Java[] = { 1.40 + 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, 1.41 + END 1.42 +}; 1.43 + 1.44 +// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 1.45 +static const UChar SPEC_C[] = { 1.46 + 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, 1.47 + 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, 1.48 + END 1.49 +}; 1.50 + 1.51 +// XML: "" hex, min=1, max=6 1.52 +static const UChar SPEC_XML[] = { 1.53 + 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, 1.54 + END 1.55 +}; 1.56 + 1.57 +// XML10: "" dec, min=1, max=7 (not really "Hex-Any") 1.58 +static const UChar SPEC_XML10[] = { 1.59 + 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, 1.60 + END 1.61 +}; 1.62 + 1.63 +// Perl: "\\x{263A}" hex, min=1, max=6 1.64 +static const UChar SPEC_Perl[] = { 1.65 + 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, 1.66 + END 1.67 +}; 1.68 + 1.69 +// All: Java, C, Perl, XML, XML10, Unicode 1.70 +static const UChar SPEC_Any[] = { 1.71 + 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode 1.72 + 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java 1.73 + 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates) 1.74 + 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML 1.75 + 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10 1.76 + 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl 1.77 + END 1.78 +}; 1.79 + 1.80 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) 1.81 + 1.82 +static UChar* copySpec(const UChar* spec) { 1.83 + int32_t len = 0; 1.84 + while (spec[len] != END) { 1.85 + ++len; 1.86 + } 1.87 + ++len; 1.88 + UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar)); 1.89 + // Check for memory allocation error. 1.90 + if (result != NULL) { 1.91 + uprv_memcpy(result, spec, len*sizeof(result[0])); 1.92 + } 1.93 + return result; 1.94 +} 1.95 + 1.96 +/** 1.97 + * Factory methods. Ignore the context. 1.98 + */ 1.99 +static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { 1.100 + return new UnescapeTransliterator(ID, SPEC_Unicode); 1.101 +} 1.102 +static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) { 1.103 + return new UnescapeTransliterator(ID, SPEC_Java); 1.104 +} 1.105 +static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) { 1.106 + return new UnescapeTransliterator(ID, SPEC_C); 1.107 +} 1.108 +static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) { 1.109 + return new UnescapeTransliterator(ID, SPEC_XML); 1.110 +} 1.111 +static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { 1.112 + return new UnescapeTransliterator(ID, SPEC_XML10); 1.113 +} 1.114 +static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { 1.115 + return new UnescapeTransliterator(ID, SPEC_Perl); 1.116 +} 1.117 +static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) { 1.118 + return new UnescapeTransliterator(ID, SPEC_Any); 1.119 +} 1.120 + 1.121 +/** 1.122 + * Registers standard variants with the system. Called by 1.123 + * Transliterator during initialization. 1.124 + */ 1.125 +void UnescapeTransliterator::registerIDs() { 1.126 + Token t = integerToken(0); 1.127 + 1.128 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t); 1.129 + 1.130 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t); 1.131 + 1.132 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t); 1.133 + 1.134 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t); 1.135 + 1.136 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t); 1.137 + 1.138 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t); 1.139 + 1.140 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t); 1.141 +} 1.142 + 1.143 +/** 1.144 + * Constructor. Takes the encoded spec array. 1.145 + */ 1.146 +UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID, 1.147 + const UChar *newSpec) : 1.148 + Transliterator(newID, NULL) 1.149 +{ 1.150 + this->spec = copySpec(newSpec); 1.151 +} 1.152 + 1.153 +/** 1.154 + * Copy constructor. 1.155 + */ 1.156 +UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) : 1.157 + Transliterator(o) { 1.158 + this->spec = copySpec(o.spec); 1.159 +} 1.160 + 1.161 +UnescapeTransliterator::~UnescapeTransliterator() { 1.162 + uprv_free(spec); 1.163 +} 1.164 + 1.165 +/** 1.166 + * Transliterator API. 1.167 + */ 1.168 +Transliterator* UnescapeTransliterator::clone() const { 1.169 + return new UnescapeTransliterator(*this); 1.170 +} 1.171 + 1.172 +/** 1.173 + * Implements {@link Transliterator#handleTransliterate}. 1.174 + */ 1.175 +void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, 1.176 + UBool isIncremental) const { 1.177 + int32_t start = pos.start; 1.178 + int32_t limit = pos.limit; 1.179 + int32_t i, j, ipat; 1.180 + 1.181 + while (start < limit) { 1.182 + // Loop over the forms in spec[]. Exit this loop when we 1.183 + // match one of the specs. Exit the outer loop if a 1.184 + // partial match is detected and isIncremental is true. 1.185 + for (j=0, ipat=0; spec[ipat] != END; ++j) { 1.186 + 1.187 + // Read the header 1.188 + int32_t prefixLen = spec[ipat++]; 1.189 + int32_t suffixLen = spec[ipat++]; 1.190 + int8_t radix = (int8_t) spec[ipat++]; 1.191 + int32_t minDigits = spec[ipat++]; 1.192 + int32_t maxDigits = spec[ipat++]; 1.193 + 1.194 + // s is a copy of start that is advanced over the 1.195 + // characters as we parse them. 1.196 + int32_t s = start; 1.197 + UBool match = TRUE; 1.198 + 1.199 + for (i=0; i<prefixLen; ++i) { 1.200 + if (s >= limit) { 1.201 + if (i > 0) { 1.202 + // We've already matched a character. This is 1.203 + // a partial match, so we return if in 1.204 + // incremental mode. In non-incremental mode, 1.205 + // go to the next spec. 1.206 + if (isIncremental) { 1.207 + goto exit; 1.208 + } 1.209 + match = FALSE; 1.210 + break; 1.211 + } 1.212 + } 1.213 + UChar c = text.charAt(s++); 1.214 + if (c != spec[ipat + i]) { 1.215 + match = FALSE; 1.216 + break; 1.217 + } 1.218 + } 1.219 + 1.220 + if (match) { 1.221 + UChar32 u = 0; 1.222 + int32_t digitCount = 0; 1.223 + for (;;) { 1.224 + if (s >= limit) { 1.225 + // Check for partial match in incremental mode. 1.226 + if (s > start && isIncremental) { 1.227 + goto exit; 1.228 + } 1.229 + break; 1.230 + } 1.231 + UChar32 ch = text.char32At(s); 1.232 + int32_t digit = u_digit(ch, radix); 1.233 + if (digit < 0) { 1.234 + break; 1.235 + } 1.236 + s += U16_LENGTH(ch); 1.237 + u = (u * radix) + digit; 1.238 + if (++digitCount == maxDigits) { 1.239 + break; 1.240 + } 1.241 + } 1.242 + 1.243 + match = (digitCount >= minDigits); 1.244 + 1.245 + if (match) { 1.246 + for (i=0; i<suffixLen; ++i) { 1.247 + if (s >= limit) { 1.248 + // Check for partial match in incremental mode. 1.249 + if (s > start && isIncremental) { 1.250 + goto exit; 1.251 + } 1.252 + match = FALSE; 1.253 + break; 1.254 + } 1.255 + UChar c = text.charAt(s++); 1.256 + if (c != spec[ipat + prefixLen + i]) { 1.257 + match = FALSE; 1.258 + break; 1.259 + } 1.260 + } 1.261 + 1.262 + if (match) { 1.263 + // At this point, we have a match 1.264 + UnicodeString str(u); 1.265 + text.handleReplaceBetween(start, s, str); 1.266 + limit -= s - start - str.length(); 1.267 + // The following break statement leaves the 1.268 + // loop that is traversing the forms in 1.269 + // spec[]. We then parse the next input 1.270 + // character. 1.271 + break; 1.272 + } 1.273 + } 1.274 + } 1.275 + 1.276 + ipat += prefixLen + suffixLen; 1.277 + } 1.278 + 1.279 + if (start < limit) { 1.280 + start += U16_LENGTH(text.char32At(start)); 1.281 + } 1.282 + } 1.283 + 1.284 + exit: 1.285 + pos.contextLimit += limit - pos.limit; 1.286 + pos.limit = limit; 1.287 + pos.start = start; 1.288 +} 1.289 + 1.290 +U_NAMESPACE_END 1.291 + 1.292 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.293 + 1.294 +//eof