intl/icu/source/i18n/unesctrn.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (c) 2001-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 11/19/2001 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_TRANSLITERATION
michael@0 14
michael@0 15 #include "unicode/uchar.h"
michael@0 16 #include "unicode/utf16.h"
michael@0 17 #include "unesctrn.h"
michael@0 18 #include "util.h"
michael@0 19
michael@0 20 #include "cmemory.h"
michael@0 21
michael@0 22 U_NAMESPACE_BEGIN
michael@0 23
michael@0 24 /**
michael@0 25 * Special character marking the end of the spec[] array.
michael@0 26 */
michael@0 27 static const UChar END = 0xFFFF;
michael@0 28
michael@0 29 // Unicode: "U+10FFFF" hex, min=4, max=6
michael@0 30 static const UChar SPEC_Unicode[] = {
michael@0 31 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
michael@0 32 END
michael@0 33 };
michael@0 34
michael@0 35 // Java: "\\uFFFF" hex, min=4, max=4
michael@0 36 static const UChar SPEC_Java[] = {
michael@0 37 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
michael@0 38 END
michael@0 39 };
michael@0 40
michael@0 41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
michael@0 42 static const UChar SPEC_C[] = {
michael@0 43 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
michael@0 44 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
michael@0 45 END
michael@0 46 };
michael@0 47
michael@0 48 // XML: "&#x10FFFF;" hex, min=1, max=6
michael@0 49 static const UChar SPEC_XML[] = {
michael@0 50 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
michael@0 51 END
michael@0 52 };
michael@0 53
michael@0 54 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
michael@0 55 static const UChar SPEC_XML10[] = {
michael@0 56 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
michael@0 57 END
michael@0 58 };
michael@0 59
michael@0 60 // Perl: "\\x{263A}" hex, min=1, max=6
michael@0 61 static const UChar SPEC_Perl[] = {
michael@0 62 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
michael@0 63 END
michael@0 64 };
michael@0 65
michael@0 66 // All: Java, C, Perl, XML, XML10, Unicode
michael@0 67 static const UChar SPEC_Any[] = {
michael@0 68 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode
michael@0 69 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java
michael@0 70 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates)
michael@0 71 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML
michael@0 72 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10
michael@0 73 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
michael@0 74 END
michael@0 75 };
michael@0 76
michael@0 77 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
michael@0 78
michael@0 79 static UChar* copySpec(const UChar* spec) {
michael@0 80 int32_t len = 0;
michael@0 81 while (spec[len] != END) {
michael@0 82 ++len;
michael@0 83 }
michael@0 84 ++len;
michael@0 85 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
michael@0 86 // Check for memory allocation error.
michael@0 87 if (result != NULL) {
michael@0 88 uprv_memcpy(result, spec, len*sizeof(result[0]));
michael@0 89 }
michael@0 90 return result;
michael@0 91 }
michael@0 92
michael@0 93 /**
michael@0 94 * Factory methods. Ignore the context.
michael@0 95 */
michael@0 96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
michael@0 97 return new UnescapeTransliterator(ID, SPEC_Unicode);
michael@0 98 }
michael@0 99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
michael@0 100 return new UnescapeTransliterator(ID, SPEC_Java);
michael@0 101 }
michael@0 102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
michael@0 103 return new UnescapeTransliterator(ID, SPEC_C);
michael@0 104 }
michael@0 105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
michael@0 106 return new UnescapeTransliterator(ID, SPEC_XML);
michael@0 107 }
michael@0 108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
michael@0 109 return new UnescapeTransliterator(ID, SPEC_XML10);
michael@0 110 }
michael@0 111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
michael@0 112 return new UnescapeTransliterator(ID, SPEC_Perl);
michael@0 113 }
michael@0 114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
michael@0 115 return new UnescapeTransliterator(ID, SPEC_Any);
michael@0 116 }
michael@0 117
michael@0 118 /**
michael@0 119 * Registers standard variants with the system. Called by
michael@0 120 * Transliterator during initialization.
michael@0 121 */
michael@0 122 void UnescapeTransliterator::registerIDs() {
michael@0 123 Token t = integerToken(0);
michael@0 124
michael@0 125 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
michael@0 126
michael@0 127 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
michael@0 128
michael@0 129 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
michael@0 130
michael@0 131 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
michael@0 132
michael@0 133 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
michael@0 134
michael@0 135 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
michael@0 136
michael@0 137 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
michael@0 138 }
michael@0 139
michael@0 140 /**
michael@0 141 * Constructor. Takes the encoded spec array.
michael@0 142 */
michael@0 143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
michael@0 144 const UChar *newSpec) :
michael@0 145 Transliterator(newID, NULL)
michael@0 146 {
michael@0 147 this->spec = copySpec(newSpec);
michael@0 148 }
michael@0 149
michael@0 150 /**
michael@0 151 * Copy constructor.
michael@0 152 */
michael@0 153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
michael@0 154 Transliterator(o) {
michael@0 155 this->spec = copySpec(o.spec);
michael@0 156 }
michael@0 157
michael@0 158 UnescapeTransliterator::~UnescapeTransliterator() {
michael@0 159 uprv_free(spec);
michael@0 160 }
michael@0 161
michael@0 162 /**
michael@0 163 * Transliterator API.
michael@0 164 */
michael@0 165 Transliterator* UnescapeTransliterator::clone() const {
michael@0 166 return new UnescapeTransliterator(*this);
michael@0 167 }
michael@0 168
michael@0 169 /**
michael@0 170 * Implements {@link Transliterator#handleTransliterate}.
michael@0 171 */
michael@0 172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
michael@0 173 UBool isIncremental) const {
michael@0 174 int32_t start = pos.start;
michael@0 175 int32_t limit = pos.limit;
michael@0 176 int32_t i, j, ipat;
michael@0 177
michael@0 178 while (start < limit) {
michael@0 179 // Loop over the forms in spec[]. Exit this loop when we
michael@0 180 // match one of the specs. Exit the outer loop if a
michael@0 181 // partial match is detected and isIncremental is true.
michael@0 182 for (j=0, ipat=0; spec[ipat] != END; ++j) {
michael@0 183
michael@0 184 // Read the header
michael@0 185 int32_t prefixLen = spec[ipat++];
michael@0 186 int32_t suffixLen = spec[ipat++];
michael@0 187 int8_t radix = (int8_t) spec[ipat++];
michael@0 188 int32_t minDigits = spec[ipat++];
michael@0 189 int32_t maxDigits = spec[ipat++];
michael@0 190
michael@0 191 // s is a copy of start that is advanced over the
michael@0 192 // characters as we parse them.
michael@0 193 int32_t s = start;
michael@0 194 UBool match = TRUE;
michael@0 195
michael@0 196 for (i=0; i<prefixLen; ++i) {
michael@0 197 if (s >= limit) {
michael@0 198 if (i > 0) {
michael@0 199 // We've already matched a character. This is
michael@0 200 // a partial match, so we return if in
michael@0 201 // incremental mode. In non-incremental mode,
michael@0 202 // go to the next spec.
michael@0 203 if (isIncremental) {
michael@0 204 goto exit;
michael@0 205 }
michael@0 206 match = FALSE;
michael@0 207 break;
michael@0 208 }
michael@0 209 }
michael@0 210 UChar c = text.charAt(s++);
michael@0 211 if (c != spec[ipat + i]) {
michael@0 212 match = FALSE;
michael@0 213 break;
michael@0 214 }
michael@0 215 }
michael@0 216
michael@0 217 if (match) {
michael@0 218 UChar32 u = 0;
michael@0 219 int32_t digitCount = 0;
michael@0 220 for (;;) {
michael@0 221 if (s >= limit) {
michael@0 222 // Check for partial match in incremental mode.
michael@0 223 if (s > start && isIncremental) {
michael@0 224 goto exit;
michael@0 225 }
michael@0 226 break;
michael@0 227 }
michael@0 228 UChar32 ch = text.char32At(s);
michael@0 229 int32_t digit = u_digit(ch, radix);
michael@0 230 if (digit < 0) {
michael@0 231 break;
michael@0 232 }
michael@0 233 s += U16_LENGTH(ch);
michael@0 234 u = (u * radix) + digit;
michael@0 235 if (++digitCount == maxDigits) {
michael@0 236 break;
michael@0 237 }
michael@0 238 }
michael@0 239
michael@0 240 match = (digitCount >= minDigits);
michael@0 241
michael@0 242 if (match) {
michael@0 243 for (i=0; i<suffixLen; ++i) {
michael@0 244 if (s >= limit) {
michael@0 245 // Check for partial match in incremental mode.
michael@0 246 if (s > start && isIncremental) {
michael@0 247 goto exit;
michael@0 248 }
michael@0 249 match = FALSE;
michael@0 250 break;
michael@0 251 }
michael@0 252 UChar c = text.charAt(s++);
michael@0 253 if (c != spec[ipat + prefixLen + i]) {
michael@0 254 match = FALSE;
michael@0 255 break;
michael@0 256 }
michael@0 257 }
michael@0 258
michael@0 259 if (match) {
michael@0 260 // At this point, we have a match
michael@0 261 UnicodeString str(u);
michael@0 262 text.handleReplaceBetween(start, s, str);
michael@0 263 limit -= s - start - str.length();
michael@0 264 // The following break statement leaves the
michael@0 265 // loop that is traversing the forms in
michael@0 266 // spec[]. We then parse the next input
michael@0 267 // character.
michael@0 268 break;
michael@0 269 }
michael@0 270 }
michael@0 271 }
michael@0 272
michael@0 273 ipat += prefixLen + suffixLen;
michael@0 274 }
michael@0 275
michael@0 276 if (start < limit) {
michael@0 277 start += U16_LENGTH(text.char32At(start));
michael@0 278 }
michael@0 279 }
michael@0 280
michael@0 281 exit:
michael@0 282 pos.contextLimit += limit - pos.limit;
michael@0 283 pos.limit = limit;
michael@0 284 pos.start = start;
michael@0 285 }
michael@0 286
michael@0 287 U_NAMESPACE_END
michael@0 288
michael@0 289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0 290
michael@0 291 //eof

mercurial