michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (c) 2001-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 11/19/2001 aliu Creation. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/utf16.h" michael@0: #include "unesctrn.h" michael@0: #include "util.h" michael@0: michael@0: #include "cmemory.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /** michael@0: * Special character marking the end of the spec[] array. michael@0: */ michael@0: static const UChar END = 0xFFFF; michael@0: michael@0: // Unicode: "U+10FFFF" hex, min=4, max=6 michael@0: static const UChar SPEC_Unicode[] = { michael@0: 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, michael@0: END michael@0: }; michael@0: michael@0: // Java: "\\uFFFF" hex, min=4, max=4 michael@0: static const UChar SPEC_Java[] = { michael@0: 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, michael@0: END michael@0: }; michael@0: michael@0: // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 michael@0: static const UChar SPEC_C[] = { michael@0: 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, michael@0: 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, michael@0: END michael@0: }; michael@0: michael@0: // XML: "􏿿" hex, min=1, max=6 michael@0: static const UChar SPEC_XML[] = { michael@0: 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, michael@0: END michael@0: }; michael@0: michael@0: // XML10: "􏿿" dec, min=1, max=7 (not really "Hex-Any") michael@0: static const UChar SPEC_XML10[] = { michael@0: 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, michael@0: END michael@0: }; michael@0: michael@0: // Perl: "\\x{263A}" hex, min=1, max=6 michael@0: static const UChar SPEC_Perl[] = { michael@0: 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, michael@0: END michael@0: }; michael@0: michael@0: // All: Java, C, Perl, XML, XML10, Unicode michael@0: static const UChar SPEC_Any[] = { michael@0: 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode michael@0: 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java michael@0: 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates) michael@0: 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML michael@0: 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10 michael@0: 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl michael@0: END michael@0: }; michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) michael@0: michael@0: static UChar* copySpec(const UChar* spec) { michael@0: int32_t len = 0; michael@0: while (spec[len] != END) { michael@0: ++len; michael@0: } michael@0: ++len; michael@0: UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar)); michael@0: // Check for memory allocation error. michael@0: if (result != NULL) { michael@0: uprv_memcpy(result, spec, len*sizeof(result[0])); michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Factory methods. Ignore the context. michael@0: */ michael@0: static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { michael@0: return new UnescapeTransliterator(ID, SPEC_Unicode); michael@0: } michael@0: static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) { michael@0: return new UnescapeTransliterator(ID, SPEC_Java); michael@0: } michael@0: static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) { michael@0: return new UnescapeTransliterator(ID, SPEC_C); michael@0: } michael@0: static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) { michael@0: return new UnescapeTransliterator(ID, SPEC_XML); michael@0: } michael@0: static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { michael@0: return new UnescapeTransliterator(ID, SPEC_XML10); michael@0: } michael@0: static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { michael@0: return new UnescapeTransliterator(ID, SPEC_Perl); michael@0: } michael@0: static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) { michael@0: return new UnescapeTransliterator(ID, SPEC_Any); michael@0: } michael@0: michael@0: /** michael@0: * Registers standard variants with the system. Called by michael@0: * Transliterator during initialization. michael@0: */ michael@0: void UnescapeTransliterator::registerIDs() { michael@0: Token t = integerToken(0); michael@0: michael@0: Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t); michael@0: michael@0: Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t); michael@0: michael@0: Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t); michael@0: michael@0: Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t); michael@0: michael@0: Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t); michael@0: michael@0: Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t); michael@0: michael@0: Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t); michael@0: } michael@0: michael@0: /** michael@0: * Constructor. Takes the encoded spec array. michael@0: */ michael@0: UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID, michael@0: const UChar *newSpec) : michael@0: Transliterator(newID, NULL) michael@0: { michael@0: this->spec = copySpec(newSpec); michael@0: } michael@0: michael@0: /** michael@0: * Copy constructor. michael@0: */ michael@0: UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) : michael@0: Transliterator(o) { michael@0: this->spec = copySpec(o.spec); michael@0: } michael@0: michael@0: UnescapeTransliterator::~UnescapeTransliterator() { michael@0: uprv_free(spec); michael@0: } michael@0: michael@0: /** michael@0: * Transliterator API. michael@0: */ michael@0: Transliterator* UnescapeTransliterator::clone() const { michael@0: return new UnescapeTransliterator(*this); michael@0: } michael@0: michael@0: /** michael@0: * Implements {@link Transliterator#handleTransliterate}. michael@0: */ michael@0: void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, michael@0: UBool isIncremental) const { michael@0: int32_t start = pos.start; michael@0: int32_t limit = pos.limit; michael@0: int32_t i, j, ipat; michael@0: michael@0: while (start < limit) { michael@0: // Loop over the forms in spec[]. Exit this loop when we michael@0: // match one of the specs. Exit the outer loop if a michael@0: // partial match is detected and isIncremental is true. michael@0: for (j=0, ipat=0; spec[ipat] != END; ++j) { michael@0: michael@0: // Read the header michael@0: int32_t prefixLen = spec[ipat++]; michael@0: int32_t suffixLen = spec[ipat++]; michael@0: int8_t radix = (int8_t) spec[ipat++]; michael@0: int32_t minDigits = spec[ipat++]; michael@0: int32_t maxDigits = spec[ipat++]; michael@0: michael@0: // s is a copy of start that is advanced over the michael@0: // characters as we parse them. michael@0: int32_t s = start; michael@0: UBool match = TRUE; michael@0: michael@0: for (i=0; i= limit) { michael@0: if (i > 0) { michael@0: // We've already matched a character. This is michael@0: // a partial match, so we return if in michael@0: // incremental mode. In non-incremental mode, michael@0: // go to the next spec. michael@0: if (isIncremental) { michael@0: goto exit; michael@0: } michael@0: match = FALSE; michael@0: break; michael@0: } michael@0: } michael@0: UChar c = text.charAt(s++); michael@0: if (c != spec[ipat + i]) { michael@0: match = FALSE; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (match) { michael@0: UChar32 u = 0; michael@0: int32_t digitCount = 0; michael@0: for (;;) { michael@0: if (s >= limit) { michael@0: // Check for partial match in incremental mode. michael@0: if (s > start && isIncremental) { michael@0: goto exit; michael@0: } michael@0: break; michael@0: } michael@0: UChar32 ch = text.char32At(s); michael@0: int32_t digit = u_digit(ch, radix); michael@0: if (digit < 0) { michael@0: break; michael@0: } michael@0: s += U16_LENGTH(ch); michael@0: u = (u * radix) + digit; michael@0: if (++digitCount == maxDigits) { michael@0: break; michael@0: } michael@0: } michael@0: michael@0: match = (digitCount >= minDigits); michael@0: michael@0: if (match) { michael@0: for (i=0; i= limit) { michael@0: // Check for partial match in incremental mode. michael@0: if (s > start && isIncremental) { michael@0: goto exit; michael@0: } michael@0: match = FALSE; michael@0: break; michael@0: } michael@0: UChar c = text.charAt(s++); michael@0: if (c != spec[ipat + prefixLen + i]) { michael@0: match = FALSE; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (match) { michael@0: // At this point, we have a match michael@0: UnicodeString str(u); michael@0: text.handleReplaceBetween(start, s, str); michael@0: limit -= s - start - str.length(); michael@0: // The following break statement leaves the michael@0: // loop that is traversing the forms in michael@0: // spec[]. We then parse the next input michael@0: // character. michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: ipat += prefixLen + suffixLen; michael@0: } michael@0: michael@0: if (start < limit) { michael@0: start += U16_LENGTH(text.char32At(start)); michael@0: } michael@0: } michael@0: michael@0: exit: michael@0: pos.contextLimit += limit - pos.limit; michael@0: pos.limit = limit; michael@0: pos.start = start; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */ michael@0: michael@0: //eof