michael@0: /*
michael@0:  **********************************************************************
michael@0:  *   Copyright (c) 2001-2011, International Business Machines
michael@0:  *   Corporation and others.  All Rights Reserved.
michael@0:  **********************************************************************
michael@0:  *   Date        Name        Description
michael@0:  *   11/19/2001  aliu        Creation.
michael@0:  **********************************************************************
michael@0:  */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #if !UCONFIG_NO_TRANSLITERATION
michael@0: 
michael@0: #include "unicode/uchar.h"
michael@0: #include "unicode/utf16.h"
michael@0: #include "unesctrn.h"
michael@0: #include "util.h"
michael@0: 
michael@0: #include "cmemory.h"
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: /**
michael@0:  * Special character marking the end of the spec[] array.
michael@0:  * copySpec() and handleTransliterate() both scan a spec[] array until
michael@0:  * they reach this value, so every array has to finish with it.
michael@0:  */
michael@0: static const UChar END = 0xFFFF;
michael@0: 
michael@0: // Unicode: "U+10FFFF" -- prefix "U+", no suffix, 4 to 6 hex digits
michael@0: static const UChar SPEC_Unicode[] = {
michael@0:     /*prefixLen*/ 2, /*suffixLen*/ 0, /*radix*/ 16, /*minDigits*/ 4, /*maxDigits*/ 6,
michael@0:     0x55/*U*/, 0x2B/*+*/,   // prefix
michael@0:     END
michael@0: };
michael@0: 
michael@0: // Java: "\\uFFFF" -- prefix backslash + 'u', no suffix, exactly 4 hex digits
michael@0: static const UChar SPEC_Java[] = {
michael@0:     /*prefixLen*/ 2, /*suffixLen*/ 0, /*radix*/ 16, /*minDigits*/ 4, /*maxDigits*/ 4,
michael@0:     0x5C/*\*/, 0x75/*u*/,   // prefix
michael@0:     END
michael@0: };
michael@0: 
michael@0: // C: two forms, both without a suffix --
michael@0: //   "\\uFFFF"      backslash + 'u', exactly 4 hex digits
michael@0: //   "\\U0010FFFF"  backslash + 'U', exactly 8 hex digits
michael@0: static const UChar SPEC_C[] = {
michael@0:     /*prefixLen*/ 2, /*suffixLen*/ 0, /*radix*/ 16, /*minDigits*/ 4, /*maxDigits*/ 4,
michael@0:     0x5C/*\*/, 0x75/*u*/,   // prefix
michael@0:     /*prefixLen*/ 2, /*suffixLen*/ 0, /*radix*/ 16, /*minDigits*/ 8, /*maxDigits*/ 8,
michael@0:     0x5C/*\*/, 0x55/*U*/,   // prefix
michael@0:     END
michael@0: };
michael@0: 
michael@0: // XML: "&#x10FFFF;" -- prefix "&#x", suffix ";", 1 to 6 hex digits
michael@0: static const UChar SPEC_XML[] = {
michael@0:     /*prefixLen*/ 3, /*suffixLen*/ 1, /*radix*/ 16, /*minDigits*/ 1, /*maxDigits*/ 6,
michael@0:     0x26/*&*/, 0x23/*#*/, 0x78/*x*/,   // prefix
michael@0:     0x3B/*;*/,                         // suffix
michael@0:     END
michael@0: };
michael@0: 
michael@0: // XML10: "&#1114111;" -- prefix "&#", suffix ";", 1 to 7 decimal digits
michael@0: // (decimal, so not really "Hex-Any")
michael@0: static const UChar SPEC_XML10[] = {
michael@0:     /*prefixLen*/ 2, /*suffixLen*/ 1, /*radix*/ 10, /*minDigits*/ 1, /*maxDigits*/ 7,
michael@0:     0x26/*&*/, 0x23/*#*/,   // prefix
michael@0:     0x3B/*;*/,              // suffix
michael@0:     END
michael@0: };
michael@0: 
michael@0: // Perl: "\\x{263A}" -- prefix backslash + "x{", suffix "}", 1 to 6 hex digits
michael@0: static const UChar SPEC_Perl[] = {
michael@0:     /*prefixLen*/ 3, /*suffixLen*/ 1, /*radix*/ 16, /*minDigits*/ 1, /*maxDigits*/ 6,
michael@0:     0x5C/*\*/, 0x78/*x*/, 0x7B/*{*/,   // prefix
michael@0:     0x7D/*}*/,                         // suffix
michael@0:     END
michael@0: };
michael@0: 
michael@0: // Any: all forms in one table, in this order: Unicode, Java, C (only the
michael@0: // 8-digit backslash + 'U' form; the 4-digit form is the Java entry),
michael@0: // XML, XML10, Perl.  Each row is
michael@0: //   prefixLen, suffixLen, radix, minDigits, maxDigits, prefix..., suffix...
michael@0: static const UChar SPEC_Any[] = {
michael@0:     // Unicode
michael@0:     2, 0, 16, 4, 6, 0x55/*U*/, 0x2B/*+*/,
michael@0:     // Java
michael@0:     2, 0, 16, 4, 4, 0x5C/*\*/, 0x75/*u*/,
michael@0:     // C (surrogates)
michael@0:     2, 0, 16, 8, 8, 0x5C/*\*/, 0x55/*U*/,
michael@0:     // XML
michael@0:     3, 1, 16, 1, 6, 0x26/*&*/, 0x23/*#*/, 0x78/*x*/, 0x3B/*;*/,
michael@0:     // XML10
michael@0:     2, 1, 10, 1, 7, 0x26/*&*/, 0x23/*#*/, 0x3B/*;*/,
michael@0:     // Perl
michael@0:     3, 1, 16, 1, 6, 0x5C/*\*/, 0x78/*x*/, 0x7B/*{*/, 0x7D/*}*/,
michael@0:     END
michael@0: };
michael@0: 
michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) // RTTI boilerplate macro defined outside this file; presumably emits the class-ID accessors -- confirm in uobject.h
michael@0: 
michael@0: /**
michael@0:  * Duplicates an END-terminated spec[] array on the heap.
michael@0:  *
michael@0:  * @param spec  array to copy; it is scanned up to and including the
michael@0:  *              first END element.  May be NULL, in which case NULL is
michael@0:  *              returned.
michael@0:  * @return a uprv_malloc()ed copy that includes the END terminator, or
michael@0:  *         NULL if spec is NULL or the allocation fails.  The copy is
michael@0:  *         released with uprv_free().
michael@0:  */
michael@0: static UChar* copySpec(const UChar* spec) {
michael@0:     if (spec == NULL) {
michael@0:         // An object whose own copySpec() call failed carries a NULL
michael@0:         // spec; copying such an object must not scan through NULL.
michael@0:         return NULL;
michael@0:     }
michael@0:     int32_t len = 0;
michael@0:     while (spec[len] != END) {
michael@0:         ++len;
michael@0:     }
michael@0:     ++len; // count the END terminator as well
michael@0:     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
michael@0:     // Check for memory allocation error.
michael@0:     if (result != NULL) {
michael@0:         uprv_memcpy(result, spec, len*sizeof(result[0]));
michael@0:     }
michael@0:     return result;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Factory functions, one per registered ID.  Each one builds an
michael@0:  * UnescapeTransliterator over the matching SPEC_* table; the context
michael@0:  * token is ignored.
michael@0:  */
michael@0: static Transliterator* newUnescaper(const UnicodeString& ID, const UChar* spec) {
michael@0:     return new UnescapeTransliterator(ID, spec);
michael@0: }
michael@0: static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token) {
michael@0:     return newUnescaper(ID, SPEC_Unicode);
michael@0: }
michael@0: static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token) {
michael@0:     return newUnescaper(ID, SPEC_Java);
michael@0: }
michael@0: static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token) {
michael@0:     return newUnescaper(ID, SPEC_C);
michael@0: }
michael@0: static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token) {
michael@0:     return newUnescaper(ID, SPEC_XML);
michael@0: }
michael@0: static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token) {
michael@0:     return newUnescaper(ID, SPEC_XML10);
michael@0: }
michael@0: static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token) {
michael@0:     return newUnescaper(ID, SPEC_Perl);
michael@0: }
michael@0: static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token) {
michael@0:     return newUnescaper(ID, SPEC_Any);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Registers the standard "Hex-Any" variants with the system, one
michael@0:  * factory per ID.  Called by Transliterator during initialization.
michael@0:  */
michael@0: void UnescapeTransliterator::registerIDs() {
michael@0:     // The factories ignore their context, so one token serves all of them.
michael@0:     const Token unusedContext = integerToken(0);
michael@0:     Transliterator::_registerFactory(
michael@0:         UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, unusedContext);
michael@0:     Transliterator::_registerFactory(
michael@0:         UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, unusedContext);
michael@0:     Transliterator::_registerFactory(
michael@0:         UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, unusedContext);
michael@0:     Transliterator::_registerFactory(
michael@0:         UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, unusedContext);
michael@0:     Transliterator::_registerFactory(
michael@0:         UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, unusedContext);
michael@0:     Transliterator::_registerFactory(
michael@0:         UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, unusedContext);
michael@0:     Transliterator::_registerFactory(
michael@0:         UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, unusedContext);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Constructor.
michael@0:  *
michael@0:  * @param newID    ID passed on to the Transliterator base class (together
michael@0:  *                 with NULL as the base constructor's second argument)
michael@0:  * @param newSpec  END-terminated encoded spec array; the object keeps its
michael@0:  *                 own copy made by copySpec(), which is NULL if that
michael@0:  *                 allocation fails
michael@0:  */
michael@0: UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
michael@0:                                                const UChar *newSpec)
michael@0:     : Transliterator(newID, NULL),
michael@0:       spec(copySpec(newSpec))
michael@0: {
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Copy constructor.  The new object gets its own copySpec() duplicate of
michael@0:  * o's spec array rather than sharing the pointer.
michael@0:  */
michael@0: UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o)
michael@0:     : Transliterator(o),
michael@0:       spec(copySpec(o.spec))
michael@0: {
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Destructor.  Frees the spec array owned by this object.
michael@0:  */
michael@0: UnescapeTransliterator::~UnescapeTransliterator() {
michael@0:     uprv_free(this->spec);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Transliterator API.  Returns a newly allocated copy of this object,
michael@0:  * built with the copy constructor.
michael@0:  */
michael@0: Transliterator* UnescapeTransliterator::clone() const {
michael@0:     Transliterator* duplicate = new UnescapeTransliterator(*this);
michael@0:     return duplicate;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Implements {@link Transliterator#handleTransliterate}.
michael@0:  */
michael@0: void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
michael@0:                                                  UBool isIncremental) const {
michael@0:     int32_t start = pos.start;
michael@0:     int32_t limit = pos.limit;
michael@0:     int32_t i, j, ipat;
michael@0: 
michael@0:     while (start < limit) {
michael@0:         // Loop over the forms in spec[].  Exit this loop when we
michael@0:         // match one of the specs.  Exit the outer loop if a
michael@0:         // partial match is detected and isIncremental is true.
michael@0:         for (j=0, ipat=0; spec[ipat] != END; ++j) {
michael@0: 
michael@0:             // Read the header
michael@0:             int32_t prefixLen = spec[ipat++];
michael@0:             int32_t suffixLen = spec[ipat++];
michael@0:             int8_t  radix     = (int8_t) spec[ipat++];
michael@0:             int32_t minDigits = spec[ipat++];
michael@0:             int32_t maxDigits = spec[ipat++];
michael@0: 
michael@0:             // s is a copy of start that is advanced over the
michael@0:             // characters as we parse them.
michael@0:             int32_t s = start;
michael@0:             UBool match = TRUE;
michael@0: 
michael@0:             for (i=0; i<prefixLen; ++i) {
michael@0:                 if (s >= limit) {
michael@0:                     if (i > 0) {
michael@0:                         // We've already matched a character.  This is
michael@0:                         // a partial match, so we return if in
michael@0:                         // incremental mode.  In non-incremental mode,
michael@0:                         // go to the next spec.
michael@0:                         if (isIncremental) {
michael@0:                             goto exit;
michael@0:                         }
michael@0:                         match = FALSE;
michael@0:                         break;
michael@0:                     }
michael@0:                 }
michael@0:                 UChar c = text.charAt(s++);
michael@0:                 if (c != spec[ipat + i]) {
michael@0:                     match = FALSE;
michael@0:                     break;
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             if (match) {
michael@0:                 UChar32 u = 0;
michael@0:                 int32_t digitCount = 0;
michael@0:                 for (;;) {
michael@0:                     if (s >= limit) {
michael@0:                         // Check for partial match in incremental mode.
michael@0:                         if (s > start && isIncremental) {
michael@0:                             goto exit;
michael@0:                         }
michael@0:                         break;
michael@0:                     }
michael@0:                     UChar32 ch = text.char32At(s);
michael@0:                     int32_t digit = u_digit(ch, radix);
michael@0:                     if (digit < 0) {
michael@0:                         break;
michael@0:                     }
michael@0:                     s += U16_LENGTH(ch);
michael@0:                     u = (u * radix) + digit;
michael@0:                     if (++digitCount == maxDigits) {
michael@0:                         break;
michael@0:                     }
michael@0:                 }
michael@0: 
michael@0:                 match = (digitCount >= minDigits);
michael@0: 
michael@0:                 if (match) {
michael@0:                     for (i=0; i<suffixLen; ++i) {
michael@0:                         if (s >= limit) {
michael@0:                             // Check for partial match in incremental mode.
michael@0:                             if (s > start && isIncremental) {
michael@0:                                 goto exit;
michael@0:                             }
michael@0:                             match = FALSE;
michael@0:                             break;
michael@0:                         }
michael@0:                         UChar c = text.charAt(s++);
michael@0:                         if (c != spec[ipat + prefixLen + i]) {
michael@0:                             match = FALSE;
michael@0:                             break;
michael@0:                         }
michael@0:                     }
michael@0: 
michael@0:                     if (match) {
michael@0:                         // At this point, we have a match
michael@0:                         UnicodeString str(u);
michael@0:                         text.handleReplaceBetween(start, s, str);
michael@0:                         limit -= s - start - str.length();
michael@0:                         // The following break statement leaves the
michael@0:                         // loop that is traversing the forms in
michael@0:                         // spec[].  We then parse the next input
michael@0:                         // character.
michael@0:                         break;
michael@0:                     }
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             ipat += prefixLen + suffixLen;
michael@0:         }
michael@0: 
michael@0:         if (start < limit) {
michael@0:             start += U16_LENGTH(text.char32At(start));
michael@0:         }
michael@0:     }
michael@0: 
michael@0:   exit:
michael@0:     pos.contextLimit += limit - pos.limit;
michael@0:     pos.limit = limit;
michael@0:     pos.start = start;
michael@0: }
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0: 
michael@0: //eof