michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2001-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 06/07/01 aliu Creation. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: michael@0: #include "unicode/unifilt.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cmemory.h" michael@0: #include "name2uni.h" michael@0: #include "patternprops.h" michael@0: #include "uprops.h" michael@0: #include "uinvchar.h" michael@0: #include "util.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator) michael@0: michael@0: static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~" michael@0: static const UChar OPEN_DELIM = 92; // '\\' first char of OPEN michael@0: static const UChar CLOSE_DELIM = 125; // '}' michael@0: static const UChar SPACE = 32; // ' ' michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: // USetAdder implementation michael@0: // Does not use uset.h to reduce code dependencies michael@0: static void U_CALLCONV michael@0: _set_add(USet *set, UChar32 c) { michael@0: uset_add(set, c); michael@0: } michael@0: michael@0: // These functions aren't used. michael@0: /*static void U_CALLCONV michael@0: _set_addRange(USet *set, UChar32 start, UChar32 end) { michael@0: ((UnicodeSet *)set)->add(start, end); michael@0: } michael@0: michael@0: static void U_CALLCONV michael@0: _set_addString(USet *set, const UChar *str, int32_t length) { michael@0: ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); michael@0: }*/ michael@0: michael@0: U_CDECL_END michael@0: michael@0: /** michael@0: * Constructs a transliterator with the default delimiters '{' and michael@0: * '}'. michael@0: */ michael@0: NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) : michael@0: Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) { michael@0: michael@0: UnicodeSet *legalPtr = &legal; michael@0: // Get the legal character set michael@0: USetAdder sa = { michael@0: (USet *)legalPtr, // USet* == UnicodeSet* michael@0: _set_add, michael@0: NULL, // Don't need _set_addRange michael@0: NULL, // Don't need _set_addString michael@0: NULL, // Don't need remove() michael@0: NULL michael@0: }; michael@0: uprv_getCharNameCharacters(&sa); michael@0: } michael@0: michael@0: /** michael@0: * Destructor. michael@0: */ michael@0: NameUnicodeTransliterator::~NameUnicodeTransliterator() {} michael@0: michael@0: /** michael@0: * Copy constructor. michael@0: */ michael@0: NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) : michael@0: Transliterator(o), legal(o.legal) {} michael@0: michael@0: /** michael@0: * Assignment operator. michael@0: */ michael@0: /*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=( michael@0: const NameUnicodeTransliterator& o) { michael@0: Transliterator::operator=(o); michael@0: // not necessary: the legal sets should all be the same -- legal=o.legal; michael@0: return *this; michael@0: }*/ michael@0: michael@0: /** michael@0: * Transliterator API. michael@0: */ michael@0: Transliterator* NameUnicodeTransliterator::clone(void) const { michael@0: return new NameUnicodeTransliterator(*this); michael@0: } michael@0: michael@0: /** michael@0: * Implements {@link Transliterator#handleTransliterate}. michael@0: */ michael@0: void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, michael@0: UBool isIncremental) const { michael@0: // The failure mode, here and below, is to behave like Any-Null, michael@0: // if either there is no name data (max len == 0) or there is no michael@0: // memory (malloc() => NULL). michael@0: michael@0: int32_t maxLen = uprv_getMaxCharNameLength(); michael@0: if (maxLen == 0) { michael@0: offsets.start = offsets.limit; michael@0: return; michael@0: } michael@0: michael@0: // Accomodate the longest possible name michael@0: ++maxLen; // allow for temporary trailing space michael@0: char* cbuf = (char*) uprv_malloc(maxLen); michael@0: if (cbuf == NULL) { michael@0: offsets.start = offsets.limit; michael@0: return; michael@0: } michael@0: michael@0: UnicodeString openPat(TRUE, OPEN, -1); michael@0: UnicodeString str, name; michael@0: michael@0: int32_t cursor = offsets.start; michael@0: int32_t limit = offsets.limit; michael@0: michael@0: // Modes: michael@0: // 0 - looking for open delimiter michael@0: // 1 - after open delimiter michael@0: int32_t mode = 0; michael@0: int32_t openPos = -1; // open delim candidate pos michael@0: michael@0: UChar32 c; michael@0: while (cursor < limit) { michael@0: c = text.char32At(cursor); michael@0: michael@0: switch (mode) { michael@0: case 0: // looking for open delimiter michael@0: if (c == OPEN_DELIM) { // quick check first michael@0: openPos = cursor; michael@0: int32_t i = michael@0: ICU_Utility::parsePattern(openPat, text, cursor, limit); michael@0: if (i >= 0 && i < limit) { michael@0: mode = 1; michael@0: name.truncate(0); michael@0: cursor = i; michael@0: continue; // *** reprocess char32At(cursor) michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case 1: // after open delimiter michael@0: // Look for legal chars. If \s+ is found, convert it michael@0: // to a single space. If closeDelimiter is found, exit michael@0: // the loop. If any other character is found, exit the michael@0: // loop. If the limit is reached, exit the loop. michael@0: michael@0: // Convert \s+ => SPACE. This assumes there are no michael@0: // runs of >1 space characters in names. michael@0: if (PatternProps::isWhiteSpace(c)) { michael@0: // Ignore leading whitespace michael@0: if (name.length() > 0 && michael@0: name.charAt(name.length()-1) != SPACE) { michael@0: name.append(SPACE); michael@0: // If we are too long then abort. maxLen includes michael@0: // temporary trailing space, so use '>'. michael@0: if (name.length() > maxLen) { michael@0: mode = 0; michael@0: } michael@0: } michael@0: break; michael@0: } michael@0: michael@0: if (c == CLOSE_DELIM) { michael@0: int32_t len = name.length(); michael@0: michael@0: // Delete trailing space, if any michael@0: if (len > 0 && michael@0: name.charAt(len-1) == SPACE) { michael@0: --len; michael@0: } michael@0: michael@0: if (uprv_isInvariantUString(name.getBuffer(), len)) { michael@0: name.extract(0, len, cbuf, maxLen, US_INV); michael@0: michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status); michael@0: if (U_SUCCESS(status)) { michael@0: // Lookup succeeded michael@0: michael@0: // assert(U16_LENGTH(CLOSE_DELIM) == 1); michael@0: cursor++; // advance over CLOSE_DELIM michael@0: michael@0: str.truncate(0); michael@0: str.append(c); michael@0: text.handleReplaceBetween(openPos, cursor, str); michael@0: michael@0: // Adjust indices for the change in the length of michael@0: // the string. Do not assume that str.length() == michael@0: // 1, in case of surrogates. michael@0: int32_t delta = cursor - openPos - str.length(); michael@0: cursor -= delta; michael@0: limit -= delta; michael@0: // assert(cursor == openPos + str.length()); michael@0: } michael@0: } michael@0: // If the lookup failed, we leave things as-is and michael@0: // still switch to mode 0 and continue. michael@0: mode = 0; michael@0: openPos = -1; // close off candidate michael@0: continue; // *** reprocess char32At(cursor) michael@0: } michael@0: michael@0: // Check if c is a legal char. We assume here that michael@0: // legal.contains(OPEN_DELIM) is FALSE, so when we abort a michael@0: // name, we don't have to go back to openPos+1. michael@0: if (legal.contains(c)) { michael@0: name.append(c); michael@0: // If we go past the longest possible name then abort. michael@0: // maxLen includes temporary trailing space, so use '>='. michael@0: if (name.length() >= maxLen) { michael@0: mode = 0; michael@0: } michael@0: } michael@0: michael@0: // Invalid character michael@0: else { michael@0: --cursor; // Backup and reprocess this character michael@0: mode = 0; michael@0: } michael@0: michael@0: break; michael@0: } michael@0: michael@0: cursor += U16_LENGTH(c); michael@0: } michael@0: michael@0: offsets.contextLimit += limit - offsets.limit; michael@0: offsets.limit = limit; michael@0: // In incremental mode, only advance the cursor up to the last michael@0: // open delimiter candidate. michael@0: offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor; michael@0: michael@0: uprv_free(cbuf); michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */