intl/icu/source/i18n/name2uni.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2001-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 06/07/01 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_TRANSLITERATION
michael@0 14
michael@0 15 #include "unicode/unifilt.h"
michael@0 16 #include "unicode/uchar.h"
michael@0 17 #include "unicode/uniset.h"
michael@0 18 #include "unicode/utf16.h"
michael@0 19 #include "cmemory.h"
michael@0 20 #include "name2uni.h"
michael@0 21 #include "patternprops.h"
michael@0 22 #include "uprops.h"
michael@0 23 #include "uinvchar.h"
michael@0 24 #include "util.h"
michael@0 25
michael@0 26 U_NAMESPACE_BEGIN
michael@0 27
michael@0 28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator)
michael@0 29
michael@0 30 static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~"
michael@0 31 static const UChar OPEN_DELIM = 92; // '\\' first char of OPEN
michael@0 32 static const UChar CLOSE_DELIM = 125; // '}'
michael@0 33 static const UChar SPACE = 32; // ' '
michael@0 34
michael@0 35 U_CDECL_BEGIN
michael@0 36
michael@0 37 // USetAdder implementation
michael@0 38 // Does not use uset.h to reduce code dependencies
michael@0 39 static void U_CALLCONV
michael@0 40 _set_add(USet *set, UChar32 c) {
michael@0 41 uset_add(set, c);
michael@0 42 }
michael@0 43
michael@0 44 // These functions aren't used.
michael@0 45 /*static void U_CALLCONV
michael@0 46 _set_addRange(USet *set, UChar32 start, UChar32 end) {
michael@0 47 ((UnicodeSet *)set)->add(start, end);
michael@0 48 }
michael@0 49
michael@0 50 static void U_CALLCONV
michael@0 51 _set_addString(USet *set, const UChar *str, int32_t length) {
michael@0 52 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
michael@0 53 }*/
michael@0 54
michael@0 55 U_CDECL_END
michael@0 56
michael@0 57 /**
michael@0 58 * Constructs a transliterator with the default delimiters '{' and
michael@0 59 * '}'.
michael@0 60 */
michael@0 61 NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
michael@0 62 Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) {
michael@0 63
michael@0 64 UnicodeSet *legalPtr = &legal;
michael@0 65 // Get the legal character set
michael@0 66 USetAdder sa = {
michael@0 67 (USet *)legalPtr, // USet* == UnicodeSet*
michael@0 68 _set_add,
michael@0 69 NULL, // Don't need _set_addRange
michael@0 70 NULL, // Don't need _set_addString
michael@0 71 NULL, // Don't need remove()
michael@0 72 NULL
michael@0 73 };
michael@0 74 uprv_getCharNameCharacters(&sa);
michael@0 75 }
michael@0 76
michael@0 77 /**
michael@0 78 * Destructor.
michael@0 79 */
michael@0 80 NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
michael@0 81
michael@0 82 /**
michael@0 83 * Copy constructor.
michael@0 84 */
michael@0 85 NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
michael@0 86 Transliterator(o), legal(o.legal) {}
michael@0 87
michael@0 88 /**
michael@0 89 * Assignment operator.
michael@0 90 */
michael@0 91 /*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
michael@0 92 const NameUnicodeTransliterator& o) {
michael@0 93 Transliterator::operator=(o);
michael@0 94 // not necessary: the legal sets should all be the same -- legal=o.legal;
michael@0 95 return *this;
michael@0 96 }*/
michael@0 97
michael@0 98 /**
michael@0 99 * Transliterator API.
michael@0 100 */
michael@0 101 Transliterator* NameUnicodeTransliterator::clone(void) const {
michael@0 102 return new NameUnicodeTransliterator(*this);
michael@0 103 }
michael@0 104
michael@0 105 /**
michael@0 106 * Implements {@link Transliterator#handleTransliterate}.
michael@0 107 */
michael@0 108 void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
michael@0 109 UBool isIncremental) const {
michael@0 110 // The failure mode, here and below, is to behave like Any-Null,
michael@0 111 // if either there is no name data (max len == 0) or there is no
michael@0 112 // memory (malloc() => NULL).
michael@0 113
michael@0 114 int32_t maxLen = uprv_getMaxCharNameLength();
michael@0 115 if (maxLen == 0) {
michael@0 116 offsets.start = offsets.limit;
michael@0 117 return;
michael@0 118 }
michael@0 119
michael@0 120 // Accomodate the longest possible name
michael@0 121 ++maxLen; // allow for temporary trailing space
michael@0 122 char* cbuf = (char*) uprv_malloc(maxLen);
michael@0 123 if (cbuf == NULL) {
michael@0 124 offsets.start = offsets.limit;
michael@0 125 return;
michael@0 126 }
michael@0 127
michael@0 128 UnicodeString openPat(TRUE, OPEN, -1);
michael@0 129 UnicodeString str, name;
michael@0 130
michael@0 131 int32_t cursor = offsets.start;
michael@0 132 int32_t limit = offsets.limit;
michael@0 133
michael@0 134 // Modes:
michael@0 135 // 0 - looking for open delimiter
michael@0 136 // 1 - after open delimiter
michael@0 137 int32_t mode = 0;
michael@0 138 int32_t openPos = -1; // open delim candidate pos
michael@0 139
michael@0 140 UChar32 c;
michael@0 141 while (cursor < limit) {
michael@0 142 c = text.char32At(cursor);
michael@0 143
michael@0 144 switch (mode) {
michael@0 145 case 0: // looking for open delimiter
michael@0 146 if (c == OPEN_DELIM) { // quick check first
michael@0 147 openPos = cursor;
michael@0 148 int32_t i =
michael@0 149 ICU_Utility::parsePattern(openPat, text, cursor, limit);
michael@0 150 if (i >= 0 && i < limit) {
michael@0 151 mode = 1;
michael@0 152 name.truncate(0);
michael@0 153 cursor = i;
michael@0 154 continue; // *** reprocess char32At(cursor)
michael@0 155 }
michael@0 156 }
michael@0 157 break;
michael@0 158
michael@0 159 case 1: // after open delimiter
michael@0 160 // Look for legal chars. If \s+ is found, convert it
michael@0 161 // to a single space. If closeDelimiter is found, exit
michael@0 162 // the loop. If any other character is found, exit the
michael@0 163 // loop. If the limit is reached, exit the loop.
michael@0 164
michael@0 165 // Convert \s+ => SPACE. This assumes there are no
michael@0 166 // runs of >1 space characters in names.
michael@0 167 if (PatternProps::isWhiteSpace(c)) {
michael@0 168 // Ignore leading whitespace
michael@0 169 if (name.length() > 0 &&
michael@0 170 name.charAt(name.length()-1) != SPACE) {
michael@0 171 name.append(SPACE);
michael@0 172 // If we are too long then abort. maxLen includes
michael@0 173 // temporary trailing space, so use '>'.
michael@0 174 if (name.length() > maxLen) {
michael@0 175 mode = 0;
michael@0 176 }
michael@0 177 }
michael@0 178 break;
michael@0 179 }
michael@0 180
michael@0 181 if (c == CLOSE_DELIM) {
michael@0 182 int32_t len = name.length();
michael@0 183
michael@0 184 // Delete trailing space, if any
michael@0 185 if (len > 0 &&
michael@0 186 name.charAt(len-1) == SPACE) {
michael@0 187 --len;
michael@0 188 }
michael@0 189
michael@0 190 if (uprv_isInvariantUString(name.getBuffer(), len)) {
michael@0 191 name.extract(0, len, cbuf, maxLen, US_INV);
michael@0 192
michael@0 193 UErrorCode status = U_ZERO_ERROR;
michael@0 194 c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
michael@0 195 if (U_SUCCESS(status)) {
michael@0 196 // Lookup succeeded
michael@0 197
michael@0 198 // assert(U16_LENGTH(CLOSE_DELIM) == 1);
michael@0 199 cursor++; // advance over CLOSE_DELIM
michael@0 200
michael@0 201 str.truncate(0);
michael@0 202 str.append(c);
michael@0 203 text.handleReplaceBetween(openPos, cursor, str);
michael@0 204
michael@0 205 // Adjust indices for the change in the length of
michael@0 206 // the string. Do not assume that str.length() ==
michael@0 207 // 1, in case of surrogates.
michael@0 208 int32_t delta = cursor - openPos - str.length();
michael@0 209 cursor -= delta;
michael@0 210 limit -= delta;
michael@0 211 // assert(cursor == openPos + str.length());
michael@0 212 }
michael@0 213 }
michael@0 214 // If the lookup failed, we leave things as-is and
michael@0 215 // still switch to mode 0 and continue.
michael@0 216 mode = 0;
michael@0 217 openPos = -1; // close off candidate
michael@0 218 continue; // *** reprocess char32At(cursor)
michael@0 219 }
michael@0 220
michael@0 221 // Check if c is a legal char. We assume here that
michael@0 222 // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
michael@0 223 // name, we don't have to go back to openPos+1.
michael@0 224 if (legal.contains(c)) {
michael@0 225 name.append(c);
michael@0 226 // If we go past the longest possible name then abort.
michael@0 227 // maxLen includes temporary trailing space, so use '>='.
michael@0 228 if (name.length() >= maxLen) {
michael@0 229 mode = 0;
michael@0 230 }
michael@0 231 }
michael@0 232
michael@0 233 // Invalid character
michael@0 234 else {
michael@0 235 --cursor; // Backup and reprocess this character
michael@0 236 mode = 0;
michael@0 237 }
michael@0 238
michael@0 239 break;
michael@0 240 }
michael@0 241
michael@0 242 cursor += U16_LENGTH(c);
michael@0 243 }
michael@0 244
michael@0 245 offsets.contextLimit += limit - offsets.limit;
michael@0 246 offsets.limit = limit;
michael@0 247 // In incremental mode, only advance the cursor up to the last
michael@0 248 // open delimiter candidate.
michael@0 249 offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
michael@0 250
michael@0 251 uprv_free(cbuf);
michael@0 252 }
michael@0 253
michael@0 254 U_NAMESPACE_END
michael@0 255
michael@0 256 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial