1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/name2uni.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,256 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2001-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 06/07/01 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "unicode/unifilt.h" 1.19 +#include "unicode/uchar.h" 1.20 +#include "unicode/uniset.h" 1.21 +#include "unicode/utf16.h" 1.22 +#include "cmemory.h" 1.23 +#include "name2uni.h" 1.24 +#include "patternprops.h" 1.25 +#include "uprops.h" 1.26 +#include "uinvchar.h" 1.27 +#include "util.h" 1.28 + 1.29 +U_NAMESPACE_BEGIN 1.30 + 1.31 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator) 1.32 + 1.33 +static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~" 1.34 +static const UChar OPEN_DELIM = 92; // '\\' first char of OPEN 1.35 +static const UChar CLOSE_DELIM = 125; // '}' 1.36 +static const UChar SPACE = 32; // ' ' 1.37 + 1.38 +U_CDECL_BEGIN 1.39 + 1.40 +// USetAdder implementation 1.41 +// Does not use uset.h to reduce code dependencies 1.42 +static void U_CALLCONV 1.43 +_set_add(USet *set, UChar32 c) { 1.44 + uset_add(set, c); 1.45 +} 1.46 + 1.47 +// These functions aren't used. 1.48 +/*static void U_CALLCONV 1.49 +_set_addRange(USet *set, UChar32 start, UChar32 end) { 1.50 + ((UnicodeSet *)set)->add(start, end); 1.51 +} 1.52 + 1.53 +static void U_CALLCONV 1.54 +_set_addString(USet *set, const UChar *str, int32_t length) { 1.55 + ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 1.56 +}*/ 1.57 + 1.58 +U_CDECL_END 1.59 + 1.60 +/** 1.61 + * Constructs a transliterator with the default delimiters '{' and 1.62 + * '}'. 1.63 + */ 1.64 +NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) : 1.65 + Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) { 1.66 + 1.67 + UnicodeSet *legalPtr = &legal; 1.68 + // Get the legal character set 1.69 + USetAdder sa = { 1.70 + (USet *)legalPtr, // USet* == UnicodeSet* 1.71 + _set_add, 1.72 + NULL, // Don't need _set_addRange 1.73 + NULL, // Don't need _set_addString 1.74 + NULL, // Don't need remove() 1.75 + NULL 1.76 + }; 1.77 + uprv_getCharNameCharacters(&sa); 1.78 +} 1.79 + 1.80 +/** 1.81 + * Destructor. 1.82 + */ 1.83 +NameUnicodeTransliterator::~NameUnicodeTransliterator() {} 1.84 + 1.85 +/** 1.86 + * Copy constructor. 1.87 + */ 1.88 +NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) : 1.89 + Transliterator(o), legal(o.legal) {} 1.90 + 1.91 +/** 1.92 + * Assignment operator. 1.93 + */ 1.94 +/*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=( 1.95 + const NameUnicodeTransliterator& o) { 1.96 + Transliterator::operator=(o); 1.97 + // not necessary: the legal sets should all be the same -- legal=o.legal; 1.98 + return *this; 1.99 +}*/ 1.100 + 1.101 +/** 1.102 + * Transliterator API. 1.103 + */ 1.104 +Transliterator* NameUnicodeTransliterator::clone(void) const { 1.105 + return new NameUnicodeTransliterator(*this); 1.106 +} 1.107 + 1.108 +/** 1.109 + * Implements {@link Transliterator#handleTransliterate}. 1.110 + */ 1.111 +void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 1.112 + UBool isIncremental) const { 1.113 + // The failure mode, here and below, is to behave like Any-Null, 1.114 + // if either there is no name data (max len == 0) or there is no 1.115 + // memory (malloc() => NULL). 1.116 + 1.117 + int32_t maxLen = uprv_getMaxCharNameLength(); 1.118 + if (maxLen == 0) { 1.119 + offsets.start = offsets.limit; 1.120 + return; 1.121 + } 1.122 + 1.123 + // Accomodate the longest possible name 1.124 + ++maxLen; // allow for temporary trailing space 1.125 + char* cbuf = (char*) uprv_malloc(maxLen); 1.126 + if (cbuf == NULL) { 1.127 + offsets.start = offsets.limit; 1.128 + return; 1.129 + } 1.130 + 1.131 + UnicodeString openPat(TRUE, OPEN, -1); 1.132 + UnicodeString str, name; 1.133 + 1.134 + int32_t cursor = offsets.start; 1.135 + int32_t limit = offsets.limit; 1.136 + 1.137 + // Modes: 1.138 + // 0 - looking for open delimiter 1.139 + // 1 - after open delimiter 1.140 + int32_t mode = 0; 1.141 + int32_t openPos = -1; // open delim candidate pos 1.142 + 1.143 + UChar32 c; 1.144 + while (cursor < limit) { 1.145 + c = text.char32At(cursor); 1.146 + 1.147 + switch (mode) { 1.148 + case 0: // looking for open delimiter 1.149 + if (c == OPEN_DELIM) { // quick check first 1.150 + openPos = cursor; 1.151 + int32_t i = 1.152 + ICU_Utility::parsePattern(openPat, text, cursor, limit); 1.153 + if (i >= 0 && i < limit) { 1.154 + mode = 1; 1.155 + name.truncate(0); 1.156 + cursor = i; 1.157 + continue; // *** reprocess char32At(cursor) 1.158 + } 1.159 + } 1.160 + break; 1.161 + 1.162 + case 1: // after open delimiter 1.163 + // Look for legal chars. If \s+ is found, convert it 1.164 + // to a single space. If closeDelimiter is found, exit 1.165 + // the loop. If any other character is found, exit the 1.166 + // loop. If the limit is reached, exit the loop. 1.167 + 1.168 + // Convert \s+ => SPACE. This assumes there are no 1.169 + // runs of >1 space characters in names. 1.170 + if (PatternProps::isWhiteSpace(c)) { 1.171 + // Ignore leading whitespace 1.172 + if (name.length() > 0 && 1.173 + name.charAt(name.length()-1) != SPACE) { 1.174 + name.append(SPACE); 1.175 + // If we are too long then abort. maxLen includes 1.176 + // temporary trailing space, so use '>'. 1.177 + if (name.length() > maxLen) { 1.178 + mode = 0; 1.179 + } 1.180 + } 1.181 + break; 1.182 + } 1.183 + 1.184 + if (c == CLOSE_DELIM) { 1.185 + int32_t len = name.length(); 1.186 + 1.187 + // Delete trailing space, if any 1.188 + if (len > 0 && 1.189 + name.charAt(len-1) == SPACE) { 1.190 + --len; 1.191 + } 1.192 + 1.193 + if (uprv_isInvariantUString(name.getBuffer(), len)) { 1.194 + name.extract(0, len, cbuf, maxLen, US_INV); 1.195 + 1.196 + UErrorCode status = U_ZERO_ERROR; 1.197 + c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status); 1.198 + if (U_SUCCESS(status)) { 1.199 + // Lookup succeeded 1.200 + 1.201 + // assert(U16_LENGTH(CLOSE_DELIM) == 1); 1.202 + cursor++; // advance over CLOSE_DELIM 1.203 + 1.204 + str.truncate(0); 1.205 + str.append(c); 1.206 + text.handleReplaceBetween(openPos, cursor, str); 1.207 + 1.208 + // Adjust indices for the change in the length of 1.209 + // the string. Do not assume that str.length() == 1.210 + // 1, in case of surrogates. 1.211 + int32_t delta = cursor - openPos - str.length(); 1.212 + cursor -= delta; 1.213 + limit -= delta; 1.214 + // assert(cursor == openPos + str.length()); 1.215 + } 1.216 + } 1.217 + // If the lookup failed, we leave things as-is and 1.218 + // still switch to mode 0 and continue. 1.219 + mode = 0; 1.220 + openPos = -1; // close off candidate 1.221 + continue; // *** reprocess char32At(cursor) 1.222 + } 1.223 + 1.224 + // Check if c is a legal char. We assume here that 1.225 + // legal.contains(OPEN_DELIM) is FALSE, so when we abort a 1.226 + // name, we don't have to go back to openPos+1. 1.227 + if (legal.contains(c)) { 1.228 + name.append(c); 1.229 + // If we go past the longest possible name then abort. 1.230 + // maxLen includes temporary trailing space, so use '>='. 1.231 + if (name.length() >= maxLen) { 1.232 + mode = 0; 1.233 + } 1.234 + } 1.235 + 1.236 + // Invalid character 1.237 + else { 1.238 + --cursor; // Backup and reprocess this character 1.239 + mode = 0; 1.240 + } 1.241 + 1.242 + break; 1.243 + } 1.244 + 1.245 + cursor += U16_LENGTH(c); 1.246 + } 1.247 + 1.248 + offsets.contextLimit += limit - offsets.limit; 1.249 + offsets.limit = limit; 1.250 + // In incremental mode, only advance the cursor up to the last 1.251 + // open delimiter candidate. 1.252 + offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor; 1.253 + 1.254 + uprv_free(cbuf); 1.255 +} 1.256 + 1.257 +U_NAMESPACE_END 1.258 + 1.259 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */