Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 1999-2011, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * Date Name Description |
michael@0 | 7 | * 11/17/99 aliu Creation. |
michael@0 | 8 | ********************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "unicode/utypes.h" |
michael@0 | 12 | |
michael@0 | 13 | #if !UCONFIG_NO_TRANSLITERATION |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/unifilt.h" |
michael@0 | 16 | #include "unicode/uniset.h" |
michael@0 | 17 | #include "cpdtrans.h" |
michael@0 | 18 | #include "uvector.h" |
michael@0 | 19 | #include "tridpars.h" |
michael@0 | 20 | #include "cmemory.h" |
michael@0 | 21 | |
michael@0 | 22 | // keep in sync with Transliterator |
michael@0 | 23 | //static const UChar ID_SEP = 0x002D; /*-*/ |
michael@0 | 24 | static const UChar ID_DELIM = 0x003B; /*;*/ |
michael@0 | 25 | static const UChar NEWLINE = 10; |
michael@0 | 26 | |
michael@0 | 27 | static const UChar COLON_COLON[] = {0x3A, 0x3A, 0}; //"::" |
michael@0 | 28 | |
michael@0 | 29 | U_NAMESPACE_BEGIN |
michael@0 | 30 | |
michael@0 | 31 | const UChar CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x0073, 0x0073, 0 }; // "%Pass" |
michael@0 | 32 | |
michael@0 | 33 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator) |
michael@0 | 34 | |
michael@0 | 35 | /** |
michael@0 | 36 | * Constructs a new compound transliterator given an array of |
michael@0 | 37 | * transliterators. The array of transliterators may be of any |
michael@0 | 38 | * length, including zero or one, however, useful compound |
michael@0 | 39 | * transliterators have at least two components. |
michael@0 | 40 | * @param transliterators array of <code>Transliterator</code> |
michael@0 | 41 | * objects |
michael@0 | 42 | * @param transliteratorCount The number of |
michael@0 | 43 | * <code>Transliterator</code> objects in transliterators. |
michael@0 | 44 | * @param filter the filter. Any character for which |
michael@0 | 45 | * <tt>filter.contains()</tt> returns <tt>false</tt> will not be |
michael@0 | 46 | * altered by this transliterator. If <tt>filter</tt> is |
michael@0 | 47 | * <tt>null</tt> then no filtering is applied. |
michael@0 | 48 | */ |
michael@0 | 49 | CompoundTransliterator::CompoundTransliterator( |
michael@0 | 50 | Transliterator* const transliterators[], |
michael@0 | 51 | int32_t transliteratorCount, |
michael@0 | 52 | UnicodeFilter* adoptedFilter) : |
michael@0 | 53 | Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter), |
michael@0 | 54 | trans(0), count(0), numAnonymousRBTs(0) { |
michael@0 | 55 | setTransliterators(transliterators, transliteratorCount); |
michael@0 | 56 | } |
michael@0 | 57 | |
michael@0 | 58 | /** |
michael@0 | 59 | * Splits an ID of the form "ID;ID;..." into a compound using each |
michael@0 | 60 | * of the IDs. |
michael@0 | 61 | * @param id of above form |
michael@0 | 62 | * @param forward if false, does the list in reverse order, and |
michael@0 | 63 | * takes the inverse of each ID. |
michael@0 | 64 | */ |
michael@0 | 65 | CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, |
michael@0 | 66 | UTransDirection direction, |
michael@0 | 67 | UnicodeFilter* adoptedFilter, |
michael@0 | 68 | UParseError& /*parseError*/, |
michael@0 | 69 | UErrorCode& status) : |
michael@0 | 70 | Transliterator(id, adoptedFilter), |
michael@0 | 71 | trans(0), numAnonymousRBTs(0) { |
michael@0 | 72 | // TODO add code for parseError...currently unused, but |
michael@0 | 73 | // later may be used by parsing code... |
michael@0 | 74 | init(id, direction, TRUE, status); |
michael@0 | 75 | } |
michael@0 | 76 | |
michael@0 | 77 | CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, |
michael@0 | 78 | UParseError& /*parseError*/, |
michael@0 | 79 | UErrorCode& status) : |
michael@0 | 80 | Transliterator(id, 0), // set filter to 0 here! |
michael@0 | 81 | trans(0), numAnonymousRBTs(0) { |
michael@0 | 82 | // TODO add code for parseError...currently unused, but |
michael@0 | 83 | // later may be used by parsing code... |
michael@0 | 84 | init(id, UTRANS_FORWARD, TRUE, status); |
michael@0 | 85 | } |
michael@0 | 86 | |
michael@0 | 87 | |
michael@0 | 88 | /** |
michael@0 | 89 | * Private constructor for use of TransliteratorAlias |
michael@0 | 90 | */ |
michael@0 | 91 | CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID, |
michael@0 | 92 | UVector& list, |
michael@0 | 93 | UnicodeFilter* adoptedFilter, |
michael@0 | 94 | int32_t anonymousRBTs, |
michael@0 | 95 | UParseError& /*parseError*/, |
michael@0 | 96 | UErrorCode& status) : |
michael@0 | 97 | Transliterator(newID, adoptedFilter), |
michael@0 | 98 | trans(0), numAnonymousRBTs(anonymousRBTs) |
michael@0 | 99 | { |
michael@0 | 100 | init(list, UTRANS_FORWARD, FALSE, status); |
michael@0 | 101 | } |
michael@0 | 102 | |
michael@0 | 103 | /** |
michael@0 | 104 | * Private constructor for Transliterator from a vector of |
michael@0 | 105 | * transliterators. The caller is responsible for fixing up the |
michael@0 | 106 | * ID. |
michael@0 | 107 | */ |
michael@0 | 108 | CompoundTransliterator::CompoundTransliterator(UVector& list, |
michael@0 | 109 | UParseError& /*parseError*/, |
michael@0 | 110 | UErrorCode& status) : |
michael@0 | 111 | Transliterator(UnicodeString(), NULL), |
michael@0 | 112 | trans(0), numAnonymousRBTs(0) |
michael@0 | 113 | { |
michael@0 | 114 | // TODO add code for parseError...currently unused, but |
michael@0 | 115 | // later may be used by parsing code... |
michael@0 | 116 | init(list, UTRANS_FORWARD, FALSE, status); |
michael@0 | 117 | // assume caller will fixup ID |
michael@0 | 118 | } |
michael@0 | 119 | |
michael@0 | 120 | CompoundTransliterator::CompoundTransliterator(UVector& list, |
michael@0 | 121 | int32_t anonymousRBTs, |
michael@0 | 122 | UParseError& /*parseError*/, |
michael@0 | 123 | UErrorCode& status) : |
michael@0 | 124 | Transliterator(UnicodeString(), NULL), |
michael@0 | 125 | trans(0), numAnonymousRBTs(anonymousRBTs) |
michael@0 | 126 | { |
michael@0 | 127 | init(list, UTRANS_FORWARD, FALSE, status); |
michael@0 | 128 | } |
michael@0 | 129 | |
michael@0 | 130 | /** |
michael@0 | 131 | * Finish constructing a transliterator: only to be called by |
michael@0 | 132 | * constructors. Before calling init(), set trans and filter to NULL. |
michael@0 | 133 | * @param id the id containing ';'-separated entries |
michael@0 | 134 | * @param direction either FORWARD or REVERSE |
michael@0 | 135 | * @param idSplitPoint the index into id at which the |
michael@0 | 136 | * adoptedSplitTransliterator should be inserted, if there is one, or |
michael@0 | 137 | * -1 if there is none. |
michael@0 | 138 | * @param adoptedSplitTransliterator a transliterator to be inserted |
michael@0 | 139 | * before the entry at offset idSplitPoint in the id string. May be |
michael@0 | 140 | * NULL to insert no entry. |
michael@0 | 141 | * @param fixReverseID if TRUE, then reconstruct the ID of reverse |
michael@0 | 142 | * entries by calling getID() of component entries. Some constructors |
michael@0 | 143 | * do not require this because they apply a facade ID anyway. |
michael@0 | 144 | * @param status the error code indicating success or failure |
michael@0 | 145 | */ |
michael@0 | 146 | void CompoundTransliterator::init(const UnicodeString& id, |
michael@0 | 147 | UTransDirection direction, |
michael@0 | 148 | UBool fixReverseID, |
michael@0 | 149 | UErrorCode& status) { |
michael@0 | 150 | // assert(trans == 0); |
michael@0 | 151 | |
michael@0 | 152 | if (U_FAILURE(status)) { |
michael@0 | 153 | return; |
michael@0 | 154 | } |
michael@0 | 155 | |
michael@0 | 156 | UVector list(status); |
michael@0 | 157 | UnicodeSet* compoundFilter = NULL; |
michael@0 | 158 | UnicodeString regenID; |
michael@0 | 159 | if (!TransliteratorIDParser::parseCompoundID(id, direction, |
michael@0 | 160 | regenID, list, compoundFilter)) { |
michael@0 | 161 | status = U_INVALID_ID; |
michael@0 | 162 | delete compoundFilter; |
michael@0 | 163 | return; |
michael@0 | 164 | } |
michael@0 | 165 | |
michael@0 | 166 | TransliteratorIDParser::instantiateList(list, status); |
michael@0 | 167 | |
michael@0 | 168 | init(list, direction, fixReverseID, status); |
michael@0 | 169 | |
michael@0 | 170 | if (compoundFilter != NULL) { |
michael@0 | 171 | adoptFilter(compoundFilter); |
michael@0 | 172 | } |
michael@0 | 173 | } |
michael@0 | 174 | |
michael@0 | 175 | /** |
michael@0 | 176 | * Finish constructing a transliterator: only to be called by |
michael@0 | 177 | * constructors. Before calling init(), set trans and filter to NULL. |
michael@0 | 178 | * @param list a vector of transliterator objects to be adopted. It |
michael@0 | 179 | * should NOT be empty. The list should be in declared order. That |
michael@0 | 180 | * is, it should be in the FORWARD order; if direction is REVERSE then |
michael@0 | 181 | * the list order will be reversed. |
michael@0 | 182 | * @param direction either FORWARD or REVERSE |
michael@0 | 183 | * @param fixReverseID if TRUE, then reconstruct the ID of reverse |
michael@0 | 184 | * entries by calling getID() of component entries. Some constructors |
michael@0 | 185 | * do not require this because they apply a facade ID anyway. |
michael@0 | 186 | * @param status the error code indicating success or failure |
michael@0 | 187 | */ |
michael@0 | 188 | void CompoundTransliterator::init(UVector& list, |
michael@0 | 189 | UTransDirection direction, |
michael@0 | 190 | UBool fixReverseID, |
michael@0 | 191 | UErrorCode& status) { |
michael@0 | 192 | // assert(trans == 0); |
michael@0 | 193 | |
michael@0 | 194 | // Allocate array |
michael@0 | 195 | if (U_SUCCESS(status)) { |
michael@0 | 196 | count = list.size(); |
michael@0 | 197 | trans = (Transliterator **)uprv_malloc(count * sizeof(Transliterator *)); |
michael@0 | 198 | /* test for NULL */ |
michael@0 | 199 | if (trans == 0) { |
michael@0 | 200 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 201 | return; |
michael@0 | 202 | } |
michael@0 | 203 | } |
michael@0 | 204 | |
michael@0 | 205 | if (U_FAILURE(status) || trans == 0) { |
michael@0 | 206 | // assert(trans == 0); |
michael@0 | 207 | return; |
michael@0 | 208 | } |
michael@0 | 209 | |
michael@0 | 210 | // Move the transliterators from the vector into an array. |
michael@0 | 211 | // Reverse the order if necessary. |
michael@0 | 212 | int32_t i; |
michael@0 | 213 | for (i=0; i<count; ++i) { |
michael@0 | 214 | int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i; |
michael@0 | 215 | trans[i] = (Transliterator*) list.elementAt(j); |
michael@0 | 216 | } |
michael@0 | 217 | |
michael@0 | 218 | // If the direction is UTRANS_REVERSE then we may need to fix the |
michael@0 | 219 | // ID. |
michael@0 | 220 | if (direction == UTRANS_REVERSE && fixReverseID) { |
michael@0 | 221 | UnicodeString newID; |
michael@0 | 222 | for (i=0; i<count; ++i) { |
michael@0 | 223 | if (i > 0) { |
michael@0 | 224 | newID.append(ID_DELIM); |
michael@0 | 225 | } |
michael@0 | 226 | newID.append(trans[i]->getID()); |
michael@0 | 227 | } |
michael@0 | 228 | setID(newID); |
michael@0 | 229 | } |
michael@0 | 230 | |
michael@0 | 231 | computeMaximumContextLength(); |
michael@0 | 232 | } |
michael@0 | 233 | |
michael@0 | 234 | /** |
michael@0 | 235 | * Return the IDs of the given list of transliterators, concatenated |
michael@0 | 236 | * with ID_DELIM delimiting them. Equivalent to the perlish expression |
michael@0 | 237 | * join(ID_DELIM, map($_.getID(), transliterators). |
michael@0 | 238 | */ |
michael@0 | 239 | UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[], |
michael@0 | 240 | int32_t transCount) { |
michael@0 | 241 | UnicodeString id; |
michael@0 | 242 | for (int32_t i=0; i<transCount; ++i) { |
michael@0 | 243 | if (i > 0) { |
michael@0 | 244 | id.append(ID_DELIM); |
michael@0 | 245 | } |
michael@0 | 246 | id.append(transliterators[i]->getID()); |
michael@0 | 247 | } |
michael@0 | 248 | return id; // Return temporary |
michael@0 | 249 | } |
michael@0 | 250 | |
michael@0 | 251 | /** |
michael@0 | 252 | * Copy constructor. |
michael@0 | 253 | */ |
michael@0 | 254 | CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) : |
michael@0 | 255 | Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) { |
michael@0 | 256 | *this = t; |
michael@0 | 257 | } |
michael@0 | 258 | |
michael@0 | 259 | /** |
michael@0 | 260 | * Destructor |
michael@0 | 261 | */ |
michael@0 | 262 | CompoundTransliterator::~CompoundTransliterator() { |
michael@0 | 263 | freeTransliterators(); |
michael@0 | 264 | } |
michael@0 | 265 | |
michael@0 | 266 | void CompoundTransliterator::freeTransliterators(void) { |
michael@0 | 267 | if (trans != 0) { |
michael@0 | 268 | for (int32_t i=0; i<count; ++i) { |
michael@0 | 269 | delete trans[i]; |
michael@0 | 270 | } |
michael@0 | 271 | uprv_free(trans); |
michael@0 | 272 | } |
michael@0 | 273 | trans = 0; |
michael@0 | 274 | count = 0; |
michael@0 | 275 | } |
michael@0 | 276 | |
michael@0 | 277 | /** |
michael@0 | 278 | * Assignment operator. |
michael@0 | 279 | */ |
michael@0 | 280 | CompoundTransliterator& CompoundTransliterator::operator=( |
michael@0 | 281 | const CompoundTransliterator& t) |
michael@0 | 282 | { |
michael@0 | 283 | Transliterator::operator=(t); |
michael@0 | 284 | int32_t i = 0; |
michael@0 | 285 | UBool failed = FALSE; |
michael@0 | 286 | if (trans != NULL) { |
michael@0 | 287 | for (i=0; i<count; ++i) { |
michael@0 | 288 | delete trans[i]; |
michael@0 | 289 | trans[i] = 0; |
michael@0 | 290 | } |
michael@0 | 291 | } |
michael@0 | 292 | if (t.count > count) { |
michael@0 | 293 | if (trans != NULL) { |
michael@0 | 294 | uprv_free(trans); |
michael@0 | 295 | } |
michael@0 | 296 | trans = (Transliterator **)uprv_malloc(t.count * sizeof(Transliterator *)); |
michael@0 | 297 | } |
michael@0 | 298 | count = t.count; |
michael@0 | 299 | if (trans != NULL) { |
michael@0 | 300 | for (i=0; i<count; ++i) { |
michael@0 | 301 | trans[i] = t.trans[i]->clone(); |
michael@0 | 302 | if (trans[i] == NULL) { |
michael@0 | 303 | failed = TRUE; |
michael@0 | 304 | break; |
michael@0 | 305 | } |
michael@0 | 306 | } |
michael@0 | 307 | } |
michael@0 | 308 | |
michael@0 | 309 | // if memory allocation failed delete backwards trans array |
michael@0 | 310 | if (failed && i > 0) { |
michael@0 | 311 | int32_t n; |
michael@0 | 312 | for (n = i-1; n >= 0; n--) { |
michael@0 | 313 | uprv_free(trans[n]); |
michael@0 | 314 | trans[n] = NULL; |
michael@0 | 315 | } |
michael@0 | 316 | } |
michael@0 | 317 | numAnonymousRBTs = t.numAnonymousRBTs; |
michael@0 | 318 | return *this; |
michael@0 | 319 | } |
michael@0 | 320 | |
michael@0 | 321 | /** |
michael@0 | 322 | * Transliterator API. |
michael@0 | 323 | */ |
michael@0 | 324 | Transliterator* CompoundTransliterator::clone(void) const { |
michael@0 | 325 | return new CompoundTransliterator(*this); |
michael@0 | 326 | } |
michael@0 | 327 | |
michael@0 | 328 | /** |
michael@0 | 329 | * Returns the number of transliterators in this chain. |
michael@0 | 330 | * @return number of transliterators in this chain. |
michael@0 | 331 | */ |
michael@0 | 332 | int32_t CompoundTransliterator::getCount(void) const { |
michael@0 | 333 | return count; |
michael@0 | 334 | } |
michael@0 | 335 | |
michael@0 | 336 | /** |
michael@0 | 337 | * Returns the transliterator at the given index in this chain. |
michael@0 | 338 | * @param index index into chain, from 0 to <code>getCount() - 1</code> |
michael@0 | 339 | * @return transliterator at the given index |
michael@0 | 340 | */ |
michael@0 | 341 | const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const { |
michael@0 | 342 | return *trans[index]; |
michael@0 | 343 | } |
michael@0 | 344 | |
michael@0 | 345 | void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[], |
michael@0 | 346 | int32_t transCount) { |
michael@0 | 347 | Transliterator** a = (Transliterator **)uprv_malloc(transCount * sizeof(Transliterator *)); |
michael@0 | 348 | if (a == NULL) { |
michael@0 | 349 | return; |
michael@0 | 350 | } |
michael@0 | 351 | int32_t i = 0; |
michael@0 | 352 | UBool failed = FALSE; |
michael@0 | 353 | for (i=0; i<transCount; ++i) { |
michael@0 | 354 | a[i] = transliterators[i]->clone(); |
michael@0 | 355 | if (a[i] == NULL) { |
michael@0 | 356 | failed = TRUE; |
michael@0 | 357 | break; |
michael@0 | 358 | } |
michael@0 | 359 | } |
michael@0 | 360 | if (failed && i > 0) { |
michael@0 | 361 | int32_t n; |
michael@0 | 362 | for (n = i-1; n >= 0; n--) { |
michael@0 | 363 | uprv_free(a[n]); |
michael@0 | 364 | a[n] = NULL; |
michael@0 | 365 | } |
michael@0 | 366 | return; |
michael@0 | 367 | } |
michael@0 | 368 | adoptTransliterators(a, transCount); |
michael@0 | 369 | } |
michael@0 | 370 | |
michael@0 | 371 | void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[], |
michael@0 | 372 | int32_t transCount) { |
michael@0 | 373 | // First free trans[] and set count to zero. Once this is done, |
michael@0 | 374 | // orphan the filter. Set up the new trans[]. |
michael@0 | 375 | freeTransliterators(); |
michael@0 | 376 | trans = adoptedTransliterators; |
michael@0 | 377 | count = transCount; |
michael@0 | 378 | computeMaximumContextLength(); |
michael@0 | 379 | setID(joinIDs(trans, count)); |
michael@0 | 380 | } |
michael@0 | 381 | |
michael@0 | 382 | /** |
michael@0 | 383 | * Append c to buf, unless buf is empty or buf already ends in c. |
michael@0 | 384 | */ |
michael@0 | 385 | static void _smartAppend(UnicodeString& buf, UChar c) { |
michael@0 | 386 | if (buf.length() != 0 && |
michael@0 | 387 | buf.charAt(buf.length() - 1) != c) { |
michael@0 | 388 | buf.append(c); |
michael@0 | 389 | } |
michael@0 | 390 | } |
michael@0 | 391 | |
michael@0 | 392 | UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource, |
michael@0 | 393 | UBool escapeUnprintable) const { |
michael@0 | 394 | // We do NOT call toRules() on our component transliterators, in |
michael@0 | 395 | // general. If we have several rule-based transliterators, this |
michael@0 | 396 | // yields a concatenation of the rules -- not what we want. We do |
michael@0 | 397 | // handle compound RBT transliterators specially -- those for which |
michael@0 | 398 | // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex, |
michael@0 | 399 | // we do call toRules() recursively. |
michael@0 | 400 | rulesSource.truncate(0); |
michael@0 | 401 | if (numAnonymousRBTs >= 1 && getFilter() != NULL) { |
michael@0 | 402 | // If we are a compound RBT and if we have a global |
michael@0 | 403 | // filter, then emit it at the top. |
michael@0 | 404 | UnicodeString pat; |
michael@0 | 405 | rulesSource.append(COLON_COLON, 2).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM); |
michael@0 | 406 | } |
michael@0 | 407 | for (int32_t i=0; i<count; ++i) { |
michael@0 | 408 | UnicodeString rule; |
michael@0 | 409 | |
michael@0 | 410 | // Anonymous RuleBasedTransliterators (inline rules and |
michael@0 | 411 | // ::BEGIN/::END blocks) are given IDs that begin with |
michael@0 | 412 | // "%Pass": use toRules() to write all the rules to the output |
michael@0 | 413 | // (and insert "::Null;" if we have two in a row) |
michael@0 | 414 | if (trans[i]->getID().startsWith(PASS_STRING, 5)) { |
michael@0 | 415 | trans[i]->toRules(rule, escapeUnprintable); |
michael@0 | 416 | if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith(PASS_STRING, 5)) |
michael@0 | 417 | rule = UNICODE_STRING_SIMPLE("::Null;") + rule; |
michael@0 | 418 | |
michael@0 | 419 | // we also use toRules() on CompoundTransliterators (which we |
michael@0 | 420 | // check for by looking for a semicolon in the ID)-- this gets |
michael@0 | 421 | // the list of their child transliterators output in the right |
michael@0 | 422 | // format |
michael@0 | 423 | } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) { |
michael@0 | 424 | trans[i]->toRules(rule, escapeUnprintable); |
michael@0 | 425 | |
michael@0 | 426 | // for everything else, use Transliterator::toRules() |
michael@0 | 427 | } else { |
michael@0 | 428 | trans[i]->Transliterator::toRules(rule, escapeUnprintable); |
michael@0 | 429 | } |
michael@0 | 430 | _smartAppend(rulesSource, NEWLINE); |
michael@0 | 431 | rulesSource.append(rule); |
michael@0 | 432 | _smartAppend(rulesSource, ID_DELIM); |
michael@0 | 433 | } |
michael@0 | 434 | return rulesSource; |
michael@0 | 435 | } |
michael@0 | 436 | |
michael@0 | 437 | /** |
michael@0 | 438 | * Implement Transliterator framework |
michael@0 | 439 | */ |
michael@0 | 440 | void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
michael@0 | 441 | UnicodeSet set; |
michael@0 | 442 | result.clear(); |
michael@0 | 443 | for (int32_t i=0; i<count; ++i) { |
michael@0 | 444 | result.addAll(trans[i]->getSourceSet(set)); |
michael@0 | 445 | // Take the example of Hiragana-Latin. This is really |
michael@0 | 446 | // Hiragana-Katakana; Katakana-Latin. The source set of |
michael@0 | 447 | // these two is roughly [:Hiragana:] and [:Katakana:]. |
michael@0 | 448 | // But the source set for the entire transliterator is |
michael@0 | 449 | // actually [:Hiragana:] ONLY -- that is, the first |
michael@0 | 450 | // non-empty source set. |
michael@0 | 451 | |
michael@0 | 452 | // This is a heuristic, and not 100% reliable. |
michael@0 | 453 | if (!result.isEmpty()) { |
michael@0 | 454 | break; |
michael@0 | 455 | } |
michael@0 | 456 | } |
michael@0 | 457 | } |
michael@0 | 458 | |
michael@0 | 459 | /** |
michael@0 | 460 | * Override Transliterator framework |
michael@0 | 461 | */ |
michael@0 | 462 | UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const { |
michael@0 | 463 | UnicodeSet set; |
michael@0 | 464 | result.clear(); |
michael@0 | 465 | for (int32_t i=0; i<count; ++i) { |
michael@0 | 466 | // This is a heuristic, and not 100% reliable. |
michael@0 | 467 | result.addAll(trans[i]->getTargetSet(set)); |
michael@0 | 468 | } |
michael@0 | 469 | return result; |
michael@0 | 470 | } |
michael@0 | 471 | |
michael@0 | 472 | /** |
michael@0 | 473 | * Implements {@link Transliterator#handleTransliterate}. |
michael@0 | 474 | */ |
michael@0 | 475 | void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
michael@0 | 476 | UBool incremental) const { |
michael@0 | 477 | /* Call each transliterator with the same contextStart and |
michael@0 | 478 | * start, but with the limit as modified |
michael@0 | 479 | * by preceding transliterators. The start index must be |
michael@0 | 480 | * reset for each transliterator to give each a chance to |
michael@0 | 481 | * transliterate the text. The initial contextStart index is known |
michael@0 | 482 | * to still point to the same place after each transliterator |
michael@0 | 483 | * is called because each transliterator will not change the |
michael@0 | 484 | * text between contextStart and the initial start index. |
michael@0 | 485 | * |
michael@0 | 486 | * IMPORTANT: After the first transliterator, each subsequent |
michael@0 | 487 | * transliterator only gets to transliterate text committed by |
michael@0 | 488 | * preceding transliterators; that is, the start (output |
michael@0 | 489 | * value) of transliterator i becomes the limit (input value) |
michael@0 | 490 | * of transliterator i+1. Finally, the overall limit is fixed |
michael@0 | 491 | * up before we return. |
michael@0 | 492 | * |
michael@0 | 493 | * Assumptions we make here: |
michael@0 | 494 | * (1) contextStart <= start <= limit <= contextLimit <= text.length() |
michael@0 | 495 | * (2) start <= start' <= limit' ;cursor doesn't move back |
michael@0 | 496 | * (3) start <= limit' ;text before cursor unchanged |
michael@0 | 497 | * - start' is the value of start after calling handleKT |
michael@0 | 498 | * - limit' is the value of limit after calling handleKT |
michael@0 | 499 | */ |
michael@0 | 500 | |
michael@0 | 501 | /** |
michael@0 | 502 | * Example: 3 transliterators. This example illustrates the |
michael@0 | 503 | * mechanics we need to implement. C, S, and L are the contextStart, |
michael@0 | 504 | * start, and limit. gl is the globalLimit. contextLimit is |
michael@0 | 505 | * equal to limit throughout. |
michael@0 | 506 | * |
michael@0 | 507 | * 1. h-u, changes hex to Unicode |
michael@0 | 508 | * |
michael@0 | 509 | * 4 7 a d 0 4 7 a |
michael@0 | 510 | * abc/u0061/u => abca/u |
michael@0 | 511 | * C S L C S L gl=f->a |
michael@0 | 512 | * |
michael@0 | 513 | * 2. upup, changes "x" to "XX" |
michael@0 | 514 | * |
michael@0 | 515 | * 4 7 a 4 7 a |
michael@0 | 516 | * abca/u => abcAA/u |
michael@0 | 517 | * C SL C S |
michael@0 | 518 | * L gl=a->b |
michael@0 | 519 | * 3. u-h, changes Unicode to hex |
michael@0 | 520 | * |
michael@0 | 521 | * 4 7 a 4 7 a d 0 3 |
michael@0 | 522 | * abcAA/u => abc/u0041/u0041/u |
michael@0 | 523 | * C S L C S |
michael@0 | 524 | * L gl=b->15 |
michael@0 | 525 | * 4. return |
michael@0 | 526 | * |
michael@0 | 527 | * 4 7 a d 0 3 |
michael@0 | 528 | * abc/u0041/u0041/u |
michael@0 | 529 | * C S L |
michael@0 | 530 | */ |
michael@0 | 531 | |
michael@0 | 532 | if (count < 1) { |
michael@0 | 533 | index.start = index.limit; |
michael@0 | 534 | return; // Short circuit for empty compound transliterators |
michael@0 | 535 | } |
michael@0 | 536 | |
michael@0 | 537 | // compoundLimit is the limit value for the entire compound |
michael@0 | 538 | // operation. We overwrite index.limit with the previous |
michael@0 | 539 | // index.start. After each transliteration, we update |
michael@0 | 540 | // compoundLimit for insertions or deletions that have happened. |
michael@0 | 541 | int32_t compoundLimit = index.limit; |
michael@0 | 542 | |
michael@0 | 543 | // compoundStart is the start for the entire compound |
michael@0 | 544 | // operation. |
michael@0 | 545 | int32_t compoundStart = index.start; |
michael@0 | 546 | |
michael@0 | 547 | int32_t delta = 0; // delta in length |
michael@0 | 548 | |
michael@0 | 549 | // Give each transliterator a crack at the run of characters. |
michael@0 | 550 | // See comments at the top of the method for more detail. |
michael@0 | 551 | for (int32_t i=0; i<count; ++i) { |
michael@0 | 552 | index.start = compoundStart; // Reset start |
michael@0 | 553 | int32_t limit = index.limit; |
michael@0 | 554 | |
michael@0 | 555 | if (index.start == index.limit) { |
michael@0 | 556 | // Short circuit for empty range |
michael@0 | 557 | break; |
michael@0 | 558 | } |
michael@0 | 559 | |
michael@0 | 560 | trans[i]->filteredTransliterate(text, index, incremental); |
michael@0 | 561 | |
michael@0 | 562 | // In a properly written transliterator, start == limit after |
michael@0 | 563 | // handleTransliterate() returns when incremental is false. |
michael@0 | 564 | // Catch cases where the subclass doesn't do this, and throw |
michael@0 | 565 | // an exception. (Just pinning start to limit is a bad idea, |
michael@0 | 566 | // because what's probably happening is that the subclass |
michael@0 | 567 | // isn't transliterating all the way to the end, and it should |
michael@0 | 568 | // in non-incremental mode.) |
michael@0 | 569 | if (!incremental && index.start != index.limit) { |
michael@0 | 570 | // We can't throw an exception, so just fudge things |
michael@0 | 571 | index.start = index.limit; |
michael@0 | 572 | } |
michael@0 | 573 | |
michael@0 | 574 | // Cumulative delta for insertions/deletions |
michael@0 | 575 | delta += index.limit - limit; |
michael@0 | 576 | |
michael@0 | 577 | if (incremental) { |
michael@0 | 578 | // In the incremental case, only allow subsequent |
michael@0 | 579 | // transliterators to modify what has already been |
michael@0 | 580 | // completely processed by prior transliterators. In the |
michael@0 | 581 | // non-incrmental case, allow each transliterator to |
michael@0 | 582 | // process the entire text. |
michael@0 | 583 | index.limit = index.start; |
michael@0 | 584 | } |
michael@0 | 585 | } |
michael@0 | 586 | |
michael@0 | 587 | compoundLimit += delta; |
michael@0 | 588 | |
michael@0 | 589 | // Start is good where it is -- where the last transliterator left |
michael@0 | 590 | // it. Limit needs to be put back where it was, modulo |
michael@0 | 591 | // adjustments for deletions/insertions. |
michael@0 | 592 | index.limit = compoundLimit; |
michael@0 | 593 | } |
michael@0 | 594 | |
michael@0 | 595 | /** |
michael@0 | 596 | * Sets the length of the longest context required by this transliterator. |
michael@0 | 597 | * This is <em>preceding</em> context. |
michael@0 | 598 | */ |
michael@0 | 599 | void CompoundTransliterator::computeMaximumContextLength(void) { |
michael@0 | 600 | int32_t max = 0; |
michael@0 | 601 | for (int32_t i=0; i<count; ++i) { |
michael@0 | 602 | int32_t len = trans[i]->getMaximumContextLength(); |
michael@0 | 603 | if (len > max) { |
michael@0 | 604 | max = len; |
michael@0 | 605 | } |
michael@0 | 606 | } |
michael@0 | 607 | setMaximumContextLength(max); |
michael@0 | 608 | } |
michael@0 | 609 | |
michael@0 | 610 | U_NAMESPACE_END |
michael@0 | 611 | |
michael@0 | 612 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
michael@0 | 613 | |
michael@0 | 614 | /* eof */ |