intl/icu/source/common/normlzr.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 *************************************************************************
michael@0 3 * COPYRIGHT:
michael@0 4 * Copyright (c) 1996-2012, International Business Machines Corporation and
michael@0 5 * others. All Rights Reserved.
michael@0 6 *************************************************************************
michael@0 7 */
michael@0 8
michael@0 9 #include "unicode/utypes.h"
michael@0 10
michael@0 11 #if !UCONFIG_NO_NORMALIZATION
michael@0 12
michael@0 13 #include "unicode/uniset.h"
michael@0 14 #include "unicode/unistr.h"
michael@0 15 #include "unicode/chariter.h"
michael@0 16 #include "unicode/schriter.h"
michael@0 17 #include "unicode/uchriter.h"
michael@0 18 #include "unicode/normlzr.h"
michael@0 19 #include "unicode/utf16.h"
michael@0 20 #include "cmemory.h"
michael@0 21 #include "normalizer2impl.h"
michael@0 22 #include "uprops.h" // for uniset_getUnicode32Instance()
michael@0 23
michael@0 24 U_NAMESPACE_BEGIN
michael@0 25
michael@0 26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
michael@0 27
michael@0 28 //-------------------------------------------------------------------------
michael@0 29 // Constructors and other boilerplate
michael@0 30 //-------------------------------------------------------------------------
michael@0 31
michael@0 32 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
michael@0 33 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
michael@0 34 text(new StringCharacterIterator(str)),
michael@0 35 currentIndex(0), nextIndex(0),
michael@0 36 buffer(), bufferPos(0)
michael@0 37 {
michael@0 38 init();
michael@0 39 }
michael@0 40
michael@0 41 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
michael@0 42 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
michael@0 43 text(new UCharCharacterIterator(str, length)),
michael@0 44 currentIndex(0), nextIndex(0),
michael@0 45 buffer(), bufferPos(0)
michael@0 46 {
michael@0 47 init();
michael@0 48 }
michael@0 49
michael@0 50 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
michael@0 51 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
michael@0 52 text(iter.clone()),
michael@0 53 currentIndex(0), nextIndex(0),
michael@0 54 buffer(), bufferPos(0)
michael@0 55 {
michael@0 56 init();
michael@0 57 }
michael@0 58
michael@0 59 Normalizer::Normalizer(const Normalizer &copy) :
michael@0 60 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
michael@0 61 text(copy.text->clone()),
michael@0 62 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
michael@0 63 buffer(copy.buffer), bufferPos(copy.bufferPos)
michael@0 64 {
michael@0 65 init();
michael@0 66 }
michael@0 67
michael@0 68 void
michael@0 69 Normalizer::init() {
michael@0 70 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 71 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
michael@0 72 if(fOptions&UNORM_UNICODE_3_2) {
michael@0 73 delete fFilteredNorm2;
michael@0 74 fNorm2=fFilteredNorm2=
michael@0 75 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
michael@0 76 }
michael@0 77 if(U_FAILURE(errorCode)) {
michael@0 78 errorCode=U_ZERO_ERROR;
michael@0 79 fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
michael@0 80 }
michael@0 81 }
michael@0 82
michael@0 83 Normalizer::~Normalizer()
michael@0 84 {
michael@0 85 delete fFilteredNorm2;
michael@0 86 delete text;
michael@0 87 }
michael@0 88
michael@0 89 Normalizer*
michael@0 90 Normalizer::clone() const
michael@0 91 {
michael@0 92 return new Normalizer(*this);
michael@0 93 }
michael@0 94
michael@0 95 /**
michael@0 96 * Generates a hash code for this iterator.
michael@0 97 */
michael@0 98 int32_t Normalizer::hashCode() const
michael@0 99 {
michael@0 100 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
michael@0 101 }
michael@0 102
michael@0 103 UBool Normalizer::operator==(const Normalizer& that) const
michael@0 104 {
michael@0 105 return
michael@0 106 this==&that ||
michael@0 107 (fUMode==that.fUMode &&
michael@0 108 fOptions==that.fOptions &&
michael@0 109 *text==*that.text &&
michael@0 110 buffer==that.buffer &&
michael@0 111 bufferPos==that.bufferPos &&
michael@0 112 nextIndex==that.nextIndex);
michael@0 113 }
michael@0 114
michael@0 115 //-------------------------------------------------------------------------
michael@0 116 // Static utility methods
michael@0 117 //-------------------------------------------------------------------------
michael@0 118
michael@0 119 void U_EXPORT2
michael@0 120 Normalizer::normalize(const UnicodeString& source,
michael@0 121 UNormalizationMode mode, int32_t options,
michael@0 122 UnicodeString& result,
michael@0 123 UErrorCode &status) {
michael@0 124 if(source.isBogus() || U_FAILURE(status)) {
michael@0 125 result.setToBogus();
michael@0 126 if(U_SUCCESS(status)) {
michael@0 127 status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 128 }
michael@0 129 } else {
michael@0 130 UnicodeString localDest;
michael@0 131 UnicodeString *dest;
michael@0 132
michael@0 133 if(&source!=&result) {
michael@0 134 dest=&result;
michael@0 135 } else {
michael@0 136 // the source and result strings are the same object, use a temporary one
michael@0 137 dest=&localDest;
michael@0 138 }
michael@0 139 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
michael@0 140 if(U_SUCCESS(status)) {
michael@0 141 if(options&UNORM_UNICODE_3_2) {
michael@0 142 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
michael@0 143 normalize(source, *dest, status);
michael@0 144 } else {
michael@0 145 n2->normalize(source, *dest, status);
michael@0 146 }
michael@0 147 }
michael@0 148 if(dest==&localDest && U_SUCCESS(status)) {
michael@0 149 result=*dest;
michael@0 150 }
michael@0 151 }
michael@0 152 }
michael@0 153
michael@0 154 void U_EXPORT2
michael@0 155 Normalizer::compose(const UnicodeString& source,
michael@0 156 UBool compat, int32_t options,
michael@0 157 UnicodeString& result,
michael@0 158 UErrorCode &status) {
michael@0 159 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
michael@0 160 }
michael@0 161
michael@0 162 void U_EXPORT2
michael@0 163 Normalizer::decompose(const UnicodeString& source,
michael@0 164 UBool compat, int32_t options,
michael@0 165 UnicodeString& result,
michael@0 166 UErrorCode &status) {
michael@0 167 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
michael@0 168 }
michael@0 169
michael@0 170 UNormalizationCheckResult
michael@0 171 Normalizer::quickCheck(const UnicodeString& source,
michael@0 172 UNormalizationMode mode, int32_t options,
michael@0 173 UErrorCode &status) {
michael@0 174 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
michael@0 175 if(U_SUCCESS(status)) {
michael@0 176 if(options&UNORM_UNICODE_3_2) {
michael@0 177 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
michael@0 178 quickCheck(source, status);
michael@0 179 } else {
michael@0 180 return n2->quickCheck(source, status);
michael@0 181 }
michael@0 182 } else {
michael@0 183 return UNORM_MAYBE;
michael@0 184 }
michael@0 185 }
michael@0 186
michael@0 187 UBool
michael@0 188 Normalizer::isNormalized(const UnicodeString& source,
michael@0 189 UNormalizationMode mode, int32_t options,
michael@0 190 UErrorCode &status) {
michael@0 191 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
michael@0 192 if(U_SUCCESS(status)) {
michael@0 193 if(options&UNORM_UNICODE_3_2) {
michael@0 194 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
michael@0 195 isNormalized(source, status);
michael@0 196 } else {
michael@0 197 return n2->isNormalized(source, status);
michael@0 198 }
michael@0 199 } else {
michael@0 200 return FALSE;
michael@0 201 }
michael@0 202 }
michael@0 203
michael@0 204 UnicodeString & U_EXPORT2
michael@0 205 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
michael@0 206 UnicodeString &result,
michael@0 207 UNormalizationMode mode, int32_t options,
michael@0 208 UErrorCode &errorCode) {
michael@0 209 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
michael@0 210 result.setToBogus();
michael@0 211 if(U_SUCCESS(errorCode)) {
michael@0 212 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 213 }
michael@0 214 } else {
michael@0 215 UnicodeString localDest;
michael@0 216 UnicodeString *dest;
michael@0 217
michael@0 218 if(&right!=&result) {
michael@0 219 dest=&result;
michael@0 220 } else {
michael@0 221 // the right and result strings are the same object, use a temporary one
michael@0 222 dest=&localDest;
michael@0 223 }
michael@0 224 *dest=left;
michael@0 225 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
michael@0 226 if(U_SUCCESS(errorCode)) {
michael@0 227 if(options&UNORM_UNICODE_3_2) {
michael@0 228 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
michael@0 229 append(*dest, right, errorCode);
michael@0 230 } else {
michael@0 231 n2->append(*dest, right, errorCode);
michael@0 232 }
michael@0 233 }
michael@0 234 if(dest==&localDest && U_SUCCESS(errorCode)) {
michael@0 235 result=*dest;
michael@0 236 }
michael@0 237 }
michael@0 238 return result;
michael@0 239 }
michael@0 240
michael@0 241 //-------------------------------------------------------------------------
michael@0 242 // Iteration API
michael@0 243 //-------------------------------------------------------------------------
michael@0 244
michael@0 245 /**
michael@0 246 * Return the current character in the normalized text.
michael@0 247 */
michael@0 248 UChar32 Normalizer::current() {
michael@0 249 if(bufferPos<buffer.length() || nextNormalize()) {
michael@0 250 return buffer.char32At(bufferPos);
michael@0 251 } else {
michael@0 252 return DONE;
michael@0 253 }
michael@0 254 }
michael@0 255
michael@0 256 /**
michael@0 257 * Return the next character in the normalized text and advance
michael@0 258 * the iteration position by one. If the end
michael@0 259 * of the text has already been reached, {@link #DONE} is returned.
michael@0 260 */
michael@0 261 UChar32 Normalizer::next() {
michael@0 262 if(bufferPos<buffer.length() || nextNormalize()) {
michael@0 263 UChar32 c=buffer.char32At(bufferPos);
michael@0 264 bufferPos+=U16_LENGTH(c);
michael@0 265 return c;
michael@0 266 } else {
michael@0 267 return DONE;
michael@0 268 }
michael@0 269 }
michael@0 270
michael@0 271 /**
michael@0 272 * Return the previous character in the normalized text and decrement
michael@0 273 * the iteration position by one. If the beginning
michael@0 274 * of the text has already been reached, {@link #DONE} is returned.
michael@0 275 */
michael@0 276 UChar32 Normalizer::previous() {
michael@0 277 if(bufferPos>0 || previousNormalize()) {
michael@0 278 UChar32 c=buffer.char32At(bufferPos-1);
michael@0 279 bufferPos-=U16_LENGTH(c);
michael@0 280 return c;
michael@0 281 } else {
michael@0 282 return DONE;
michael@0 283 }
michael@0 284 }
michael@0 285
michael@0 286 void Normalizer::reset() {
michael@0 287 currentIndex=nextIndex=text->setToStart();
michael@0 288 clearBuffer();
michael@0 289 }
michael@0 290
michael@0 291 void
michael@0 292 Normalizer::setIndexOnly(int32_t index) {
michael@0 293 text->setIndex(index); // pins index
michael@0 294 currentIndex=nextIndex=text->getIndex();
michael@0 295 clearBuffer();
michael@0 296 }
michael@0 297
michael@0 298 /**
michael@0 299 * Return the first character in the normalized text. This resets
michael@0 300 * the <tt>Normalizer's</tt> position to the beginning of the text.
michael@0 301 */
michael@0 302 UChar32 Normalizer::first() {
michael@0 303 reset();
michael@0 304 return next();
michael@0 305 }
michael@0 306
michael@0 307 /**
michael@0 308 * Return the last character in the normalized text. This resets
michael@0 309 * the <tt>Normalizer's</tt> position to be just before the
michael@0 310 * the input text corresponding to that normalized character.
michael@0 311 */
michael@0 312 UChar32 Normalizer::last() {
michael@0 313 currentIndex=nextIndex=text->setToEnd();
michael@0 314 clearBuffer();
michael@0 315 return previous();
michael@0 316 }
michael@0 317
michael@0 318 /**
michael@0 319 * Retrieve the current iteration position in the input text that is
michael@0 320 * being normalized. This method is useful in applications such as
michael@0 321 * searching, where you need to be able to determine the position in
michael@0 322 * the input text that corresponds to a given normalized output character.
michael@0 323 * <p>
michael@0 324 * <b>Note:</b> This method sets the position in the <em>input</em>, while
michael@0 325 * {@link #next} and {@link #previous} iterate through characters in the
michael@0 326 * <em>output</em>. This means that there is not necessarily a one-to-one
michael@0 327 * correspondence between characters returned by <tt>next</tt> and
michael@0 328 * <tt>previous</tt> and the indices passed to and returned from
michael@0 329 * <tt>setIndex</tt> and {@link #getIndex}.
michael@0 330 *
michael@0 331 */
michael@0 332 int32_t Normalizer::getIndex() const {
michael@0 333 if(bufferPos<buffer.length()) {
michael@0 334 return currentIndex;
michael@0 335 } else {
michael@0 336 return nextIndex;
michael@0 337 }
michael@0 338 }
michael@0 339
michael@0 340 /**
michael@0 341 * Retrieve the index of the start of the input text. This is the begin index
michael@0 342 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
michael@0 343 * over which this <tt>Normalizer</tt> is iterating
michael@0 344 */
michael@0 345 int32_t Normalizer::startIndex() const {
michael@0 346 return text->startIndex();
michael@0 347 }
michael@0 348
michael@0 349 /**
michael@0 350 * Retrieve the index of the end of the input text. This is the end index
michael@0 351 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
michael@0 352 * over which this <tt>Normalizer</tt> is iterating
michael@0 353 */
michael@0 354 int32_t Normalizer::endIndex() const {
michael@0 355 return text->endIndex();
michael@0 356 }
michael@0 357
michael@0 358 //-------------------------------------------------------------------------
michael@0 359 // Property access methods
michael@0 360 //-------------------------------------------------------------------------
michael@0 361
michael@0 362 void
michael@0 363 Normalizer::setMode(UNormalizationMode newMode)
michael@0 364 {
michael@0 365 fUMode = newMode;
michael@0 366 init();
michael@0 367 }
michael@0 368
michael@0 369 UNormalizationMode
michael@0 370 Normalizer::getUMode() const
michael@0 371 {
michael@0 372 return fUMode;
michael@0 373 }
michael@0 374
michael@0 375 void
michael@0 376 Normalizer::setOption(int32_t option,
michael@0 377 UBool value)
michael@0 378 {
michael@0 379 if (value) {
michael@0 380 fOptions |= option;
michael@0 381 } else {
michael@0 382 fOptions &= (~option);
michael@0 383 }
michael@0 384 init();
michael@0 385 }
michael@0 386
michael@0 387 UBool
michael@0 388 Normalizer::getOption(int32_t option) const
michael@0 389 {
michael@0 390 return (fOptions & option) != 0;
michael@0 391 }
michael@0 392
michael@0 393 /**
michael@0 394 * Set the input text over which this <tt>Normalizer</tt> will iterate.
michael@0 395 * The iteration position is set to the beginning of the input text.
michael@0 396 */
michael@0 397 void
michael@0 398 Normalizer::setText(const UnicodeString& newText,
michael@0 399 UErrorCode &status)
michael@0 400 {
michael@0 401 if (U_FAILURE(status)) {
michael@0 402 return;
michael@0 403 }
michael@0 404 CharacterIterator *newIter = new StringCharacterIterator(newText);
michael@0 405 if (newIter == NULL) {
michael@0 406 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 407 return;
michael@0 408 }
michael@0 409 delete text;
michael@0 410 text = newIter;
michael@0 411 reset();
michael@0 412 }
michael@0 413
michael@0 414 /**
michael@0 415 * Set the input text over which this <tt>Normalizer</tt> will iterate.
michael@0 416 * The iteration position is set to the beginning of the string.
michael@0 417 */
michael@0 418 void
michael@0 419 Normalizer::setText(const CharacterIterator& newText,
michael@0 420 UErrorCode &status)
michael@0 421 {
michael@0 422 if (U_FAILURE(status)) {
michael@0 423 return;
michael@0 424 }
michael@0 425 CharacterIterator *newIter = newText.clone();
michael@0 426 if (newIter == NULL) {
michael@0 427 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 428 return;
michael@0 429 }
michael@0 430 delete text;
michael@0 431 text = newIter;
michael@0 432 reset();
michael@0 433 }
michael@0 434
michael@0 435 void
michael@0 436 Normalizer::setText(const UChar* newText,
michael@0 437 int32_t length,
michael@0 438 UErrorCode &status)
michael@0 439 {
michael@0 440 if (U_FAILURE(status)) {
michael@0 441 return;
michael@0 442 }
michael@0 443 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
michael@0 444 if (newIter == NULL) {
michael@0 445 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 446 return;
michael@0 447 }
michael@0 448 delete text;
michael@0 449 text = newIter;
michael@0 450 reset();
michael@0 451 }
michael@0 452
michael@0 453 /**
michael@0 454 * Copies the text under iteration into the UnicodeString referred to by "result".
michael@0 455 * @param result Receives a copy of the text under iteration.
michael@0 456 */
michael@0 457 void
michael@0 458 Normalizer::getText(UnicodeString& result)
michael@0 459 {
michael@0 460 text->getText(result);
michael@0 461 }
michael@0 462
michael@0 463 //-------------------------------------------------------------------------
michael@0 464 // Private utility methods
michael@0 465 //-------------------------------------------------------------------------
michael@0 466
michael@0 467 void Normalizer::clearBuffer() {
michael@0 468 buffer.remove();
michael@0 469 bufferPos=0;
michael@0 470 }
michael@0 471
michael@0 472 UBool
michael@0 473 Normalizer::nextNormalize() {
michael@0 474 clearBuffer();
michael@0 475 currentIndex=nextIndex;
michael@0 476 text->setIndex(nextIndex);
michael@0 477 if(!text->hasNext()) {
michael@0 478 return FALSE;
michael@0 479 }
michael@0 480 // Skip at least one character so we make progress.
michael@0 481 UnicodeString segment(text->next32PostInc());
michael@0 482 while(text->hasNext()) {
michael@0 483 UChar32 c;
michael@0 484 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
michael@0 485 text->move32(-1, CharacterIterator::kCurrent);
michael@0 486 break;
michael@0 487 }
michael@0 488 segment.append(c);
michael@0 489 }
michael@0 490 nextIndex=text->getIndex();
michael@0 491 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 492 fNorm2->normalize(segment, buffer, errorCode);
michael@0 493 return U_SUCCESS(errorCode) && !buffer.isEmpty();
michael@0 494 }
michael@0 495
michael@0 496 UBool
michael@0 497 Normalizer::previousNormalize() {
michael@0 498 clearBuffer();
michael@0 499 nextIndex=currentIndex;
michael@0 500 text->setIndex(currentIndex);
michael@0 501 if(!text->hasPrevious()) {
michael@0 502 return FALSE;
michael@0 503 }
michael@0 504 UnicodeString segment;
michael@0 505 while(text->hasPrevious()) {
michael@0 506 UChar32 c=text->previous32();
michael@0 507 segment.insert(0, c);
michael@0 508 if(fNorm2->hasBoundaryBefore(c)) {
michael@0 509 break;
michael@0 510 }
michael@0 511 }
michael@0 512 currentIndex=text->getIndex();
michael@0 513 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 514 fNorm2->normalize(segment, buffer, errorCode);
michael@0 515 bufferPos=buffer.length();
michael@0 516 return U_SUCCESS(errorCode) && !buffer.isEmpty();
michael@0 517 }
michael@0 518
michael@0 519 U_NAMESPACE_END
michael@0 520
michael@0 521 #endif /* #if !UCONFIG_NO_NORMALIZATION */

mercurial