The Tor Browser: comparison intl/icu/source/common/normlzr.cpp

--1:000000000000
+:bc03b8ad4bee
+/*
+*************************************************************************
+* COPYRIGHT:
+* Copyright (c) 1996-2012, International Business Machines Corporation and
+* others. All Rights Reserved.
+*************************************************************************
+*/
+#include "unicode/utypes.h"
+#if !UCONFIG_NO_NORMALIZATION
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/chariter.h"
+#include "unicode/schriter.h"
+#include "unicode/uchriter.h"
+#include "unicode/normlzr.h"
+#include "unicode/utf16.h"
+#include "cmemory.h"
+#include "normalizer2impl.h"
+#include "uprops.h"  // for uniset_getUnicode32Instance()
+U_NAMESPACE_BEGIN
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
+//-------------------------------------------------------------------------
+// Constructors and other boilerplate
+//-------------------------------------------------------------------------
+/**
+ * Builds a Normalizer over the contents of a UnicodeString.
+ * The string is wrapped in a newly allocated StringCharacterIterator that
+ * this object deletes in its destructor.  Options start out as 0, both
+ * input indexes start at 0 and the output buffer starts empty.
+ *
+ * @param str   text to iterate over
+ * @param mode  normalization mode handed to init()
+ */
+Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
+    UObject(),
+    fFilteredNorm2(NULL),
+    fNorm2(NULL),
+    fUMode(mode),
+    fOptions(0),
+    text(new StringCharacterIterator(str)),
+    currentIndex(0),
+    nextIndex(0),
+    buffer(),
+    bufferPos(0)
+{
+    // Pick the Normalizer2 implementation matching fUMode/fOptions.
+    init();
+}
+/**
+ * Builds a Normalizer over a UChar array.
+ * The array is wrapped in a newly allocated UCharCharacterIterator that
+ * this object deletes in its destructor.  Options start out as 0, both
+ * input indexes start at 0 and the output buffer starts empty.
+ *
+ * @param str     pointer to the text, passed to UCharCharacterIterator
+ * @param length  length value passed to UCharCharacterIterator
+ * @param mode    normalization mode handed to init()
+ */
+Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
+    UObject(),
+    fFilteredNorm2(NULL),
+    fNorm2(NULL),
+    fUMode(mode),
+    fOptions(0),
+    text(new UCharCharacterIterator(str, length)),
+    currentIndex(0),
+    nextIndex(0),
+    buffer(),
+    bufferPos(0)
+{
+    // Pick the Normalizer2 implementation matching fUMode/fOptions.
+    init();
+}
+/**
+ * Builds a Normalizer over a clone of the given CharacterIterator.
+ * The clone is deleted by this object's destructor; the caller's iterator
+ * is only read.  Options start out as 0, both input indexes start at 0
+ * and the output buffer starts empty.
+ *
+ * @param iter  iterator to clone
+ * @param mode  normalization mode handed to init()
+ */
+Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
+    UObject(),
+    fFilteredNorm2(NULL),
+    fNorm2(NULL),
+    fUMode(mode),
+    fOptions(0),
+    text(iter.clone()),
+    currentIndex(0),
+    nextIndex(0),
+    buffer(),
+    bufferPos(0)
+{
+    // Pick the Normalizer2 implementation matching fUMode/fOptions.
+    init();
+}
+/**
+ * Copy constructor.
+ * Mode, options, indexes, buffer and buffer position are copied, and the
+ * text iterator is cloned.  The normalizer pointers are not copied: they
+ * start as NULL and init() sets them up again from the copied mode and
+ * options, so the two objects never share a filtered normalizer.
+ *
+ * @param copy  object to duplicate
+ */
+Normalizer::Normalizer(const Normalizer &copy) :
+    UObject(copy),
+    fFilteredNorm2(NULL),
+    fNorm2(NULL),
+    fUMode(copy.fUMode),
+    fOptions(copy.fOptions),
+    text(copy.text->clone()),
+    currentIndex(copy.currentIndex),
+    nextIndex(copy.nextIndex),
+    buffer(copy.buffer),
+    bufferPos(copy.bufferPos)
+{
+    init();
+}
+/**
+ * (Re)selects the Normalizer2 implementation used by the iteration API,
+ * based on the current fUMode and fOptions.
+ *
+ * - The base instance comes from Normalizer2Factory::getInstance().
+ * - If UNORM_UNICODE_3_2 is set, the base instance is wrapped in a
+ *   FilteredNormalizer2 restricted by uniset_getUnicode32Instance(); the
+ *   wrapper is owned by this object through fFilteredNorm2.
+ * - If anything fails, fNorm2 falls back to
+ *   Normalizer2Factory::getNoopInstance().
+ *
+ * Errors are not reported to the caller; errorCode is local and only
+ * triggers the fallback.
+ */
+void
+Normalizer::init() {
+    UErrorCode errorCode=U_ZERO_ERROR;
+
+    // fNorm2 is reassigned below, so nothing refers to a previously built
+    // filter any more.  Release it up front, including when the option has
+    // since been cleared.
+    delete fFilteredNorm2;
+    fFilteredNorm2=NULL;
+
+    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
+
+    if(U_SUCCESS(errorCode) && (fOptions&UNORM_UNICODE_3_2)) {
+        const UnicodeSet *unicode32=uniset_getUnicode32Instance(errorCode);
+        // Dereference only after both lookups succeeded and gave usable pointers.
+        if(U_SUCCESS(errorCode) && fNorm2!=NULL && unicode32!=NULL) {
+            fFilteredNorm2=new FilteredNormalizer2(*fNorm2, *unicode32);
+        }
+        if(fFilteredNorm2!=NULL) {
+            fNorm2=fFilteredNorm2;
+        } else if(U_SUCCESS(errorCode)) {
+            // Allocation failed, or a lookup returned NULL without an error.
+            // Force the no-op fallback so fNorm2 is not left unusable.
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+
+    if(U_FAILURE(errorCode)) {
+        errorCode=U_ZERO_ERROR;
+        fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
+    }
+}
+/**
+ * Destructor.  Releases the two objects this Normalizer owns: the text
+ * iterator and the filtered normalizer, if one was created by init().
+ */
+Normalizer::~Normalizer()
+{
+    delete text;
+    delete fFilteredNorm2;
+}
+/**
+ * Returns a heap-allocated copy of this object made with the copy
+ * constructor.
+ */
+Normalizer*
+Normalizer::clone() const
+{
+    Normalizer *duplicate=new Normalizer(*this);
+    return duplicate;
+}
+/**
+ * Generates a hash code for this iterator by summing the text iterator's
+ * hash, the mode, the options, the buffer's hash, the buffer position and
+ * both input indexes.
+ */
+int32_t Normalizer::hashCode() const
+{
+    int32_t hash=text->hashCode();
+    hash+=fUMode;
+    hash+=fOptions;
+    hash+=buffer.hashCode();
+    hash+=bufferPos;
+    hash+=currentIndex;
+    hash+=nextIndex;
+    return hash;
+}
+/**
+ * Equality test.  An object always equals itself; otherwise two objects
+ * are equal when mode, options, text iterator, buffer, buffer position
+ * and nextIndex all match.
+ */
+UBool Normalizer::operator==(const Normalizer& that) const
+{
+    if(this==&that) {
+        return TRUE;
+    }
+    return
+        fUMode==that.fUMode &&
+        fOptions==that.fOptions &&
+        *text==*that.text &&
+        buffer==that.buffer &&
+        bufferPos==that.bufferPos &&
+        nextIndex==that.nextIndex;
+}
+//-------------------------------------------------------------------------
+// Static utility methods
+//-------------------------------------------------------------------------
+/**
+ * Normalizes source into result using the given mode.
+ *
+ * If status already indicates failure, result is set to bogus and status
+ * is left alone.  If source is bogus, result is set to bogus and status
+ * becomes U_ILLEGAL_ARGUMENT_ERROR.  When source and result are the same
+ * object, the output is built in a temporary and copied into result only
+ * on success.
+ *
+ * @param source   text to normalize
+ * @param mode     normalization mode used to look up the Normalizer2
+ * @param options  if UNORM_UNICODE_3_2 is set, the normalizer is wrapped
+ *                 in a FilteredNormalizer2 using uniset_getUnicode32Instance()
+ * @param result   receives the normalized text
+ * @param status   in/out error code
+ */
+void U_EXPORT2
+Normalizer::normalize(const UnicodeString& source,
+                      UNormalizationMode mode, int32_t options,
+                      UnicodeString& result,
+                      UErrorCode &status) {
+    if(U_FAILURE(status)) {
+        result.setToBogus();
+        return;
+    }
+    if(source.isBogus()) {
+        result.setToBogus();
+        status=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    // Write straight into result unless it aliases the input.
+    const UBool aliased=(&source==&result);
+    UnicodeString scratch;
+    UnicodeString &target= aliased ? scratch : result;
+
+    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+    if(U_SUCCESS(status)) {
+        if(options&UNORM_UNICODE_3_2) {
+            FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
+            filtered.normalize(source, target, status);
+        } else {
+            n2->normalize(source, target, status);
+        }
+    }
+
+    if(aliased && U_SUCCESS(status)) {
+        result=scratch;
+    }
+}
+/**
+ * Convenience wrapper around normalize(): uses UNORM_NFKC when compat is
+ * true and UNORM_NFC otherwise.
+ */
+void U_EXPORT2
+Normalizer::compose(const UnicodeString& source,
+                    UBool compat, int32_t options,
+                    UnicodeString& result,
+                    UErrorCode &status) {
+    const UNormalizationMode mode= compat ? UNORM_NFKC : UNORM_NFC;
+    normalize(source, mode, options, result, status);
+}
+/**
+ * Convenience wrapper around normalize(): uses UNORM_NFKD when compat is
+ * true and UNORM_NFD otherwise.
+ */
+void U_EXPORT2
+Normalizer::decompose(const UnicodeString& source,
+                      UBool compat, int32_t options,
+                      UnicodeString& result,
+                      UErrorCode &status) {
+    const UNormalizationMode mode= compat ? UNORM_NFKD : UNORM_NFD;
+    normalize(source, mode, options, result, status);
+}
+/**
+ * Runs the Normalizer2 quick check for the given mode on source.
+ * Returns UNORM_MAYBE if the Normalizer2 instance cannot be obtained.
+ * With UNORM_UNICODE_3_2 set in options, the check goes through a
+ * FilteredNormalizer2 built on uniset_getUnicode32Instance().
+ */
+UNormalizationCheckResult
+Normalizer::quickCheck(const UnicodeString& source,
+                       UNormalizationMode mode, int32_t options,
+                       UErrorCode &status) {
+    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+    if(U_FAILURE(status)) {
+        return UNORM_MAYBE;
+    }
+    if(options&UNORM_UNICODE_3_2) {
+        FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
+        return filtered.quickCheck(source, status);
+    }
+    return n2->quickCheck(source, status);
+}
+/**
+ * Asks the Normalizer2 for the given mode whether source is normalized.
+ * Returns FALSE if the Normalizer2 instance cannot be obtained.
+ * With UNORM_UNICODE_3_2 set in options, the test goes through a
+ * FilteredNormalizer2 built on uniset_getUnicode32Instance().
+ */
+UBool
+Normalizer::isNormalized(const UnicodeString& source,
+                         UNormalizationMode mode, int32_t options,
+                         UErrorCode &status) {
+    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+    if(U_FAILURE(status)) {
+        return FALSE;
+    }
+    if(options&UNORM_UNICODE_3_2) {
+        FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
+        return filtered.isNormalized(source, status);
+    }
+    return n2->isNormalized(source, status);
+}
+/**
+ * Copies left into result and then appends right through
+ * Normalizer2::append() for the given mode.
+ *
+ * If errorCode already indicates failure, result is set to bogus.  If
+ * either input is bogus, result is set to bogus and errorCode becomes
+ * U_ILLEGAL_ARGUMENT_ERROR.  When right and result are the same object,
+ * the output is built in a temporary and copied into result only on
+ * success.
+ *
+ * @param left       first string
+ * @param right      second string
+ * @param result     receives the concatenation
+ * @param mode       normalization mode used to look up the Normalizer2
+ * @param options    if UNORM_UNICODE_3_2 is set, a FilteredNormalizer2
+ *                   built on uniset_getUnicode32Instance() is used
+ * @param errorCode  in/out error code
+ * @return result
+ */
+UnicodeString & U_EXPORT2
+Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
+                        UnicodeString &result,
+                        UNormalizationMode mode, int32_t options,
+                        UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        result.setToBogus();
+        return result;
+    }
+    if(left.isBogus() || right.isBogus()) {
+        result.setToBogus();
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return result;
+    }
+
+    // right must stay intact while it is appended, so do not overwrite it
+    // when it is also the output object.
+    const UBool aliased=(&right==&result);
+    UnicodeString scratch;
+    UnicodeString &target= aliased ? scratch : result;
+    target=left;
+
+    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
+    if(U_SUCCESS(errorCode)) {
+        if(options&UNORM_UNICODE_3_2) {
+            FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(errorCode));
+            filtered.append(target, right, errorCode);
+        } else {
+            n2->append(target, right, errorCode);
+        }
+    }
+
+    if(aliased && U_SUCCESS(errorCode)) {
+        result=scratch;
+    }
+    return result;
+}
+//-------------------------------------------------------------------------
+// Iteration API
+//-------------------------------------------------------------------------
+/**
+ * Returns the code point at the current position in the normalized text
+ * without advancing.  If the buffer is exhausted, the next segment is
+ * normalized first; DONE is returned when that yields nothing.
+ */
+UChar32 Normalizer::current() {
+    if(bufferPos>=buffer.length() && !nextNormalize()) {
+        return DONE;
+    }
+    return buffer.char32At(bufferPos);
+}
+/**
+ * Returns the code point at the current position in the normalized text
+ * and moves the buffer position past it.  If the buffer is exhausted, the
+ * next segment is normalized first; {@link #DONE} is returned when that
+ * yields nothing.
+ */
+UChar32 Normalizer::next() {
+    if(bufferPos>=buffer.length() && !nextNormalize()) {
+        return DONE;
+    }
+    const UChar32 codePoint=buffer.char32At(bufferPos);
+    // Step over one or two UTF-16 code units, depending on the code point.
+    bufferPos+=U16_LENGTH(codePoint);
+    return codePoint;
+}
+/**
+ * Returns the code point just before the current position in the
+ * normalized text and moves the buffer position back over it.  If the
+ * position is at the start of the buffer, the preceding segment is
+ * normalized first; {@link #DONE} is returned when that yields nothing.
+ */
+UChar32 Normalizer::previous() {
+    if(bufferPos<=0 && !previousNormalize()) {
+        return DONE;
+    }
+    const UChar32 codePoint=buffer.char32At(bufferPos-1);
+    // Step back over one or two UTF-16 code units, depending on the code point.
+    bufferPos-=U16_LENGTH(codePoint);
+    return codePoint;
+}
+/**
+ * Moves the text iterator to its start, sets both input indexes to that
+ * position and empties the output buffer.
+ */
+void Normalizer::reset() {
+    const int32_t start=text->setToStart();
+    nextIndex=start;
+    currentIndex=start;
+    clearBuffer();
+}
+/**
+ * Moves the text iterator to the given input index, sets both input
+ * indexes to the position the iterator actually ended up at, and empties
+ * the output buffer.
+ */
+void
+Normalizer::setIndexOnly(int32_t index) {
+    text->setIndex(index);  // pins index
+    const int32_t pinned=text->getIndex();
+    nextIndex=pinned;
+    currentIndex=pinned;
+    clearBuffer();
+}
+/**
+ * Resets the <tt>Normalizer</tt> to the beginning of the text and returns
+ * the first character of the normalized text, as produced by next().
+ */
+UChar32 Normalizer::first() {
+    reset();
+    const UChar32 firstChar=next();
+    return firstChar;
+}
+/**
+ * Moves to the end of the input text, empties the output buffer and
+ * returns the last character of the normalized text, as produced by
+ * previous().
+ */
+UChar32 Normalizer::last() {
+    const int32_t end=text->setToEnd();
+    nextIndex=end;
+    currentIndex=end;
+    clearBuffer();
+    const UChar32 lastChar=previous();
+    return lastChar;
+}
+/**
+ * Retrieves the current iteration position in the input text that is
+ * being normalized.
+ * <p>
+ * While unread characters remain in the output buffer this is
+ * currentIndex; once the buffer has been consumed it is nextIndex.
+ * <p>
+ * <b>Note:</b> The value is a position in the <em>input</em>, while
+ * {@link #next} and {@link #previous} iterate through characters in the
+ * <em>output</em>, so there is not necessarily a one-to-one
+ * correspondence between the characters they return and this index.
+ */
+int32_t Normalizer::getIndex() const {
+    return bufferPos<buffer.length() ? currentIndex : nextIndex;
+}
+/**
+ * Retrieves the start index of the input text, as reported by the
+ * underlying <tt>CharacterIterator</tt>.
+ */
+int32_t Normalizer::startIndex() const {
+    const int32_t begin=text->startIndex();
+    return begin;
+}
+/**
+ * Retrieves the end index of the input text, as reported by the
+ * underlying <tt>CharacterIterator</tt>.
+ */
+int32_t Normalizer::endIndex() const {
+    const int32_t end=text->endIndex();
+    return end;
+}
+//-------------------------------------------------------------------------
+// Property access methods
+//-------------------------------------------------------------------------
+/**
+ * Stores a new normalization mode and re-runs init() so that the matching
+ * Normalizer2 implementation is selected.
+ */
+void
+Normalizer::setMode(UNormalizationMode newMode)
+{
+    this->fUMode=newMode;
+    this->init();
+}
+/**
+ * Returns the normalization mode currently stored in this object.
+ */
+UNormalizationMode
+Normalizer::getUMode() const
+{
+    return this->fUMode;
+}
+/**
+ * Sets or clears the given option bit(s) and re-runs init() so that the
+ * Normalizer2 implementation reflects the new options.
+ *
+ * @param option  bit mask of the option(s) to change
+ * @param value   true to set the bits, false to clear them
+ */
+void
+Normalizer::setOption(int32_t option,
+                      UBool value)
+{
+    fOptions= value ? (fOptions | option) : (fOptions & ~option);
+    init();
+}
+/**
+ * Reports whether any of the given option bit(s) is currently set.
+ */
+UBool
+Normalizer::getOption(int32_t option) const
+{
+    const int32_t selected=fOptions & option;
+    return selected != 0;
+}
+/**
+ * Replaces the input text with the contents of a UnicodeString and resets
+ * the iteration position to the beginning of the new text.
+ * Does nothing if status already indicates failure.  If the new iterator
+ * cannot be allocated, status becomes U_MEMORY_ALLOCATION_ERROR and the
+ * existing text is kept.
+ */
+void
+Normalizer::setText(const UnicodeString& newText,
+                    UErrorCode &status)
+{
+    if (U_SUCCESS(status)) {
+        CharacterIterator *replacement = new StringCharacterIterator(newText);
+        if (replacement != NULL) {
+            // Swap in the new iterator only once it exists.
+            delete text;
+            text = replacement;
+            reset();
+        } else {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+}
+/**
+ * Replaces the input text with a clone of the given CharacterIterator and
+ * resets the iteration position to the beginning of the new text.
+ * Does nothing if status already indicates failure.  If clone() returns
+ * NULL, status becomes U_MEMORY_ALLOCATION_ERROR and the existing text is
+ * kept.
+ */
+void
+Normalizer::setText(const CharacterIterator& newText,
+                    UErrorCode &status)
+{
+    if (U_SUCCESS(status)) {
+        CharacterIterator *replacement = newText.clone();
+        if (replacement != NULL) {
+            // Swap in the new iterator only once it exists.
+            delete text;
+            text = replacement;
+            reset();
+        } else {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+}
+/**
+ * Replaces the input text with a UChar array, wrapped in a new
+ * UCharCharacterIterator, and resets the iteration position to the
+ * beginning of the new text.
+ * Does nothing if status already indicates failure.  If the new iterator
+ * cannot be allocated, status becomes U_MEMORY_ALLOCATION_ERROR and the
+ * existing text is kept.
+ */
+void
+Normalizer::setText(const UChar* newText,
+                    int32_t length,
+                    UErrorCode &status)
+{
+    if (U_SUCCESS(status)) {
+        CharacterIterator *replacement = new UCharCharacterIterator(newText, length);
+        if (replacement != NULL) {
+            // Swap in the new iterator only once it exists.
+            delete text;
+            text = replacement;
+            reset();
+        } else {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+}
+/**
+ * Asks the underlying text iterator to copy the text under iteration into
+ * the UnicodeString referred to by "result".
+ * @param result Receives a copy of the text under iteration.
+ */
+void
+Normalizer::getText(UnicodeString&  result)
+{
+    CharacterIterator &source=*text;
+    source.getText(result);
+}
+//-------------------------------------------------------------------------
+// Private utility methods
+//-------------------------------------------------------------------------
+/**
+ * Empties the normalized-output buffer and rewinds the buffer position
+ * to 0.
+ */
+void Normalizer::clearBuffer() {
+    bufferPos=0;
+    buffer.remove();
+}
+/**
+ * Normalizes the input segment that starts at nextIndex into the buffer.
+ *
+ * The segment consists of one code point plus every following code point
+ * for which fNorm2->hasBoundaryBefore() is false.  Afterwards currentIndex
+ * is the start of the segment and nextIndex is the position just after
+ * it.
+ *
+ * @return TRUE if normalization succeeded and produced a non-empty
+ *         buffer; FALSE at the end of the text, on error, or when the
+ *         output is empty
+ */
+UBool
+Normalizer::nextNormalize() {
+    clearBuffer();
+    currentIndex=nextIndex;
+    text->setIndex(nextIndex);
+    if(!text->hasNext()) {
+        return FALSE;
+    }
+
+    // Always consume one code point so that the iteration makes progress.
+    UnicodeString segment(text->next32PostInc());
+    while(text->hasNext()) {
+        const UChar32 following=text->next32PostInc();
+        if(!fNorm2->hasBoundaryBefore(following)) {
+            segment.append(following);
+            continue;
+        }
+        // This code point starts the next segment: un-read it and stop.
+        text->move32(-1, CharacterIterator::kCurrent);
+        break;
+    }
+    nextIndex=text->getIndex();
+
+    UErrorCode errorCode=U_ZERO_ERROR;
+    fNorm2->normalize(segment, buffer, errorCode);
+    if(U_FAILURE(errorCode)) {
+        return FALSE;
+    }
+    return !buffer.isEmpty();
+}
+/**
+ * Normalizes the input segment that ends at currentIndex into the buffer
+ * and positions the buffer at its end, ready for backward iteration.
+ *
+ * Code points are collected backwards until one is found for which
+ * fNorm2->hasBoundaryBefore() is true (that code point is included) or
+ * the start of the text is reached.  Afterwards nextIndex is the old
+ * currentIndex and currentIndex is the start of the segment.
+ *
+ * @return TRUE if normalization succeeded and produced a non-empty
+ *         buffer; FALSE at the start of the text, on error, or when the
+ *         output is empty
+ */
+UBool
+Normalizer::previousNormalize() {
+    clearBuffer();
+    nextIndex=currentIndex;
+    text->setIndex(currentIndex);
+    if(!text->hasPrevious()) {
+        return FALSE;
+    }
+
+    UnicodeString segment;
+    UBool atBoundary=FALSE;
+    while(!atBoundary && text->hasPrevious()) {
+        const UChar32 preceding=text->previous32();
+        segment.insert(0, preceding);
+        atBoundary=fNorm2->hasBoundaryBefore(preceding);
+    }
+    currentIndex=text->getIndex();
+
+    UErrorCode errorCode=U_ZERO_ERROR;
+    fNorm2->normalize(segment, buffer, errorCode);
+    bufferPos=buffer.length();
+    if(U_FAILURE(errorCode)) {
+        return FALSE;
+    }
+    return !buffer.isEmpty();
+}
+U_NAMESPACE_END
+#endif /* #if !UCONFIG_NO_NORMALIZATION */

The Tor Browser / file comparison

comparison: intl/icu/source/common/normlzr.cpp

intl/icu/source/common/normlzr.cpp