intl/icu/source/i18n/nortrans.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/nortrans.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,176 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2001-2011, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   07/03/01    aliu        Creation.
    1.11 +**********************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "unicode/normalizer2.h"
    1.19 +#include "unicode/utf16.h"
    1.20 +#include "cstring.h"
    1.21 +#include "nortrans.h"
    1.22 +
    1.23 +U_NAMESPACE_BEGIN
    1.24 +
    1.25 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
    1.26 +
    1.27 +static inline Transliterator::Token cstrToken(const char *s) {
    1.28 +    return Transliterator::pointerToken((void *)s);
    1.29 +}
    1.30 +
    1.31 +/**
    1.32 + * System registration hook.
    1.33 + */
    1.34 +void NormalizationTransliterator::registerIDs() {
    1.35 +    // In the Token, the byte after the NUL is the UNormalization2Mode.
    1.36 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
    1.37 +                                     _create, cstrToken("nfc\0\0"));
    1.38 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
    1.39 +                                     _create, cstrToken("nfkc\0\0"));
    1.40 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
    1.41 +                                     _create, cstrToken("nfc\0\1"));
    1.42 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
    1.43 +                                     _create, cstrToken("nfkc\0\1"));
    1.44 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
    1.45 +                                     _create, cstrToken("nfc\0\2"));
    1.46 +    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
    1.47 +                                     _create, cstrToken("nfc\0\3"));
    1.48 +    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
    1.49 +                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
    1.50 +    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
    1.51 +                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
    1.52 +    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
    1.53 +                                            UNICODE_STRING_SIMPLE("NFD"), FALSE);
    1.54 +    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
    1.55 +                                            UNICODE_STRING_SIMPLE("FCD"), FALSE);
    1.56 +}
    1.57 +
    1.58 +/**
    1.59 + * Factory methods
    1.60 + */
    1.61 +Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
    1.62 +                                                     Token context) {
    1.63 +    const char *name = (const char *)context.pointer;
    1.64 +    UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
    1.65 +    UErrorCode errorCode = U_ZERO_ERROR;
    1.66 +    const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
    1.67 +    if(U_SUCCESS(errorCode)) {
    1.68 +        return new NormalizationTransliterator(ID, *norm2);
    1.69 +    } else {
    1.70 +        return NULL;
    1.71 +    }
    1.72 +}
    1.73 +
    1.74 +/**
    1.75 + * Constructs a transliterator.
    1.76 + */
    1.77 +NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
    1.78 +                                                         const Normalizer2 &norm2) :
    1.79 +    Transliterator(id, 0), fNorm2(norm2) {}
    1.80 +
    1.81 +/**
    1.82 + * Destructor.
    1.83 + */
    1.84 +NormalizationTransliterator::~NormalizationTransliterator() {
    1.85 +}
    1.86 +
    1.87 +/**
    1.88 + * Copy constructor.
    1.89 + */
    1.90 +NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
    1.91 +    Transliterator(o), fNorm2(o.fNorm2) {}
    1.92 +
    1.93 +/**
    1.94 + * Transliterator API.
    1.95 + */
    1.96 +Transliterator* NormalizationTransliterator::clone(void) const {
    1.97 +    return new NormalizationTransliterator(*this);
    1.98 +}
    1.99 +
   1.100 +/**
   1.101 + * Implements {@link Transliterator#handleTransliterate}.
   1.102 + */
   1.103 +void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
   1.104 +                                                      UBool isIncremental) const {
   1.105 +    // start and limit of the input range
   1.106 +    int32_t start = offsets.start;
   1.107 +    int32_t limit = offsets.limit;
   1.108 +    if(start >= limit) {
   1.109 +        return;
   1.110 +    }
   1.111 +
   1.112 +    /*
   1.113 +     * Normalize as short chunks at a time as possible even in
   1.114 +     * bulk mode, so that styled text is minimally disrupted.
   1.115 +     * In incremental mode, a chunk that ends with offsets.limit
   1.116 +     * must not be normalized.
   1.117 +     *
   1.118 +     * If it was known that the input text is not styled, then
   1.119 +     * a bulk mode normalization could look like this:
   1.120 +
   1.121 +    UnicodeString input, normalized;
   1.122 +    int32_t length = limit - start;
   1.123 +    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
   1.124 +    input.releaseBuffer(length);
   1.125 +
   1.126 +    UErrorCode status = U_ZERO_ERROR;
   1.127 +    fNorm2.normalize(input, normalized, status);
   1.128 +
   1.129 +    text.handleReplaceBetween(start, limit, normalized);
   1.130 +
   1.131 +    int32_t delta = normalized.length() - length;
   1.132 +    offsets.contextLimit += delta;
   1.133 +    offsets.limit += delta;
   1.134 +    offsets.start = limit + delta;
   1.135 +
   1.136 +     */
   1.137 +    UErrorCode errorCode = U_ZERO_ERROR;
   1.138 +    UnicodeString segment;
   1.139 +    UnicodeString normalized;
   1.140 +    UChar32 c = text.char32At(start);
   1.141 +    do {
   1.142 +        int32_t prev = start;
   1.143 +        // Skip at least one character so we make progress.
   1.144 +        // c holds the character at start.
   1.145 +        segment.remove();
   1.146 +        do {
   1.147 +            segment.append(c);
   1.148 +            start += U16_LENGTH(c);
   1.149 +        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
   1.150 +        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
   1.151 +            // stop in incremental mode when we reach the input limit
   1.152 +            // in case there are additional characters that could change the
   1.153 +            // normalization result
   1.154 +            start=prev;
   1.155 +            break;
   1.156 +        }
   1.157 +        fNorm2.normalize(segment, normalized, errorCode);
   1.158 +        if(U_FAILURE(errorCode)) {
   1.159 +            break;
   1.160 +        }
   1.161 +        if(segment != normalized) {
   1.162 +            // replace the input chunk with its normalized form
   1.163 +            text.handleReplaceBetween(prev, start, normalized);
   1.164 +
   1.165 +            // update all necessary indexes accordingly
   1.166 +            int32_t delta = normalized.length() - (start - prev);
   1.167 +            start += delta;
   1.168 +            limit += delta;
   1.169 +        }
   1.170 +    } while(start < limit);
   1.171 +
   1.172 +    offsets.start = start;
   1.173 +    offsets.contextLimit += limit - offsets.limit;
   1.174 +    offsets.limit = limit;
   1.175 +}
   1.176 +
   1.177 +U_NAMESPACE_END
   1.178 +
   1.179 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial