intl/icu/source/i18n/brktrans.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/brktrans.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,188 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2008-2010, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   05/11/2008  Andy Heninger  Port from Java
    1.11 +**********************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
    1.17 +
    1.18 +#include "unicode/unifilt.h"
    1.19 +#include "unicode/uchar.h"
    1.20 +#include "unicode/uniset.h"
    1.21 +#include "unicode/brkiter.h"
    1.22 +#include "brktrans.h"
    1.23 +#include "unicode/uchar.h"
    1.24 +#include "cmemory.h"
    1.25 +#include "uprops.h"
    1.26 +#include "uinvchar.h"
    1.27 +#include "util.h"
    1.28 +#include "uvectr32.h"
    1.29 +
    1.30 +U_NAMESPACE_BEGIN
    1.31 +
    1.32 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
    1.33 +
    1.34 +static const UChar SPACE       = 32;  // ' '
    1.35 +
    1.36 +
    1.37 +/**
    1.38 + * Constructs a transliterator with the default delimiters '{' and
    1.39 + * '}'.
    1.40 + */
    1.41 +BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
    1.42 +    Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
    1.43 +    fInsertion(SPACE) {
    1.44 +        bi = NULL;
    1.45 +        UErrorCode status = U_ZERO_ERROR;
    1.46 +        boundaries = new UVector32(status);
    1.47 +    }
    1.48 +
    1.49 +
    1.50 +/**
    1.51 + * Destructor.
    1.52 + */
    1.53 +BreakTransliterator::~BreakTransliterator() {
    1.54 +    delete bi;
    1.55 +    bi = NULL;
    1.56 +    delete boundaries;
    1.57 +    boundaries = NULL;
    1.58 +}
    1.59 +
    1.60 +/**
    1.61 + * Copy constructor.
    1.62 + */
    1.63 +BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
    1.64 +    Transliterator(o) {
    1.65 +        bi = NULL;
    1.66 +        if (o.bi != NULL) {
    1.67 +            bi = o.bi->clone();
    1.68 +        }
    1.69 +        fInsertion = o.fInsertion;
    1.70 +        UErrorCode status = U_ZERO_ERROR;
    1.71 +        boundaries = new UVector32(status);
    1.72 +    }
    1.73 +
    1.74 +
    1.75 +/**
    1.76 + * Transliterator API.
    1.77 + */
    1.78 +Transliterator* BreakTransliterator::clone(void) const {
    1.79 +    return new BreakTransliterator(*this);
    1.80 +}
    1.81 +
    1.82 +/**
    1.83 + * Implements {@link Transliterator#handleTransliterate}.
    1.84 + */
    1.85 +void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
    1.86 +                                                    UBool isIncremental ) const {
    1.87 +
    1.88 +        UErrorCode status = U_ZERO_ERROR;
    1.89 +        boundaries->removeAllElements();
    1.90 +        BreakTransliterator *nonConstThis = (BreakTransliterator *)this;
    1.91 +        nonConstThis->getBreakIterator(); // Lazy-create it if necessary
    1.92 +        UnicodeString sText = replaceableAsString(text);
    1.93 +        bi->setText(sText);
    1.94 +        bi->preceding(offsets.start);
    1.95 +
    1.96 +        // To make things much easier, we will stack the boundaries, and then insert at the end.
    1.97 +        // generally, we won't need too many, since we will be filtered.
    1.98 +
    1.99 +        int32_t boundary;
   1.100 +        for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
   1.101 +            if (boundary == 0) continue;
   1.102 +            // HACK: Check to see that preceeding item was a letter
   1.103 +
   1.104 +            UChar32 cp = sText.char32At(boundary-1);
   1.105 +            int type = u_charType(cp);
   1.106 +            //System.out.println(Integer.toString(cp,16) + " (before): " + type);
   1.107 +            if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
   1.108 +
   1.109 +            cp = sText.char32At(boundary);
   1.110 +            type = u_charType(cp);
   1.111 +            //System.out.println(Integer.toString(cp,16) + " (after): " + type);
   1.112 +            if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
   1.113 +
   1.114 +            boundaries->addElement(boundary, status);
   1.115 +            // printf("Boundary at %d\n", boundary);
   1.116 +        }
   1.117 +
   1.118 +        int delta = 0;
   1.119 +        int lastBoundary = 0;
   1.120 +
   1.121 +        if (boundaries->size() != 0) { // if we found something, adjust
   1.122 +            delta = boundaries->size() * fInsertion.length();
   1.123 +            lastBoundary = boundaries->lastElementi();
   1.124 +
   1.125 +            // we do this from the end backwards, so that we don't have to keep updating.
   1.126 +
   1.127 +            while (boundaries->size() > 0) {
   1.128 +                boundary = boundaries->popi();
   1.129 +                text.handleReplaceBetween(boundary, boundary, fInsertion);
   1.130 +            }
   1.131 +        }
   1.132 +
   1.133 +        // Now fix up the return values
   1.134 +        offsets.contextLimit += delta;
   1.135 +        offsets.limit += delta;
   1.136 +        offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
   1.137 +
   1.138 +        // TODO:  do something with U_FAILURE(status);
   1.139 +        //        (need to look at transliterators overall, not just here.)
   1.140 +}
   1.141 +
   1.142 +//
   1.143 +//  getInsertion()
   1.144 +//
   1.145 +const UnicodeString &BreakTransliterator::getInsertion() const {
   1.146 +    return fInsertion;
   1.147 +}
   1.148 +
   1.149 +//
   1.150 +//  setInsertion()
   1.151 +//
   1.152 +void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
   1.153 +    this->fInsertion = insertion;
   1.154 +}
   1.155 +
   1.156 +//
   1.157 +//  getBreakIterator     Lazily create the break iterator if it does
   1.158 +//                       not already exist.  Copied from Java, probably
   1.159 +//                       better to just create it in the constructor.
   1.160 +//
   1.161 +BreakIterator *BreakTransliterator::getBreakIterator() {
   1.162 +    UErrorCode status = U_ZERO_ERROR;
   1.163 +    if (bi == NULL) {
   1.164 +        // Note:  Thai breaking behavior is universal, it is not
   1.165 +        //        tied to the Thai locale.
   1.166 +        bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
   1.167 +    }
   1.168 +    return bi;
   1.169 +}
   1.170 +
   1.171 +//
   1.172 +//   replaceableAsString   Hack to let break iterators work
   1.173 +//                         on the replaceable text from transliterators.
   1.174 +//                         In practice, the only real Replaceable type that we
   1.175 +//                         will be seeing is UnicodeString, so this function
   1.176 +//                         will normally be efficient.
   1.177 +//
   1.178 +UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
   1.179 +    UnicodeString s;
   1.180 +    UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
   1.181 +    if (rs != NULL) {
   1.182 +        s = *rs;
   1.183 +    } else {
   1.184 +        r.extractBetween(0, r.length(), s);
   1.185 +    }
   1.186 +    return s;
   1.187 +}
   1.188 +
   1.189 +U_NAMESPACE_END
   1.190 +
   1.191 +#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */

mercurial