michael@0: /*
michael@0: **********************************************************************
michael@0: *   Copyright (C) 2008-2010, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: **********************************************************************
michael@0: *   Date        Name        Description
michael@0: *   05/11/2008  Andy Heninger  Port from Java
michael@0: **********************************************************************
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
michael@0: 
michael@0: #include "unicode/unifilt.h"
michael@0: #include "unicode/uchar.h"
michael@0: #include "unicode/uniset.h"
michael@0: #include "unicode/brkiter.h"
michael@0: #include "brktrans.h"
michael@0: #include "unicode/uchar.h"
michael@0: #include "cmemory.h"
michael@0: #include "uprops.h"
michael@0: #include "uinvchar.h"
michael@0: #include "util.h"
michael@0: #include "uvectr32.h"
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
michael@0: 
michael@0: static const UChar SPACE       = 32;  // ' '
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * Constructs a transliterator with the default delimiters '{' and
michael@0:  * '}'.
michael@0:  */
michael@0: BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
michael@0:     Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
michael@0:     fInsertion(SPACE) {
michael@0:         bi = NULL;
michael@0:         UErrorCode status = U_ZERO_ERROR;
michael@0:         boundaries = new UVector32(status);
michael@0:     }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * Destructor.
michael@0:  */
michael@0: BreakTransliterator::~BreakTransliterator() {
michael@0:     delete bi;
michael@0:     bi = NULL;
michael@0:     delete boundaries;
michael@0:     boundaries = NULL;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Copy constructor.
michael@0:  */
michael@0: BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
michael@0:     Transliterator(o) {
michael@0:         bi = NULL;
michael@0:         if (o.bi != NULL) {
michael@0:             bi = o.bi->clone();
michael@0:         }
michael@0:         fInsertion = o.fInsertion;
michael@0:         UErrorCode status = U_ZERO_ERROR;
michael@0:         boundaries = new UVector32(status);
michael@0:     }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * Transliterator API.
michael@0:  */
michael@0: Transliterator* BreakTransliterator::clone(void) const {
michael@0:     return new BreakTransliterator(*this);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Implements {@link Transliterator#handleTransliterate}.
michael@0:  */
michael@0: void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
michael@0:                                                     UBool isIncremental ) const {
michael@0: 
michael@0:         UErrorCode status = U_ZERO_ERROR;
michael@0:         boundaries->removeAllElements();
michael@0:         BreakTransliterator *nonConstThis = (BreakTransliterator *)this;
michael@0:         nonConstThis->getBreakIterator(); // Lazy-create it if necessary
michael@0:         UnicodeString sText = replaceableAsString(text);
michael@0:         bi->setText(sText);
michael@0:         bi->preceding(offsets.start);
michael@0: 
michael@0:         // To make things much easier, we will stack the boundaries, and then insert at the end.
michael@0:         // generally, we won't need too many, since we will be filtered.
michael@0: 
michael@0:         int32_t boundary;
michael@0:         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
michael@0:             if (boundary == 0) continue;
michael@0:             // HACK: Check to see that preceeding item was a letter
michael@0: 
michael@0:             UChar32 cp = sText.char32At(boundary-1);
michael@0:             int type = u_charType(cp);
michael@0:             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
michael@0:             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
michael@0: 
michael@0:             cp = sText.char32At(boundary);
michael@0:             type = u_charType(cp);
michael@0:             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
michael@0:             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
michael@0: 
michael@0:             boundaries->addElement(boundary, status);
michael@0:             // printf("Boundary at %d\n", boundary);
michael@0:         }
michael@0: 
michael@0:         int delta = 0;
michael@0:         int lastBoundary = 0;
michael@0: 
michael@0:         if (boundaries->size() != 0) { // if we found something, adjust
michael@0:             delta = boundaries->size() * fInsertion.length();
michael@0:             lastBoundary = boundaries->lastElementi();
michael@0: 
michael@0:             // we do this from the end backwards, so that we don't have to keep updating.
michael@0: 
michael@0:             while (boundaries->size() > 0) {
michael@0:                 boundary = boundaries->popi();
michael@0:                 text.handleReplaceBetween(boundary, boundary, fInsertion);
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         // Now fix up the return values
michael@0:         offsets.contextLimit += delta;
michael@0:         offsets.limit += delta;
michael@0:         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
michael@0: 
michael@0:         // TODO:  do something with U_FAILURE(status);
michael@0:         //        (need to look at transliterators overall, not just here.)
michael@0: }
michael@0: 
michael@0: //
michael@0: //  getInsertion()
michael@0: //
michael@0: const UnicodeString &BreakTransliterator::getInsertion() const {
michael@0:     return fInsertion;
michael@0: }
michael@0: 
michael@0: //
michael@0: //  setInsertion()
michael@0: //
michael@0: void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
michael@0:     this->fInsertion = insertion;
michael@0: }
michael@0: 
michael@0: //
michael@0: //  getBreakIterator     Lazily create the break iterator if it does
michael@0: //                       not already exist.  Copied from Java, probably
michael@0: //                       better to just create it in the constructor.
michael@0: //
michael@0: BreakIterator *BreakTransliterator::getBreakIterator() {
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0:     if (bi == NULL) {
michael@0:         // Note:  Thai breaking behavior is universal, it is not
michael@0:         //        tied to the Thai locale.
michael@0:         bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
michael@0:     }
michael@0:     return bi;
michael@0: }
michael@0: 
michael@0: //
michael@0: //   replaceableAsString   Hack to let break iterators work
michael@0: //                         on the replaceable text from transliterators.
michael@0: //                         In practice, the only real Replaceable type that we
michael@0: //                         will be seeing is UnicodeString, so this function
michael@0: //                         will normally be efficient.
michael@0: //
michael@0: UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
michael@0:     UnicodeString s;
michael@0:     UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
michael@0:     if (rs != NULL) {
michael@0:         s = *rs;
michael@0:     } else {
michael@0:         r.extractBetween(0, r.length(), s);
michael@0:     }
michael@0:     return s;
michael@0: }
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */