michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2008-2010, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 05/11/2008 Andy Heninger Port from Java michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "unicode/unifilt.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/brkiter.h" michael@0: #include "brktrans.h" michael@0: #include "unicode/uchar.h" michael@0: #include "cmemory.h" michael@0: #include "uprops.h" michael@0: #include "uinvchar.h" michael@0: #include "util.h" michael@0: #include "uvectr32.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) michael@0: michael@0: static const UChar SPACE = 32; // ' ' michael@0: michael@0: michael@0: /** michael@0: * Constructs a transliterator with the default delimiters '{' and michael@0: * '}'. michael@0: */ michael@0: BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : michael@0: Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), michael@0: fInsertion(SPACE) { michael@0: bi = NULL; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: boundaries = new UVector32(status); michael@0: } michael@0: michael@0: michael@0: /** michael@0: * Destructor. michael@0: */ michael@0: BreakTransliterator::~BreakTransliterator() { michael@0: delete bi; michael@0: bi = NULL; michael@0: delete boundaries; michael@0: boundaries = NULL; michael@0: } michael@0: michael@0: /** michael@0: * Copy constructor. michael@0: */ michael@0: BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : michael@0: Transliterator(o) { michael@0: bi = NULL; michael@0: if (o.bi != NULL) { michael@0: bi = o.bi->clone(); michael@0: } michael@0: fInsertion = o.fInsertion; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: boundaries = new UVector32(status); michael@0: } michael@0: michael@0: michael@0: /** michael@0: * Transliterator API. michael@0: */ michael@0: Transliterator* BreakTransliterator::clone(void) const { michael@0: return new BreakTransliterator(*this); michael@0: } michael@0: michael@0: /** michael@0: * Implements {@link Transliterator#handleTransliterate}. michael@0: */ michael@0: void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, michael@0: UBool isIncremental ) const { michael@0: michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: boundaries->removeAllElements(); michael@0: BreakTransliterator *nonConstThis = (BreakTransliterator *)this; michael@0: nonConstThis->getBreakIterator(); // Lazy-create it if necessary michael@0: UnicodeString sText = replaceableAsString(text); michael@0: bi->setText(sText); michael@0: bi->preceding(offsets.start); michael@0: michael@0: // To make things much easier, we will stack the boundaries, and then insert at the end. michael@0: // generally, we won't need too many, since we will be filtered. michael@0: michael@0: int32_t boundary; michael@0: for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { michael@0: if (boundary == 0) continue; michael@0: // HACK: Check to see that preceeding item was a letter michael@0: michael@0: UChar32 cp = sText.char32At(boundary-1); michael@0: int type = u_charType(cp); michael@0: //System.out.println(Integer.toString(cp,16) + " (before): " + type); michael@0: if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; michael@0: michael@0: cp = sText.char32At(boundary); michael@0: type = u_charType(cp); michael@0: //System.out.println(Integer.toString(cp,16) + " (after): " + type); michael@0: if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; michael@0: michael@0: boundaries->addElement(boundary, status); michael@0: // printf("Boundary at %d\n", boundary); michael@0: } michael@0: michael@0: int delta = 0; michael@0: int lastBoundary = 0; michael@0: michael@0: if (boundaries->size() != 0) { // if we found something, adjust michael@0: delta = boundaries->size() * fInsertion.length(); michael@0: lastBoundary = boundaries->lastElementi(); michael@0: michael@0: // we do this from the end backwards, so that we don't have to keep updating. michael@0: michael@0: while (boundaries->size() > 0) { michael@0: boundary = boundaries->popi(); michael@0: text.handleReplaceBetween(boundary, boundary, fInsertion); michael@0: } michael@0: } michael@0: michael@0: // Now fix up the return values michael@0: offsets.contextLimit += delta; michael@0: offsets.limit += delta; michael@0: offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; michael@0: michael@0: // TODO: do something with U_FAILURE(status); michael@0: // (need to look at transliterators overall, not just here.) michael@0: } michael@0: michael@0: // michael@0: // getInsertion() michael@0: // michael@0: const UnicodeString &BreakTransliterator::getInsertion() const { michael@0: return fInsertion; michael@0: } michael@0: michael@0: // michael@0: // setInsertion() michael@0: // michael@0: void BreakTransliterator::setInsertion(const UnicodeString &insertion) { michael@0: this->fInsertion = insertion; michael@0: } michael@0: michael@0: // michael@0: // getBreakIterator Lazily create the break iterator if it does michael@0: // not already exist. Copied from Java, probably michael@0: // better to just create it in the constructor. michael@0: // michael@0: BreakIterator *BreakTransliterator::getBreakIterator() { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: if (bi == NULL) { michael@0: // Note: Thai breaking behavior is universal, it is not michael@0: // tied to the Thai locale. michael@0: bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); michael@0: } michael@0: return bi; michael@0: } michael@0: michael@0: // michael@0: // replaceableAsString Hack to let break iterators work michael@0: // on the replaceable text from transliterators. michael@0: // In practice, the only real Replaceable type that we michael@0: // will be seeing is UnicodeString, so this function michael@0: // will normally be efficient. michael@0: // michael@0: UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { michael@0: UnicodeString s; michael@0: UnicodeString *rs = dynamic_cast(&r); michael@0: if (rs != NULL) { michael@0: s = *rs; michael@0: } else { michael@0: r.extractBetween(0, r.length(), s); michael@0: } michael@0: return s; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */