michael@0: /* michael@0: *************************************************************************** michael@0: * Copyright (C) 1999-2010 International Business Machines Corporation * michael@0: * and others. All rights reserved. * michael@0: *************************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "rbbidata.h" michael@0: #include "rbbirb.h" michael@0: #include "utrie.h" michael@0: #include "udatamem.h" michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: #include "umutex.h" michael@0: michael@0: #include "uassert.h" michael@0: michael@0: michael@0: //----------------------------------------------------------------------------------- michael@0: // michael@0: // Trie access folding function. Copied as-is from properties code in uchar.c michael@0: // michael@0: //----------------------------------------------------------------------------------- michael@0: U_CDECL_BEGIN michael@0: static int32_t U_CALLCONV michael@0: getFoldingOffset(uint32_t data) { michael@0: /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */ michael@0: if(data&0x8000) { michael@0: return (int32_t)(data&0x7fff); michael@0: } else { michael@0: return 0; michael@0: } michael@0: } michael@0: U_CDECL_END michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // Constructors. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) { michael@0: init(data, status); michael@0: } michael@0: michael@0: RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { michael@0: init(data, status); michael@0: fDontFreeData = TRUE; michael@0: } michael@0: michael@0: RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { michael@0: const RBBIDataHeader *d = (const RBBIDataHeader *) michael@0: // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size); michael@0: // taking into consideration the padding added in by udata_write michael@0: ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); michael@0: init(d, status); michael@0: fUDataMem = udm; michael@0: } michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // init(). Does most of the work of construction, shared between the michael@0: // constructors. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: fHeader = data; michael@0: if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3) michael@0: { michael@0: status = U_INVALID_FORMAT_ERROR; michael@0: return; michael@0: } michael@0: // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 michael@0: // that is no longer supported. At that time fFormatVersion was michael@0: // an int32_t field, rather than an array of 4 bytes. michael@0: michael@0: fDontFreeData = FALSE; michael@0: fUDataMem = NULL; michael@0: fReverseTable = NULL; michael@0: fSafeFwdTable = NULL; michael@0: fSafeRevTable = NULL; michael@0: if (data->fFTableLen != 0) { michael@0: fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); michael@0: } michael@0: if (data->fRTableLen != 0) { michael@0: fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); michael@0: } michael@0: if (data->fSFTableLen != 0) { michael@0: fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable); michael@0: } michael@0: if (data->fSRTableLen != 0) { michael@0: fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable); michael@0: } michael@0: michael@0: michael@0: utrie_unserialize(&fTrie, michael@0: (uint8_t *)data + fHeader->fTrie, michael@0: fHeader->fTrieLen, michael@0: &status); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: fTrie.getFoldingOffset=getFoldingOffset; michael@0: michael@0: michael@0: fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); michael@0: fRuleString.setTo(TRUE, fRuleSource, -1); michael@0: U_ASSERT(data->fRuleSourceLen > 0); michael@0: michael@0: fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); michael@0: fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); michael@0: michael@0: fRefCount = 1; michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: char *debugEnv = getenv("U_RBBIDEBUG"); michael@0: if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} michael@0: #endif michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // Destructor. Don't call this - use removeReference() instead. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: RBBIDataWrapper::~RBBIDataWrapper() { michael@0: U_ASSERT(fRefCount == 0); michael@0: if (fUDataMem) { michael@0: udata_close(fUDataMem); michael@0: } else if (!fDontFreeData) { michael@0: uprv_free((void *)fHeader); michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // Operator == Consider two RBBIDataWrappers to be equal if they michael@0: // refer to the same underlying data. Although michael@0: // the data wrappers are normally shared between michael@0: // iterator instances, it's possible to independently michael@0: // open the same data twice, and get two instances, which michael@0: // should still be ==. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { michael@0: if (fHeader == other.fHeader) { michael@0: return TRUE; michael@0: } michael@0: if (fHeader->fLength != other.fHeader->fLength) { michael@0: return FALSE; michael@0: } michael@0: if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { michael@0: return TRUE; michael@0: } michael@0: return FALSE; michael@0: } michael@0: michael@0: int32_t RBBIDataWrapper::hashCode() { michael@0: return fHeader->fFTableLen; michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // Reference Counting. A single RBBIDataWrapper object is shared among michael@0: // however many RulesBasedBreakIterator instances are michael@0: // referencing the same data. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBIDataWrapper::removeReference() { michael@0: if (umtx_atomic_dec(&fRefCount) == 0) { michael@0: delete this; michael@0: } michael@0: } michael@0: michael@0: michael@0: RBBIDataWrapper *RBBIDataWrapper::addReference() { michael@0: umtx_atomic_inc(&fRefCount); michael@0: return this; michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // getRuleSourceString michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { michael@0: return fRuleString; michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // print - debugging function to dump the runtime data tables. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: #ifdef RBBI_DEBUG michael@0: void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) { michael@0: uint32_t c; michael@0: uint32_t s; michael@0: michael@0: RBBIDebugPrintf(" %s\n", heading); michael@0: michael@0: RBBIDebugPrintf("State | Acc LA TagIx"); michael@0: for (c=0; cfCatCount; c++) {RBBIDebugPrintf("%3d ", c);} michael@0: RBBIDebugPrintf("\n------|---------------"); for (c=0;cfCatCount; c++) { michael@0: RBBIDebugPrintf("----"); michael@0: } michael@0: RBBIDebugPrintf("\n"); michael@0: michael@0: if (table == NULL) { michael@0: RBBIDebugPrintf(" N U L L T A B L E\n\n"); michael@0: return; michael@0: } michael@0: for (s=0; sfNumStates; s++) { michael@0: RBBIStateTableRow *row = (RBBIStateTableRow *) michael@0: (table->fTableData + (table->fRowLen * s)); michael@0: RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx); michael@0: for (c=0; cfCatCount; c++) { michael@0: RBBIDebugPrintf("%3d ", row->fNextState[c]); michael@0: } michael@0: RBBIDebugPrintf("\n"); michael@0: } michael@0: RBBIDebugPrintf("\n"); michael@0: } michael@0: #endif michael@0: michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: void RBBIDataWrapper::printData() { michael@0: RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); michael@0: RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], michael@0: fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); michael@0: RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength); michael@0: RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount); michael@0: michael@0: printTable("Forward State Transition Table", fForwardTable); michael@0: printTable("Reverse State Transition Table", fReverseTable); michael@0: printTable("Safe Forward State Transition Table", fSafeFwdTable); michael@0: printTable("Safe Reverse State Transition Table", fSafeRevTable); michael@0: michael@0: RBBIDebugPrintf("\nOrignal Rules source:\n"); michael@0: for (int32_t c=0; fRuleSource[c] != 0; c++) { michael@0: RBBIDebugPrintf("%c", fRuleSource[c]); michael@0: } michael@0: RBBIDebugPrintf("\n\n"); michael@0: } michael@0: #endif michael@0: michael@0: michael@0: U_NAMESPACE_END michael@0: U_NAMESPACE_USE michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // ubrk_swap - byte swap and char encoding swap of RBBI data michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, michael@0: UErrorCode *status) { michael@0: michael@0: if (status == NULL || U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { michael@0: *status=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: // michael@0: // Check that the data header is for for break data. michael@0: // (Header contents are defined in genbrk.cpp) michael@0: // michael@0: const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); michael@0: if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ michael@0: pInfo->dataFormat[1]==0x72 && michael@0: pInfo->dataFormat[2]==0x6b && michael@0: pInfo->dataFormat[3]==0x20 && michael@0: pInfo->formatVersion[0]==3 )) { michael@0: udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", michael@0: pInfo->dataFormat[0], pInfo->dataFormat[1], michael@0: pInfo->dataFormat[2], pInfo->dataFormat[3], michael@0: pInfo->formatVersion[0]); michael@0: *status=U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: // michael@0: // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific michael@0: // RBBIDataHeader). This swap also conveniently gets us michael@0: // the size of the ICU d.h., which lets us locate the start michael@0: // of the RBBI specific data. michael@0: // michael@0: int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); michael@0: michael@0: michael@0: // michael@0: // Get the RRBI Data Header, and check that it appears to be OK. michael@0: // michael@0: // Note: ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually michael@0: // an int32_t with a value of 1. Starting with ICU 3.4, michael@0: // RBBI's fDataFormat matches the dataFormat field from the michael@0: // UDataInfo header, four int8_t bytes. The value is {3,1,0,0} michael@0: // michael@0: const uint8_t *inBytes =(const uint8_t *)inData+headerSize; michael@0: RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; michael@0: if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || michael@0: rbbiDH->fFormatVersion[0] != 3 || michael@0: ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) michael@0: { michael@0: udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n"); michael@0: *status=U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: // michael@0: // Prefight operation? Just return the size michael@0: // michael@0: int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); michael@0: int32_t totalSize = headerSize + breakDataLength; michael@0: if (length < 0) { michael@0: return totalSize; michael@0: } michael@0: michael@0: // michael@0: // Check that length passed in is consistent with length from RBBI data header. michael@0: // michael@0: if (length < totalSize) { michael@0: udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n", michael@0: breakDataLength); michael@0: *status=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: michael@0: // michael@0: // Swap the Data. Do the data itself first, then the RBBI Data Header, because michael@0: // we need to reference the header to locate the data, and an michael@0: // inplace swap of the header leaves it unusable. michael@0: // michael@0: uint8_t *outBytes = (uint8_t *)outData + headerSize; michael@0: RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; michael@0: michael@0: int32_t tableStartOffset; michael@0: int32_t tableLength; michael@0: michael@0: // michael@0: // If not swapping in place, zero out the output buffer before starting. michael@0: // Individual tables and other data items within are aligned to 8 byte boundaries michael@0: // when originally created. Any unused space between items needs to be zero. michael@0: // michael@0: if (inBytes != outBytes) { michael@0: uprv_memset(outBytes, 0, breakDataLength); michael@0: } michael@0: michael@0: // michael@0: // Each state table begins with several 32 bit fields. Calculate the size michael@0: // in bytes of these. michael@0: // michael@0: int32_t topSize = offsetof(RBBIStateTable, fTableData); michael@0: michael@0: // Forward state table. michael@0: tableStartOffset = ds->readUInt32(rbbiDH->fFTable); michael@0: tableLength = ds->readUInt32(rbbiDH->fFTableLen); michael@0: michael@0: if (tableLength > 0) { michael@0: ds->swapArray32(ds, inBytes+tableStartOffset, topSize, michael@0: outBytes+tableStartOffset, status); michael@0: ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, michael@0: outBytes+tableStartOffset+topSize, status); michael@0: } michael@0: michael@0: // Reverse state table. Same layout as forward table, above. michael@0: tableStartOffset = ds->readUInt32(rbbiDH->fRTable); michael@0: tableLength = ds->readUInt32(rbbiDH->fRTableLen); michael@0: michael@0: if (tableLength > 0) { michael@0: ds->swapArray32(ds, inBytes+tableStartOffset, topSize, michael@0: outBytes+tableStartOffset, status); michael@0: ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, michael@0: outBytes+tableStartOffset+topSize, status); michael@0: } michael@0: michael@0: // Safe Forward state table. Same layout as forward table, above. michael@0: tableStartOffset = ds->readUInt32(rbbiDH->fSFTable); michael@0: tableLength = ds->readUInt32(rbbiDH->fSFTableLen); michael@0: michael@0: if (tableLength > 0) { michael@0: ds->swapArray32(ds, inBytes+tableStartOffset, topSize, michael@0: outBytes+tableStartOffset, status); michael@0: ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, michael@0: outBytes+tableStartOffset+topSize, status); michael@0: } michael@0: michael@0: // Safe Reverse state table. Same layout as forward table, above. michael@0: tableStartOffset = ds->readUInt32(rbbiDH->fSRTable); michael@0: tableLength = ds->readUInt32(rbbiDH->fSRTableLen); michael@0: michael@0: if (tableLength > 0) { michael@0: ds->swapArray32(ds, inBytes+tableStartOffset, topSize, michael@0: outBytes+tableStartOffset, status); michael@0: ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, michael@0: outBytes+tableStartOffset+topSize, status); michael@0: } michael@0: michael@0: // Trie table for character categories michael@0: utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), michael@0: outBytes+ds->readUInt32(rbbiDH->fTrie), status); michael@0: michael@0: // Source Rules Text. It's UChar data michael@0: ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), michael@0: outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); michael@0: michael@0: // Table of rule status values. It's all int_32 values michael@0: ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), michael@0: outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); michael@0: michael@0: // And, last, the header. michael@0: // It is all int32_t values except for fFormataVersion, which is an array of four bytes. michael@0: // Swap the whole thing as int32_t, then re-swap the one field. michael@0: // michael@0: ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); michael@0: ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); michael@0: michael@0: return totalSize; michael@0: } michael@0: michael@0: michael@0: #endif /* #if !UCONFIG_NO_BREAK_ITERATION */