1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/rbbidata.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,446 @@ 1.4 +/* 1.5 +*************************************************************************** 1.6 +* Copyright (C) 1999-2010 International Business Machines Corporation * 1.7 +* and others. All rights reserved. * 1.8 +*************************************************************************** 1.9 +*/ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_BREAK_ITERATION 1.14 + 1.15 +#include "unicode/utypes.h" 1.16 +#include "rbbidata.h" 1.17 +#include "rbbirb.h" 1.18 +#include "utrie.h" 1.19 +#include "udatamem.h" 1.20 +#include "cmemory.h" 1.21 +#include "cstring.h" 1.22 +#include "umutex.h" 1.23 + 1.24 +#include "uassert.h" 1.25 + 1.26 + 1.27 +//----------------------------------------------------------------------------------- 1.28 +// 1.29 +// Trie access folding function. Copied as-is from properties code in uchar.c 1.30 +// 1.31 +//----------------------------------------------------------------------------------- 1.32 +U_CDECL_BEGIN 1.33 +static int32_t U_CALLCONV 1.34 +getFoldingOffset(uint32_t data) { 1.35 + /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */ 1.36 + if(data&0x8000) { 1.37 + return (int32_t)(data&0x7fff); 1.38 + } else { 1.39 + return 0; 1.40 + } 1.41 +} 1.42 +U_CDECL_END 1.43 + 1.44 +U_NAMESPACE_BEGIN 1.45 + 1.46 +//----------------------------------------------------------------------------- 1.47 +// 1.48 +// Constructors. 1.49 +// 1.50 +//----------------------------------------------------------------------------- 1.51 +RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) { 1.52 + init(data, status); 1.53 +} 1.54 + 1.55 +RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { 1.56 + init(data, status); 1.57 + fDontFreeData = TRUE; 1.58 +} 1.59 + 1.60 +RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { 1.61 + const RBBIDataHeader *d = (const RBBIDataHeader *) 1.62 + // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size); 1.63 + // taking into consideration the padding added in by udata_write 1.64 + ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); 1.65 + init(d, status); 1.66 + fUDataMem = udm; 1.67 +} 1.68 + 1.69 +//----------------------------------------------------------------------------- 1.70 +// 1.71 +// init(). Does most of the work of construction, shared between the 1.72 +// constructors. 1.73 +// 1.74 +//----------------------------------------------------------------------------- 1.75 +void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { 1.76 + if (U_FAILURE(status)) { 1.77 + return; 1.78 + } 1.79 + fHeader = data; 1.80 + if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3) 1.81 + { 1.82 + status = U_INVALID_FORMAT_ERROR; 1.83 + return; 1.84 + } 1.85 + // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 1.86 + // that is no longer supported. At that time fFormatVersion was 1.87 + // an int32_t field, rather than an array of 4 bytes. 1.88 + 1.89 + fDontFreeData = FALSE; 1.90 + fUDataMem = NULL; 1.91 + fReverseTable = NULL; 1.92 + fSafeFwdTable = NULL; 1.93 + fSafeRevTable = NULL; 1.94 + if (data->fFTableLen != 0) { 1.95 + fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); 1.96 + } 1.97 + if (data->fRTableLen != 0) { 1.98 + fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); 1.99 + } 1.100 + if (data->fSFTableLen != 0) { 1.101 + fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable); 1.102 + } 1.103 + if (data->fSRTableLen != 0) { 1.104 + fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable); 1.105 + } 1.106 + 1.107 + 1.108 + utrie_unserialize(&fTrie, 1.109 + (uint8_t *)data + fHeader->fTrie, 1.110 + fHeader->fTrieLen, 1.111 + &status); 1.112 + if (U_FAILURE(status)) { 1.113 + return; 1.114 + } 1.115 + fTrie.getFoldingOffset=getFoldingOffset; 1.116 + 1.117 + 1.118 + fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); 1.119 + fRuleString.setTo(TRUE, fRuleSource, -1); 1.120 + U_ASSERT(data->fRuleSourceLen > 0); 1.121 + 1.122 + fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); 1.123 + fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); 1.124 + 1.125 + fRefCount = 1; 1.126 + 1.127 +#ifdef RBBI_DEBUG 1.128 + char *debugEnv = getenv("U_RBBIDEBUG"); 1.129 + if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} 1.130 +#endif 1.131 +} 1.132 + 1.133 + 1.134 +//----------------------------------------------------------------------------- 1.135 +// 1.136 +// Destructor. Don't call this - use removeReference() instead. 1.137 +// 1.138 +//----------------------------------------------------------------------------- 1.139 +RBBIDataWrapper::~RBBIDataWrapper() { 1.140 + U_ASSERT(fRefCount == 0); 1.141 + if (fUDataMem) { 1.142 + udata_close(fUDataMem); 1.143 + } else if (!fDontFreeData) { 1.144 + uprv_free((void *)fHeader); 1.145 + } 1.146 +} 1.147 + 1.148 + 1.149 + 1.150 +//----------------------------------------------------------------------------- 1.151 +// 1.152 +// Operator == Consider two RBBIDataWrappers to be equal if they 1.153 +// refer to the same underlying data. Although 1.154 +// the data wrappers are normally shared between 1.155 +// iterator instances, it's possible to independently 1.156 +// open the same data twice, and get two instances, which 1.157 +// should still be ==. 1.158 +// 1.159 +//----------------------------------------------------------------------------- 1.160 +UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { 1.161 + if (fHeader == other.fHeader) { 1.162 + return TRUE; 1.163 + } 1.164 + if (fHeader->fLength != other.fHeader->fLength) { 1.165 + return FALSE; 1.166 + } 1.167 + if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { 1.168 + return TRUE; 1.169 + } 1.170 + return FALSE; 1.171 +} 1.172 + 1.173 +int32_t RBBIDataWrapper::hashCode() { 1.174 + return fHeader->fFTableLen; 1.175 +} 1.176 + 1.177 + 1.178 + 1.179 +//----------------------------------------------------------------------------- 1.180 +// 1.181 +// Reference Counting. A single RBBIDataWrapper object is shared among 1.182 +// however many RulesBasedBreakIterator instances are 1.183 +// referencing the same data. 1.184 +// 1.185 +//----------------------------------------------------------------------------- 1.186 +void RBBIDataWrapper::removeReference() { 1.187 + if (umtx_atomic_dec(&fRefCount) == 0) { 1.188 + delete this; 1.189 + } 1.190 +} 1.191 + 1.192 + 1.193 +RBBIDataWrapper *RBBIDataWrapper::addReference() { 1.194 + umtx_atomic_inc(&fRefCount); 1.195 + return this; 1.196 +} 1.197 + 1.198 + 1.199 + 1.200 +//----------------------------------------------------------------------------- 1.201 +// 1.202 +// getRuleSourceString 1.203 +// 1.204 +//----------------------------------------------------------------------------- 1.205 +const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { 1.206 + return fRuleString; 1.207 +} 1.208 + 1.209 + 1.210 +//----------------------------------------------------------------------------- 1.211 +// 1.212 +// print - debugging function to dump the runtime data tables. 1.213 +// 1.214 +//----------------------------------------------------------------------------- 1.215 +#ifdef RBBI_DEBUG 1.216 +void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) { 1.217 + uint32_t c; 1.218 + uint32_t s; 1.219 + 1.220 + RBBIDebugPrintf(" %s\n", heading); 1.221 + 1.222 + RBBIDebugPrintf("State | Acc LA TagIx"); 1.223 + for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);} 1.224 + RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) { 1.225 + RBBIDebugPrintf("----"); 1.226 + } 1.227 + RBBIDebugPrintf("\n"); 1.228 + 1.229 + if (table == NULL) { 1.230 + RBBIDebugPrintf(" N U L L T A B L E\n\n"); 1.231 + return; 1.232 + } 1.233 + for (s=0; s<table->fNumStates; s++) { 1.234 + RBBIStateTableRow *row = (RBBIStateTableRow *) 1.235 + (table->fTableData + (table->fRowLen * s)); 1.236 + RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx); 1.237 + for (c=0; c<fHeader->fCatCount; c++) { 1.238 + RBBIDebugPrintf("%3d ", row->fNextState[c]); 1.239 + } 1.240 + RBBIDebugPrintf("\n"); 1.241 + } 1.242 + RBBIDebugPrintf("\n"); 1.243 +} 1.244 +#endif 1.245 + 1.246 + 1.247 +#ifdef RBBI_DEBUG 1.248 +void RBBIDataWrapper::printData() { 1.249 + RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); 1.250 + RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], 1.251 + fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); 1.252 + RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength); 1.253 + RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount); 1.254 + 1.255 + printTable("Forward State Transition Table", fForwardTable); 1.256 + printTable("Reverse State Transition Table", fReverseTable); 1.257 + printTable("Safe Forward State Transition Table", fSafeFwdTable); 1.258 + printTable("Safe Reverse State Transition Table", fSafeRevTable); 1.259 + 1.260 + RBBIDebugPrintf("\nOrignal Rules source:\n"); 1.261 + for (int32_t c=0; fRuleSource[c] != 0; c++) { 1.262 + RBBIDebugPrintf("%c", fRuleSource[c]); 1.263 + } 1.264 + RBBIDebugPrintf("\n\n"); 1.265 +} 1.266 +#endif 1.267 + 1.268 + 1.269 +U_NAMESPACE_END 1.270 +U_NAMESPACE_USE 1.271 + 1.272 +//----------------------------------------------------------------------------- 1.273 +// 1.274 +// ubrk_swap - byte swap and char encoding swap of RBBI data 1.275 +// 1.276 +//----------------------------------------------------------------------------- 1.277 + 1.278 +U_CAPI int32_t U_EXPORT2 1.279 +ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 1.280 + UErrorCode *status) { 1.281 + 1.282 + if (status == NULL || U_FAILURE(*status)) { 1.283 + return 0; 1.284 + } 1.285 + if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 1.286 + *status=U_ILLEGAL_ARGUMENT_ERROR; 1.287 + return 0; 1.288 + } 1.289 + 1.290 + // 1.291 + // Check that the data header is for for break data. 1.292 + // (Header contents are defined in genbrk.cpp) 1.293 + // 1.294 + const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 1.295 + if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ 1.296 + pInfo->dataFormat[1]==0x72 && 1.297 + pInfo->dataFormat[2]==0x6b && 1.298 + pInfo->dataFormat[3]==0x20 && 1.299 + pInfo->formatVersion[0]==3 )) { 1.300 + udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", 1.301 + pInfo->dataFormat[0], pInfo->dataFormat[1], 1.302 + pInfo->dataFormat[2], pInfo->dataFormat[3], 1.303 + pInfo->formatVersion[0]); 1.304 + *status=U_UNSUPPORTED_ERROR; 1.305 + return 0; 1.306 + } 1.307 + 1.308 + // 1.309 + // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific 1.310 + // RBBIDataHeader). This swap also conveniently gets us 1.311 + // the size of the ICU d.h., which lets us locate the start 1.312 + // of the RBBI specific data. 1.313 + // 1.314 + int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 1.315 + 1.316 + 1.317 + // 1.318 + // Get the RRBI Data Header, and check that it appears to be OK. 1.319 + // 1.320 + // Note: ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually 1.321 + // an int32_t with a value of 1. Starting with ICU 3.4, 1.322 + // RBBI's fDataFormat matches the dataFormat field from the 1.323 + // UDataInfo header, four int8_t bytes. The value is {3,1,0,0} 1.324 + // 1.325 + const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 1.326 + RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; 1.327 + if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || 1.328 + rbbiDH->fFormatVersion[0] != 3 || 1.329 + ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) 1.330 + { 1.331 + udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n"); 1.332 + *status=U_UNSUPPORTED_ERROR; 1.333 + return 0; 1.334 + } 1.335 + 1.336 + // 1.337 + // Prefight operation? Just return the size 1.338 + // 1.339 + int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); 1.340 + int32_t totalSize = headerSize + breakDataLength; 1.341 + if (length < 0) { 1.342 + return totalSize; 1.343 + } 1.344 + 1.345 + // 1.346 + // Check that length passed in is consistent with length from RBBI data header. 1.347 + // 1.348 + if (length < totalSize) { 1.349 + udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n", 1.350 + breakDataLength); 1.351 + *status=U_INDEX_OUTOFBOUNDS_ERROR; 1.352 + return 0; 1.353 + } 1.354 + 1.355 + 1.356 + // 1.357 + // Swap the Data. Do the data itself first, then the RBBI Data Header, because 1.358 + // we need to reference the header to locate the data, and an 1.359 + // inplace swap of the header leaves it unusable. 1.360 + // 1.361 + uint8_t *outBytes = (uint8_t *)outData + headerSize; 1.362 + RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; 1.363 + 1.364 + int32_t tableStartOffset; 1.365 + int32_t tableLength; 1.366 + 1.367 + // 1.368 + // If not swapping in place, zero out the output buffer before starting. 1.369 + // Individual tables and other data items within are aligned to 8 byte boundaries 1.370 + // when originally created. Any unused space between items needs to be zero. 1.371 + // 1.372 + if (inBytes != outBytes) { 1.373 + uprv_memset(outBytes, 0, breakDataLength); 1.374 + } 1.375 + 1.376 + // 1.377 + // Each state table begins with several 32 bit fields. Calculate the size 1.378 + // in bytes of these. 1.379 + // 1.380 + int32_t topSize = offsetof(RBBIStateTable, fTableData); 1.381 + 1.382 + // Forward state table. 1.383 + tableStartOffset = ds->readUInt32(rbbiDH->fFTable); 1.384 + tableLength = ds->readUInt32(rbbiDH->fFTableLen); 1.385 + 1.386 + if (tableLength > 0) { 1.387 + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 1.388 + outBytes+tableStartOffset, status); 1.389 + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 1.390 + outBytes+tableStartOffset+topSize, status); 1.391 + } 1.392 + 1.393 + // Reverse state table. Same layout as forward table, above. 1.394 + tableStartOffset = ds->readUInt32(rbbiDH->fRTable); 1.395 + tableLength = ds->readUInt32(rbbiDH->fRTableLen); 1.396 + 1.397 + if (tableLength > 0) { 1.398 + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 1.399 + outBytes+tableStartOffset, status); 1.400 + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 1.401 + outBytes+tableStartOffset+topSize, status); 1.402 + } 1.403 + 1.404 + // Safe Forward state table. Same layout as forward table, above. 1.405 + tableStartOffset = ds->readUInt32(rbbiDH->fSFTable); 1.406 + tableLength = ds->readUInt32(rbbiDH->fSFTableLen); 1.407 + 1.408 + if (tableLength > 0) { 1.409 + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 1.410 + outBytes+tableStartOffset, status); 1.411 + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 1.412 + outBytes+tableStartOffset+topSize, status); 1.413 + } 1.414 + 1.415 + // Safe Reverse state table. Same layout as forward table, above. 1.416 + tableStartOffset = ds->readUInt32(rbbiDH->fSRTable); 1.417 + tableLength = ds->readUInt32(rbbiDH->fSRTableLen); 1.418 + 1.419 + if (tableLength > 0) { 1.420 + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 1.421 + outBytes+tableStartOffset, status); 1.422 + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 1.423 + outBytes+tableStartOffset+topSize, status); 1.424 + } 1.425 + 1.426 + // Trie table for character categories 1.427 + utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), 1.428 + outBytes+ds->readUInt32(rbbiDH->fTrie), status); 1.429 + 1.430 + // Source Rules Text. It's UChar data 1.431 + ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), 1.432 + outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); 1.433 + 1.434 + // Table of rule status values. It's all int_32 values 1.435 + ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), 1.436 + outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); 1.437 + 1.438 + // And, last, the header. 1.439 + // It is all int32_t values except for fFormataVersion, which is an array of four bytes. 1.440 + // Swap the whole thing as int32_t, then re-swap the one field. 1.441 + // 1.442 + ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); 1.443 + ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); 1.444 + 1.445 + return totalSize; 1.446 +} 1.447 + 1.448 + 1.449 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */