intl/icu/source/common/rbbidata.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 ***************************************************************************
michael@0 3 * Copyright (C) 1999-2010 International Business Machines Corporation *
michael@0 4 * and others. All rights reserved. *
michael@0 5 ***************************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #if !UCONFIG_NO_BREAK_ITERATION
michael@0 11
michael@0 12 #include "unicode/utypes.h"
michael@0 13 #include "rbbidata.h"
michael@0 14 #include "rbbirb.h"
michael@0 15 #include "utrie.h"
michael@0 16 #include "udatamem.h"
michael@0 17 #include "cmemory.h"
michael@0 18 #include "cstring.h"
michael@0 19 #include "umutex.h"
michael@0 20
michael@0 21 #include "uassert.h"
michael@0 22
michael@0 23
michael@0 24 //-----------------------------------------------------------------------------------
michael@0 25 //
michael@0 26 // Trie access folding function. Copied as-is from properties code in uchar.c
michael@0 27 //
michael@0 28 //-----------------------------------------------------------------------------------
michael@0 29 U_CDECL_BEGIN
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    /*
     * Bit 15 of the trie result flags the presence of a folding offset.
     * When the flag is set, the offset is held in bits 14..0; when it is
     * clear there is no offset and 0 is returned.
     */
    return ((data & 0x8000) != 0) ? (int32_t)(data & 0x7fff) : 0;
}
michael@0 39 U_CDECL_END
michael@0 40
michael@0 41 U_NAMESPACE_BEGIN
michael@0 42
michael@0 43 //-----------------------------------------------------------------------------
michael@0 44 //
michael@0 45 // Constructors.
michael@0 46 //
michael@0 47 //-----------------------------------------------------------------------------
michael@0 48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
michael@0 49 init(data, status);
michael@0 50 }
michael@0 51
michael@0 52 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
michael@0 53 init(data, status);
michael@0 54 fDontFreeData = TRUE;
michael@0 55 }
michael@0 56
michael@0 57 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
michael@0 58 const RBBIDataHeader *d = (const RBBIDataHeader *)
michael@0 59 // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
michael@0 60 // taking into consideration the padding added in by udata_write
michael@0 61 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
michael@0 62 init(d, status);
michael@0 63 fUDataMem = udm;
michael@0 64 }
michael@0 65
michael@0 66 //-----------------------------------------------------------------------------
michael@0 67 //
michael@0 68 // init(). Does most of the work of construction, shared between the
michael@0 69 // constructors.
michael@0 70 //
michael@0 71 //-----------------------------------------------------------------------------
michael@0 72 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
michael@0 73 if (U_FAILURE(status)) {
michael@0 74 return;
michael@0 75 }
michael@0 76 fHeader = data;
michael@0 77 if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3)
michael@0 78 {
michael@0 79 status = U_INVALID_FORMAT_ERROR;
michael@0 80 return;
michael@0 81 }
michael@0 82 // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
michael@0 83 // that is no longer supported. At that time fFormatVersion was
michael@0 84 // an int32_t field, rather than an array of 4 bytes.
michael@0 85
michael@0 86 fDontFreeData = FALSE;
michael@0 87 fUDataMem = NULL;
michael@0 88 fReverseTable = NULL;
michael@0 89 fSafeFwdTable = NULL;
michael@0 90 fSafeRevTable = NULL;
michael@0 91 if (data->fFTableLen != 0) {
michael@0 92 fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
michael@0 93 }
michael@0 94 if (data->fRTableLen != 0) {
michael@0 95 fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
michael@0 96 }
michael@0 97 if (data->fSFTableLen != 0) {
michael@0 98 fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
michael@0 99 }
michael@0 100 if (data->fSRTableLen != 0) {
michael@0 101 fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
michael@0 102 }
michael@0 103
michael@0 104
michael@0 105 utrie_unserialize(&fTrie,
michael@0 106 (uint8_t *)data + fHeader->fTrie,
michael@0 107 fHeader->fTrieLen,
michael@0 108 &status);
michael@0 109 if (U_FAILURE(status)) {
michael@0 110 return;
michael@0 111 }
michael@0 112 fTrie.getFoldingOffset=getFoldingOffset;
michael@0 113
michael@0 114
michael@0 115 fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
michael@0 116 fRuleString.setTo(TRUE, fRuleSource, -1);
michael@0 117 U_ASSERT(data->fRuleSourceLen > 0);
michael@0 118
michael@0 119 fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
michael@0 120 fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t);
michael@0 121
michael@0 122 fRefCount = 1;
michael@0 123
michael@0 124 #ifdef RBBI_DEBUG
michael@0 125 char *debugEnv = getenv("U_RBBIDEBUG");
michael@0 126 if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
michael@0 127 #endif
michael@0 128 }
michael@0 129
michael@0 130
michael@0 131 //-----------------------------------------------------------------------------
michael@0 132 //
michael@0 133 // Destructor. Don't call this - use removeReference() instead.
michael@0 134 //
michael@0 135 //-----------------------------------------------------------------------------
michael@0 136 RBBIDataWrapper::~RBBIDataWrapper() {
michael@0 137 U_ASSERT(fRefCount == 0);
michael@0 138 if (fUDataMem) {
michael@0 139 udata_close(fUDataMem);
michael@0 140 } else if (!fDontFreeData) {
michael@0 141 uprv_free((void *)fHeader);
michael@0 142 }
michael@0 143 }
michael@0 144
michael@0 145
michael@0 146
michael@0 147 //-----------------------------------------------------------------------------
michael@0 148 //
michael@0 149 // Operator == Consider two RBBIDataWrappers to be equal if they
michael@0 150 // refer to the same underlying data. Although
michael@0 151 // the data wrappers are normally shared between
michael@0 152 // iterator instances, it's possible to independently
michael@0 153 // open the same data twice, and get two instances, which
michael@0 154 // should still be ==.
michael@0 155 //
michael@0 156 //-----------------------------------------------------------------------------
michael@0 157 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
michael@0 158 if (fHeader == other.fHeader) {
michael@0 159 return TRUE;
michael@0 160 }
michael@0 161 if (fHeader->fLength != other.fHeader->fLength) {
michael@0 162 return FALSE;
michael@0 163 }
michael@0 164 if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
michael@0 165 return TRUE;
michael@0 166 }
michael@0 167 return FALSE;
michael@0 168 }
michael@0 169
michael@0 170 int32_t RBBIDataWrapper::hashCode() {
michael@0 171 return fHeader->fFTableLen;
michael@0 172 }
michael@0 173
michael@0 174
michael@0 175
michael@0 176 //-----------------------------------------------------------------------------
michael@0 177 //
michael@0 178 // Reference Counting. A single RBBIDataWrapper object is shared among
michael@0 179 // however many RulesBasedBreakIterator instances are
michael@0 180 // referencing the same data.
michael@0 181 //
michael@0 182 //-----------------------------------------------------------------------------
michael@0 183 void RBBIDataWrapper::removeReference() {
michael@0 184 if (umtx_atomic_dec(&fRefCount) == 0) {
michael@0 185 delete this;
michael@0 186 }
michael@0 187 }
michael@0 188
michael@0 189
michael@0 190 RBBIDataWrapper *RBBIDataWrapper::addReference() {
michael@0 191 umtx_atomic_inc(&fRefCount);
michael@0 192 return this;
michael@0 193 }
michael@0 194
michael@0 195
michael@0 196
michael@0 197 //-----------------------------------------------------------------------------
michael@0 198 //
michael@0 199 // getRuleSourceString
michael@0 200 //
michael@0 201 //-----------------------------------------------------------------------------
michael@0 202 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
michael@0 203 return fRuleString;
michael@0 204 }
michael@0 205
michael@0 206
michael@0 207 //-----------------------------------------------------------------------------
michael@0 208 //
michael@0 209 // print - debugging function to dump the runtime data tables.
michael@0 210 //
michael@0 211 //-----------------------------------------------------------------------------
michael@0 212 #ifdef RBBI_DEBUG
michael@0 213 void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
michael@0 214 uint32_t c;
michael@0 215 uint32_t s;
michael@0 216
michael@0 217 RBBIDebugPrintf(" %s\n", heading);
michael@0 218
michael@0 219 RBBIDebugPrintf("State | Acc LA TagIx");
michael@0 220 for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
michael@0 221 RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
michael@0 222 RBBIDebugPrintf("----");
michael@0 223 }
michael@0 224 RBBIDebugPrintf("\n");
michael@0 225
michael@0 226 if (table == NULL) {
michael@0 227 RBBIDebugPrintf(" N U L L T A B L E\n\n");
michael@0 228 return;
michael@0 229 }
michael@0 230 for (s=0; s<table->fNumStates; s++) {
michael@0 231 RBBIStateTableRow *row = (RBBIStateTableRow *)
michael@0 232 (table->fTableData + (table->fRowLen * s));
michael@0 233 RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
michael@0 234 for (c=0; c<fHeader->fCatCount; c++) {
michael@0 235 RBBIDebugPrintf("%3d ", row->fNextState[c]);
michael@0 236 }
michael@0 237 RBBIDebugPrintf("\n");
michael@0 238 }
michael@0 239 RBBIDebugPrintf("\n");
michael@0 240 }
michael@0 241 #endif
michael@0 242
michael@0 243
michael@0 244 #ifdef RBBI_DEBUG
michael@0 245 void RBBIDataWrapper::printData() {
michael@0 246 RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
michael@0 247 RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
michael@0 248 fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
michael@0 249 RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
michael@0 250 RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
michael@0 251
michael@0 252 printTable("Forward State Transition Table", fForwardTable);
michael@0 253 printTable("Reverse State Transition Table", fReverseTable);
michael@0 254 printTable("Safe Forward State Transition Table", fSafeFwdTable);
michael@0 255 printTable("Safe Reverse State Transition Table", fSafeRevTable);
michael@0 256
michael@0 257 RBBIDebugPrintf("\nOrignal Rules source:\n");
michael@0 258 for (int32_t c=0; fRuleSource[c] != 0; c++) {
michael@0 259 RBBIDebugPrintf("%c", fRuleSource[c]);
michael@0 260 }
michael@0 261 RBBIDebugPrintf("\n\n");
michael@0 262 }
michael@0 263 #endif
michael@0 264
michael@0 265
michael@0 266 U_NAMESPACE_END
michael@0 267 U_NAMESPACE_USE
michael@0 268
michael@0 269 //-----------------------------------------------------------------------------
michael@0 270 //
michael@0 271 // ubrk_swap - byte swap and char encoding swap of RBBI data
michael@0 272 //
michael@0 273 //-----------------------------------------------------------------------------
michael@0 274
michael@0 275 U_CAPI int32_t U_EXPORT2
michael@0 276 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
michael@0 277 UErrorCode *status) {
michael@0 278
michael@0 279 if (status == NULL || U_FAILURE(*status)) {
michael@0 280 return 0;
michael@0 281 }
michael@0 282 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
michael@0 283 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 284 return 0;
michael@0 285 }
michael@0 286
michael@0 287 //
michael@0 288 // Check that the data header is for for break data.
michael@0 289 // (Header contents are defined in genbrk.cpp)
michael@0 290 //
michael@0 291 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
michael@0 292 if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */
michael@0 293 pInfo->dataFormat[1]==0x72 &&
michael@0 294 pInfo->dataFormat[2]==0x6b &&
michael@0 295 pInfo->dataFormat[3]==0x20 &&
michael@0 296 pInfo->formatVersion[0]==3 )) {
michael@0 297 udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
michael@0 298 pInfo->dataFormat[0], pInfo->dataFormat[1],
michael@0 299 pInfo->dataFormat[2], pInfo->dataFormat[3],
michael@0 300 pInfo->formatVersion[0]);
michael@0 301 *status=U_UNSUPPORTED_ERROR;
michael@0 302 return 0;
michael@0 303 }
michael@0 304
michael@0 305 //
michael@0 306 // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific
michael@0 307 // RBBIDataHeader). This swap also conveniently gets us
michael@0 308 // the size of the ICU d.h., which lets us locate the start
michael@0 309 // of the RBBI specific data.
michael@0 310 //
michael@0 311 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
michael@0 312
michael@0 313
michael@0 314 //
michael@0 315 // Get the RRBI Data Header, and check that it appears to be OK.
michael@0 316 //
michael@0 317 // Note: ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
michael@0 318 // an int32_t with a value of 1. Starting with ICU 3.4,
michael@0 319 // RBBI's fDataFormat matches the dataFormat field from the
michael@0 320 // UDataInfo header, four int8_t bytes. The value is {3,1,0,0}
michael@0 321 //
michael@0 322 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
michael@0 323 RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
michael@0 324 if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
michael@0 325 rbbiDH->fFormatVersion[0] != 3 ||
michael@0 326 ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader))
michael@0 327 {
michael@0 328 udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
michael@0 329 *status=U_UNSUPPORTED_ERROR;
michael@0 330 return 0;
michael@0 331 }
michael@0 332
michael@0 333 //
michael@0 334 // Prefight operation? Just return the size
michael@0 335 //
michael@0 336 int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
michael@0 337 int32_t totalSize = headerSize + breakDataLength;
michael@0 338 if (length < 0) {
michael@0 339 return totalSize;
michael@0 340 }
michael@0 341
michael@0 342 //
michael@0 343 // Check that length passed in is consistent with length from RBBI data header.
michael@0 344 //
michael@0 345 if (length < totalSize) {
michael@0 346 udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
michael@0 347 breakDataLength);
michael@0 348 *status=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 349 return 0;
michael@0 350 }
michael@0 351
michael@0 352
michael@0 353 //
michael@0 354 // Swap the Data. Do the data itself first, then the RBBI Data Header, because
michael@0 355 // we need to reference the header to locate the data, and an
michael@0 356 // inplace swap of the header leaves it unusable.
michael@0 357 //
michael@0 358 uint8_t *outBytes = (uint8_t *)outData + headerSize;
michael@0 359 RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes;
michael@0 360
michael@0 361 int32_t tableStartOffset;
michael@0 362 int32_t tableLength;
michael@0 363
michael@0 364 //
michael@0 365 // If not swapping in place, zero out the output buffer before starting.
michael@0 366 // Individual tables and other data items within are aligned to 8 byte boundaries
michael@0 367 // when originally created. Any unused space between items needs to be zero.
michael@0 368 //
michael@0 369 if (inBytes != outBytes) {
michael@0 370 uprv_memset(outBytes, 0, breakDataLength);
michael@0 371 }
michael@0 372
michael@0 373 //
michael@0 374 // Each state table begins with several 32 bit fields. Calculate the size
michael@0 375 // in bytes of these.
michael@0 376 //
michael@0 377 int32_t topSize = offsetof(RBBIStateTable, fTableData);
michael@0 378
michael@0 379 // Forward state table.
michael@0 380 tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
michael@0 381 tableLength = ds->readUInt32(rbbiDH->fFTableLen);
michael@0 382
michael@0 383 if (tableLength > 0) {
michael@0 384 ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
michael@0 385 outBytes+tableStartOffset, status);
michael@0 386 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
michael@0 387 outBytes+tableStartOffset+topSize, status);
michael@0 388 }
michael@0 389
michael@0 390 // Reverse state table. Same layout as forward table, above.
michael@0 391 tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
michael@0 392 tableLength = ds->readUInt32(rbbiDH->fRTableLen);
michael@0 393
michael@0 394 if (tableLength > 0) {
michael@0 395 ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
michael@0 396 outBytes+tableStartOffset, status);
michael@0 397 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
michael@0 398 outBytes+tableStartOffset+topSize, status);
michael@0 399 }
michael@0 400
michael@0 401 // Safe Forward state table. Same layout as forward table, above.
michael@0 402 tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
michael@0 403 tableLength = ds->readUInt32(rbbiDH->fSFTableLen);
michael@0 404
michael@0 405 if (tableLength > 0) {
michael@0 406 ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
michael@0 407 outBytes+tableStartOffset, status);
michael@0 408 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
michael@0 409 outBytes+tableStartOffset+topSize, status);
michael@0 410 }
michael@0 411
michael@0 412 // Safe Reverse state table. Same layout as forward table, above.
michael@0 413 tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
michael@0 414 tableLength = ds->readUInt32(rbbiDH->fSRTableLen);
michael@0 415
michael@0 416 if (tableLength > 0) {
michael@0 417 ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
michael@0 418 outBytes+tableStartOffset, status);
michael@0 419 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
michael@0 420 outBytes+tableStartOffset+topSize, status);
michael@0 421 }
michael@0 422
michael@0 423 // Trie table for character categories
michael@0 424 utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
michael@0 425 outBytes+ds->readUInt32(rbbiDH->fTrie), status);
michael@0 426
michael@0 427 // Source Rules Text. It's UChar data
michael@0 428 ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
michael@0 429 outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
michael@0 430
michael@0 431 // Table of rule status values. It's all int_32 values
michael@0 432 ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
michael@0 433 outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
michael@0 434
michael@0 435 // And, last, the header.
michael@0 436 // It is all int32_t values except for fFormataVersion, which is an array of four bytes.
michael@0 437 // Swap the whole thing as int32_t, then re-swap the one field.
michael@0 438 //
michael@0 439 ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
michael@0 440 ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
michael@0 441
michael@0 442 return totalSize;
michael@0 443 }
michael@0 444
michael@0 445
michael@0 446 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial