intl/icu/source/common/rbbidata.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/rbbidata.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,446 @@
     1.4 +/*
     1.5 +***************************************************************************
     1.6 +*   Copyright (C) 1999-2010 International Business Machines Corporation   *
     1.7 +*   and others. All rights reserved.                                      *
     1.8 +***************************************************************************
     1.9 +*/
    1.10 +
    1.11 +#include "unicode/utypes.h"
    1.12 +
    1.13 +#if !UCONFIG_NO_BREAK_ITERATION
    1.14 +
    1.15 +#include "unicode/utypes.h"
    1.16 +#include "rbbidata.h"
    1.17 +#include "rbbirb.h"
    1.18 +#include "utrie.h"
    1.19 +#include "udatamem.h"
    1.20 +#include "cmemory.h"
    1.21 +#include "cstring.h"
    1.22 +#include "umutex.h"
    1.23 +
    1.24 +#include "uassert.h"
    1.25 +
    1.26 +
    1.27 +//-----------------------------------------------------------------------------------
    1.28 +//
    1.29 +//   Trie access folding function.  Copied as-is from properties code in uchar.c
    1.30 +//
    1.31 +//-----------------------------------------------------------------------------------
    1.32 +U_CDECL_BEGIN
    1.33 +static int32_t U_CALLCONV
    1.34 +getFoldingOffset(uint32_t data) {
    1.35 +    /* Bit 15 of the 16-bit trie result says whether a folding offset is present. */
    1.36 +    if ((data & 0x8000) == 0) {
    1.37 +        return 0;
    1.38 +    }
    1.39 +    /* When it is, the folding offset is the value held in bits 14..0. */
    1.40 +    return (int32_t)(data & 0x7fff);
    1.41 +}
    1.42 +U_CDECL_END
    1.43 +
    1.44 +U_NAMESPACE_BEGIN
    1.45 +
    1.46 +//-----------------------------------------------------------------------------
    1.47 +//    Constructors.  All setup shared between them is delegated to init().
    1.48 +//    The first overload wraps a raw RBBIDataHeader; when init() succeeds the
    1.49 +//    destructor later releases that header with uprv_free().
    1.50 +//-----------------------------------------------------------------------------
    1.51 +RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
    1.52 +    init(data, status);
    1.53 +}
    1.54 +// EDontAdopt overload: sets fDontFreeData so the destructor leaves the header alone.
    1.55 +RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
    1.56 +    init(data, status);
    1.57 +    fDontFreeData = TRUE;
    1.58 +}
    1.59 +
    1.60 +RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
    1.61 +    // The RBBI data is located headerSize bytes past the start of the ICU data header.
    1.62 +    // headerSize is used (not info.size) to account for the padding added in by udata_write.
    1.63 +    const char *udataStart = (const char *)(udm->pHeader);
    1.64 +    const RBBIDataHeader *rbbiHeader = (const RBBIDataHeader *)(udataStart + udm->pHeader->dataHeader.headerSize);
    1.65 +    init(rbbiHeader, status);
    1.66 +    fUDataMem = udm;
    1.67 +}
    1.68 +
    1.69 +//-----------------------------------------------------------------------------
    1.70 +//    init().   Does most of the work of construction, shared between the
    1.71 +//              constructors.  status carries errors in and out; a bad magic
    1.72 +//              number or format version yields U_INVALID_FORMAT_ERROR.  The data
    1.73 +//              is only treated as owned once its header has been validated.
    1.74 +//-----------------------------------------------------------------------------
    1.75 +void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
    1.76 +    // Give every member a defined value before anything can fail: the destructor
    1.77 +    //   and removeReference() read these fields even when init() returns early.
    1.78 +    fHeader          = NULL;
    1.79 +    fForwardTable    = NULL;
    1.80 +    fReverseTable    = NULL;
    1.81 +    fSafeFwdTable    = NULL;
    1.82 +    fSafeRevTable    = NULL;
    1.83 +    fRuleSource      = NULL;
    1.84 +    fRuleStatusTable = NULL;
    1.85 +    fStatusMaxIdx    = 0;
    1.86 +    fUDataMem        = NULL;
    1.87 +    fRefCount        = 0;
    1.88 +    fDontFreeData    = TRUE;     // nothing is adopted until the header checks out
    1.89 +    if (U_FAILURE(status)) {
    1.90 +        return;
    1.91 +    }
    1.92 +    fHeader = data;
    1.93 +    if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3) {
    1.94 +        status = U_INVALID_FORMAT_ERROR;
    1.95 +        return;
    1.96 +    }
    1.97 +    // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
    1.98 +    //       that is no longer supported.  At that time fFormatVersion was
    1.99 +    //       an int32_t field, rather than an array of 4 bytes.
   1.100 +    fDontFreeData = FALSE;
   1.101 +    if (data->fFTableLen != 0) {
   1.102 +        fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
   1.103 +    }
   1.104 +    if (data->fRTableLen != 0) {
   1.105 +        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
   1.106 +    }
   1.107 +    if (data->fSFTableLen != 0) {
   1.108 +        fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
   1.109 +    }
   1.110 +    if (data->fSRTableLen != 0) {
   1.111 +        fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
   1.112 +    }
   1.113 +    utrie_unserialize(&fTrie,
   1.114 +                       (uint8_t *)data + fHeader->fTrie,
   1.115 +                       fHeader->fTrieLen,
   1.116 +                       &status);
   1.117 +    if (U_FAILURE(status)) {
   1.118 +        return;
   1.119 +    }
   1.120 +    fTrie.getFoldingOffset=getFoldingOffset;
   1.121 +    fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
   1.122 +    fRuleString.setTo(TRUE, fRuleSource, -1);
   1.123 +    U_ASSERT(data->fRuleSourceLen > 0);
   1.124 +    fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
   1.125 +    fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
   1.126 +    fRefCount = 1;               // the caller holds the first reference
   1.127 +#ifdef RBBI_DEBUG
   1.128 +    char *debugEnv = getenv("U_RBBIDEBUG");
   1.129 +    if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
   1.130 +#endif
   1.131 +}
   1.132 +
   1.133 +
   1.134 +//-----------------------------------------------------------------------------
   1.135 +//    Destructor.     Don't call this - use removeReference() instead.
   1.136 +//                    Closes the UDataMemory when there is one; otherwise frees
   1.137 +//                    the header, unless fDontFreeData is set.
   1.138 +//-----------------------------------------------------------------------------
   1.139 +RBBIDataWrapper::~RBBIDataWrapper() {
   1.140 +    U_ASSERT(fRefCount == 0);
   1.141 +    if (fUDataMem != NULL) {
   1.142 +        udata_close(fUDataMem);
   1.143 +        return;
   1.144 +    }
   1.145 +    if (!fDontFreeData) { uprv_free((void *)fHeader); }
   1.146 +}
   1.147 +
   1.148 +
   1.149 +
   1.150 +//-----------------------------------------------------------------------------
   1.151 +//
   1.152 +//   Operator ==    Consider two RBBIDataWrappers to be equal if they
   1.153 +//                  refer to the same underlying data.  Although
   1.154 +//                  the data wrappers are normally shared between
   1.155 +//                  iterator instances, it's possible to independently
   1.156 +//                  open the same data twice, and get two instances, which
   1.157 +//                  should still be ==.
   1.158 +//
   1.159 +//-----------------------------------------------------------------------------
   1.160 +UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
   1.161 +    const RBBIDataHeader *mine   = fHeader;
   1.162 +    const RBBIDataHeader *theirs = other.fHeader;
   1.163 +    if (mine == theirs) {
   1.164 +        return TRUE;    // both wrappers sit on the very same data
   1.165 +    }
   1.166 +    // Separately opened copies still compare equal when length and bytes match.
   1.167 +    if (mine->fLength != theirs->fLength) {
   1.168 +        return FALSE;
   1.169 +    }
   1.170 +    return (uprv_memcmp(mine, theirs, mine->fLength) == 0) ? TRUE : FALSE;
   1.171 +}
   1.172 +// hashCode - the hash value is simply the header's fFTableLen field.
   1.173 +int32_t  RBBIDataWrapper::hashCode() {
   1.174 +    return fHeader->fFTableLen;
   1.175 +}
   1.176 +
   1.177 +
   1.178 +
   1.179 +//-----------------------------------------------------------------------------
   1.180 +//
   1.181 +//    Reference Counting.   A single RBBIDataWrapper object is shared among
   1.182 +//                          however many RulesBasedBreakIterator instances are
   1.183 +//                          referencing the same data.
   1.184 +//
   1.185 +//-----------------------------------------------------------------------------
   1.186 +void RBBIDataWrapper::removeReference() {
   1.187 +    // Drop one reference; whoever brings the count down to zero deletes the wrapper.
   1.188 +    int32_t remaining = umtx_atomic_dec(&fRefCount);
   1.189 +    if (remaining == 0) { delete this; }
   1.190 +}
   1.191 +
   1.192 +// addReference - bumps fRefCount via umtx_atomic_inc() and returns this wrapper.
   1.193 +RBBIDataWrapper *RBBIDataWrapper::addReference() {
   1.194 +   umtx_atomic_inc(&fRefCount);
   1.195 +   return this;
   1.196 +}
   1.197 +
   1.198 +
   1.199 +
   1.200 +//-----------------------------------------------------------------------------
   1.201 +//  getRuleSourceString
   1.202 +//      Returns fRuleString, which init() sets up over the rule source text
   1.203 +//      located at the header's fRuleSource offset.
   1.204 +//-----------------------------------------------------------------------------
   1.205 +const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
   1.206 +    return fRuleString;
   1.207 +}
   1.208 +
   1.209 +
   1.210 +//-----------------------------------------------------------------------------
   1.211 +//  printTable - debugging function to dump one runtime state table: a heading,
   1.212 +//               a column header numbering the fCatCount categories, then one
   1.213 +//               line per state (fAccepting, fLookAhead, fTagIdx, next states).
   1.214 +//-----------------------------------------------------------------------------
   1.215 +#ifdef RBBI_DEBUG
   1.216 +void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
   1.217 +    uint32_t   c;
   1.218 +    uint32_t   s;
   1.219 +
   1.220 +    RBBIDebugPrintf("   %s\n", heading);
   1.221 +
   1.222 +    RBBIDebugPrintf("State |  Acc  LA TagIx");
   1.223 +    for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
   1.224 +    RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
   1.225 +        RBBIDebugPrintf("----");
   1.226 +    }
   1.227 +    RBBIDebugPrintf("\n");
   1.228 +    // The column header above is printed even when there is no table to dump.
   1.229 +    if (table == NULL) {
   1.230 +        RBBIDebugPrintf("         N U L L   T A B L E\n\n");
   1.231 +        return;
   1.232 +    }
   1.233 +    for (s=0; s<table->fNumStates; s++) {
   1.234 +        RBBIStateTableRow *row = (RBBIStateTableRow *)
   1.235 +                                  (table->fTableData + (table->fRowLen * s));
   1.236 +        RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
   1.237 +        for (c=0; c<fHeader->fCatCount; c++)  {
   1.238 +            RBBIDebugPrintf("%3d ", row->fNextState[c]);
   1.239 +        }
   1.240 +        RBBIDebugPrintf("\n");
   1.241 +    }
   1.242 +    RBBIDebugPrintf("\n");
   1.243 +}
   1.244 +#endif
   1.245 +
   1.246 +// printData - debugging function: dumps header fields, the four state tables and the rule source.
   1.247 +#ifdef RBBI_DEBUG
   1.248 +void  RBBIDataWrapper::printData() {
   1.249 +    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
   1.250 +    RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
   1.251 +                                                    fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
   1.252 +    RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
   1.253 +    RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
   1.254 +    // Table pointers that are NULL are reported as such by printTable().
   1.255 +    printTable("Forward State Transition Table", fForwardTable);
   1.256 +    printTable("Reverse State Transition Table", fReverseTable);
   1.257 +    printTable("Safe Forward State Transition Table", fSafeFwdTable);
   1.258 +    printTable("Safe Reverse State Transition Table", fSafeRevTable);
   1.259 +    // The rule source is printed one UChar at a time, up to its terminating zero.
   1.260 +    RBBIDebugPrintf("\nOrignal Rules source:\n");
   1.261 +    for (int32_t c=0; fRuleSource[c] != 0; c++) {
   1.262 +        RBBIDebugPrintf("%c", fRuleSource[c]);
   1.263 +    }
   1.264 +    RBBIDebugPrintf("\n\n");
   1.265 +}
   1.266 +#endif
   1.267 +
   1.268 +
   1.269 +U_NAMESPACE_END
   1.270 +U_NAMESPACE_USE
   1.271 +
   1.272 +//-----------------------------------------------------------------------------
   1.273 +//  ubrk_swap   -  byte swap and char encoding swap of RBBI data.
   1.274 +//                 Returns the size of the ICU data header plus the break data,
   1.275 +//                 or 0 with *status set on failure.  A negative length asks
   1.276 +//                 only for that size (preflight).
   1.277 +//-----------------------------------------------------------------------------
   1.278 +U_CAPI int32_t U_EXPORT2
   1.279 +ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
   1.280 +           UErrorCode *status) {
   1.281 +
   1.282 +    if (status == NULL || U_FAILURE(*status)) {
   1.283 +        return 0;
   1.284 +    }
   1.285 +    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
   1.286 +        *status=U_ILLEGAL_ARGUMENT_ERROR;
   1.287 +        return 0;
   1.288 +    }
   1.289 +
   1.290 +    //
   1.291 +    //  Check that the data header is for break data.
   1.292 +    //    (Header contents are defined in genbrk.cpp)
   1.293 +    //
   1.294 +    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
   1.295 +    if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
   1.296 +           pInfo->dataFormat[1]==0x72 &&
   1.297 +           pInfo->dataFormat[2]==0x6b &&
   1.298 +           pInfo->dataFormat[3]==0x20 &&
   1.299 +           pInfo->formatVersion[0]==3  )) {
   1.300 +        udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
   1.301 +                         pInfo->dataFormat[0], pInfo->dataFormat[1],
   1.302 +                         pInfo->dataFormat[2], pInfo->dataFormat[3],
   1.303 +                         pInfo->formatVersion[0]);
   1.304 +        *status=U_UNSUPPORTED_ERROR;
   1.305 +        return 0;
   1.306 +    }
   1.307 +
   1.308 +    //
   1.309 +    // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
   1.310 +    //                         RBBIDataHeader).  This swap also conveniently gets us
   1.311 +    //                         the size of the ICU d.h., which lets us locate the start
   1.312 +    //                         of the RBBI specific data.
   1.313 +    //
   1.314 +    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
   1.315 +    if (U_FAILURE(*status)) {
   1.316 +        return 0;    // generic header rejected: headerSize cannot be used to find the RBBI data
   1.317 +    }
   1.318 +    // Get the RBBI Data Header, and check that it appears to be OK.
   1.319 +    //
   1.320 +    //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
   1.321 +    //           an int32_t with a value of 1.  Starting with ICU 3.4,
   1.322 +    //           RBBI's fDataFormat matches the dataFormat field from the
   1.323 +    //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
   1.324 +    //
   1.325 +    const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
   1.326 +    RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
   1.327 +    if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
   1.328 +        rbbiDH->fFormatVersion[0] != 3 ||
   1.329 +        ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader))
   1.330 +    {
   1.331 +        udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
   1.332 +        *status=U_UNSUPPORTED_ERROR;
   1.333 +        return 0;
   1.334 +    }
   1.335 +
   1.336 +    //
   1.337 +    // Preflight operation?  Just return the size
   1.338 +    //
   1.339 +    int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
   1.340 +    int32_t totalSize = headerSize + breakDataLength;
   1.341 +    if (length < 0) {
   1.342 +        return totalSize;
   1.343 +    }
   1.344 +
   1.345 +    //
   1.346 +    // Check that length passed in is consistent with length from RBBI data header.
   1.347 +    //
   1.348 +    if (length < totalSize) {
   1.349 +        udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
   1.350 +                            breakDataLength);
   1.351 +        *status=U_INDEX_OUTOFBOUNDS_ERROR;
   1.352 +        return 0;
   1.353 +    }
   1.354 +
   1.355 +
   1.356 +    //
   1.357 +    // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
   1.358 +    //                 we need to reference the header to locate the data, and an
   1.359 +    //                 inplace swap of the header leaves it unusable.
   1.360 +    //
   1.361 +    uint8_t         *outBytes = (uint8_t *)outData + headerSize;
   1.362 +    RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
   1.363 +
   1.364 +    int32_t   tableStartOffset;
   1.365 +    int32_t   tableLength;
   1.366 +
   1.367 +    //
   1.368 +    // If not swapping in place, zero out the output buffer before starting.
   1.369 +    //    Individual tables and other data items within are aligned to 8 byte boundaries
   1.370 +    //    when originally created.  Any unused space between items needs to be zero.
   1.371 +    //
   1.372 +    if (inBytes != outBytes) {
   1.373 +        uprv_memset(outBytes, 0, breakDataLength);
   1.374 +    }
   1.375 +
   1.376 +    //
   1.377 +    // Each state table begins with several 32 bit fields.  Calculate the size
   1.378 +    //   in bytes of these.
   1.379 +    //
   1.380 +    int32_t         topSize = offsetof(RBBIStateTable, fTableData);
   1.381 +
   1.382 +    // Forward state table.
   1.383 +    tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
   1.384 +    tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
   1.385 +
   1.386 +    if (tableLength > 0) {
   1.387 +        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
   1.388 +                            outBytes+tableStartOffset, status);
   1.389 +        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
   1.390 +                            outBytes+tableStartOffset+topSize, status);
   1.391 +    }
   1.392 +
   1.393 +    // Reverse state table.  Same layout as forward table, above.
   1.394 +    tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
   1.395 +    tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
   1.396 +
   1.397 +    if (tableLength > 0) {
   1.398 +        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
   1.399 +                            outBytes+tableStartOffset, status);
   1.400 +        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
   1.401 +                            outBytes+tableStartOffset+topSize, status);
   1.402 +    }
   1.403 +
   1.404 +    // Safe Forward state table.  Same layout as forward table, above.
   1.405 +    tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
   1.406 +    tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
   1.407 +
   1.408 +    if (tableLength > 0) {
   1.409 +        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
   1.410 +                            outBytes+tableStartOffset, status);
   1.411 +        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
   1.412 +                            outBytes+tableStartOffset+topSize, status);
   1.413 +    }
   1.414 +
   1.415 +    // Safe Reverse state table.  Same layout as forward table, above.
   1.416 +    tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
   1.417 +    tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
   1.418 +
   1.419 +    if (tableLength > 0) {
   1.420 +        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
   1.421 +                            outBytes+tableStartOffset, status);
   1.422 +        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
   1.423 +                            outBytes+tableStartOffset+topSize, status);
   1.424 +    }
   1.425 +
   1.426 +    // Trie table for character categories
   1.427 +    utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
   1.428 +                            outBytes+ds->readUInt32(rbbiDH->fTrie), status);
   1.429 +
   1.430 +    // Source Rules Text.  It's UChar data
   1.431 +    ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
   1.432 +                        outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
   1.433 +
   1.434 +    // Table of rule status values.  It's all int_32 values
   1.435 +    ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
   1.436 +                        outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
   1.437 +
   1.438 +    // And, last, the header.
   1.439 +    //   It is all int32_t values except for fFormatVersion, which is an array of four bytes.
   1.440 +    //   Swap the whole thing as int32_t, then re-swap the one field.
   1.441 +    //
   1.442 +    ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
   1.443 +    ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
   1.444 +
   1.445 +    return totalSize;
   1.446 +}
   1.447 +
   1.448 +
   1.449 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial