intl/icu/source/common/rbbidata.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 ***************************************************************************
     3 *   Copyright (C) 1999-2010 International Business Machines Corporation   *
     4 *   and others. All rights reserved.                                      *
     5 ***************************************************************************
     6 */
     8 #include "unicode/utypes.h"
    10 #if !UCONFIG_NO_BREAK_ITERATION
    12 #include "unicode/utypes.h"
    13 #include "rbbidata.h"
    14 #include "rbbirb.h"
    15 #include "utrie.h"
    16 #include "udatamem.h"
    17 #include "cmemory.h"
    18 #include "cstring.h"
    19 #include "umutex.h"
    21 #include "uassert.h"
    24 //-----------------------------------------------------------------------------------
    25 //
    26 //   Trie access folding function.  Copied as-is from properties code in uchar.c
    27 //
    28 //-----------------------------------------------------------------------------------
    29 U_CDECL_BEGIN
    30 static int32_t U_CALLCONV
    31 getFoldingOffset(uint32_t data) {
    32     /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
    33     if(data&0x8000) {
    34         return (int32_t)(data&0x7fff);
    35     } else {
    36         return 0;
    37     }
    38 }
    39 U_CDECL_END
    41 U_NAMESPACE_BEGIN
    43 //-----------------------------------------------------------------------------
    44 //
    45 //    Constructors.
    46 //
    47 //-----------------------------------------------------------------------------
    48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
    49     init(data, status);
    50 }
    52 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
    53     init(data, status);
    54     fDontFreeData = TRUE;
    55 }
    57 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
    58     const RBBIDataHeader *d = (const RBBIDataHeader *)
    59         // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
    60         // taking into consideration the padding added in by udata_write
    61         ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
    62     init(d, status);
    63     fUDataMem = udm;
    64 }
    66 //-----------------------------------------------------------------------------
    67 //
    68 //    init().   Does most of the work of construction, shared between the
    69 //              constructors.
    70 //
    71 //-----------------------------------------------------------------------------
    72 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
    73     if (U_FAILURE(status)) {
    74         return;
    75     }
    76     fHeader = data;
    77     if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3) 
    78     {
    79         status = U_INVALID_FORMAT_ERROR;
    80         return;
    81     }
    82     // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
    83     //       that is no longer supported.  At that time fFormatVersion was
    84     //       an int32_t field, rather than an array of 4 bytes.
    86     fDontFreeData = FALSE;
    87     fUDataMem     = NULL;
    88     fReverseTable = NULL;
    89     fSafeFwdTable = NULL;
    90     fSafeRevTable = NULL;
    91     if (data->fFTableLen != 0) {
    92         fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
    93     }
    94     if (data->fRTableLen != 0) {
    95         fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
    96     }
    97     if (data->fSFTableLen != 0) {
    98         fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
    99     }
   100     if (data->fSRTableLen != 0) {
   101         fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
   102     }
   105     utrie_unserialize(&fTrie,
   106                        (uint8_t *)data + fHeader->fTrie,
   107                        fHeader->fTrieLen,
   108                        &status);
   109     if (U_FAILURE(status)) {
   110         return;
   111     }
   112     fTrie.getFoldingOffset=getFoldingOffset;
   115     fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
   116     fRuleString.setTo(TRUE, fRuleSource, -1);
   117     U_ASSERT(data->fRuleSourceLen > 0);
   119     fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
   120     fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
   122     fRefCount = 1;
   124 #ifdef RBBI_DEBUG
   125     char *debugEnv = getenv("U_RBBIDEBUG");
   126     if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
   127 #endif
   128 }
   131 //-----------------------------------------------------------------------------
   132 //
   133 //    Destructor.     Don't call this - use removeReference() instead.
   134 //
   135 //-----------------------------------------------------------------------------
   136 RBBIDataWrapper::~RBBIDataWrapper() {
   137     U_ASSERT(fRefCount == 0);
   138     if (fUDataMem) {
   139         udata_close(fUDataMem);
   140     } else if (!fDontFreeData) {
   141         uprv_free((void *)fHeader);
   142     }
   143 }
   147 //-----------------------------------------------------------------------------
   148 //
   149 //   Operator ==    Consider two RBBIDataWrappers to be equal if they
   150 //                  refer to the same underlying data.  Although
   151 //                  the data wrappers are normally shared between
   152 //                  iterator instances, it's possible to independently
   153 //                  open the same data twice, and get two instances, which
   154 //                  should still be ==.
   155 //
   156 //-----------------------------------------------------------------------------
   157 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
   158     if (fHeader == other.fHeader) {
   159         return TRUE;
   160     }
   161     if (fHeader->fLength != other.fHeader->fLength) {
   162         return FALSE;
   163     }
   164     if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
   165         return TRUE;
   166     }
   167     return FALSE;
   168 }
   170 int32_t  RBBIDataWrapper::hashCode() {
   171     return fHeader->fFTableLen;
   172 }
   176 //-----------------------------------------------------------------------------
   177 //
   178 //    Reference Counting.   A single RBBIDataWrapper object is shared among
   179 //                          however many RulesBasedBreakIterator instances are
   180 //                          referencing the same data.
   181 //
   182 //-----------------------------------------------------------------------------
   183 void RBBIDataWrapper::removeReference() {
   184     if (umtx_atomic_dec(&fRefCount) == 0) {
   185         delete this;
   186     }
   187 }
   190 RBBIDataWrapper *RBBIDataWrapper::addReference() {
   191    umtx_atomic_inc(&fRefCount);
   192    return this;
   193 }
   197 //-----------------------------------------------------------------------------
   198 //
   199 //  getRuleSourceString
   200 //
   201 //-----------------------------------------------------------------------------
   202 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
   203     return fRuleString;
   204 }
   207 //-----------------------------------------------------------------------------
   208 //
   209 //  print   -  debugging function to dump the runtime data tables.
   210 //
   211 //-----------------------------------------------------------------------------
   212 #ifdef RBBI_DEBUG
   213 void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
   214     uint32_t   c;
   215     uint32_t   s;
   217     RBBIDebugPrintf("   %s\n", heading);
   219     RBBIDebugPrintf("State |  Acc  LA TagIx");
   220     for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
   221     RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
   222         RBBIDebugPrintf("----");
   223     }
   224     RBBIDebugPrintf("\n");
   226     if (table == NULL) {
   227         RBBIDebugPrintf("         N U L L   T A B L E\n\n");
   228         return;
   229     }
   230     for (s=0; s<table->fNumStates; s++) {
   231         RBBIStateTableRow *row = (RBBIStateTableRow *)
   232                                   (table->fTableData + (table->fRowLen * s));
   233         RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
   234         for (c=0; c<fHeader->fCatCount; c++)  {
   235             RBBIDebugPrintf("%3d ", row->fNextState[c]);
   236         }
   237         RBBIDebugPrintf("\n");
   238     }
   239     RBBIDebugPrintf("\n");
   240 }
   241 #endif
   244 #ifdef RBBI_DEBUG
   245 void  RBBIDataWrapper::printData() {
   246     RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
   247     RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
   248                                                     fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
   249     RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
   250     RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
   252     printTable("Forward State Transition Table", fForwardTable);
   253     printTable("Reverse State Transition Table", fReverseTable);
   254     printTable("Safe Forward State Transition Table", fSafeFwdTable);
   255     printTable("Safe Reverse State Transition Table", fSafeRevTable);
   257     RBBIDebugPrintf("\nOrignal Rules source:\n");
   258     for (int32_t c=0; fRuleSource[c] != 0; c++) {
   259         RBBIDebugPrintf("%c", fRuleSource[c]);
   260     }
   261     RBBIDebugPrintf("\n\n");
   262 }
   263 #endif
   266 U_NAMESPACE_END
   267 U_NAMESPACE_USE
   269 //-----------------------------------------------------------------------------
   270 //
   271 //  ubrk_swap   -  byte swap and char encoding swap of RBBI data
   272 //
   273 //-----------------------------------------------------------------------------
   275 U_CAPI int32_t U_EXPORT2
   276 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
   277            UErrorCode *status) {
   279     if (status == NULL || U_FAILURE(*status)) {
   280         return 0;
   281     }
   282     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
   283         *status=U_ILLEGAL_ARGUMENT_ERROR;
   284         return 0;
   285     }
   287     //
   288     //  Check that the data header is for for break data.
   289     //    (Header contents are defined in genbrk.cpp)
   290     //
   291     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
   292     if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
   293            pInfo->dataFormat[1]==0x72 &&
   294            pInfo->dataFormat[2]==0x6b &&
   295            pInfo->dataFormat[3]==0x20 &&
   296            pInfo->formatVersion[0]==3  )) {
   297         udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
   298                          pInfo->dataFormat[0], pInfo->dataFormat[1],
   299                          pInfo->dataFormat[2], pInfo->dataFormat[3],
   300                          pInfo->formatVersion[0]);
   301         *status=U_UNSUPPORTED_ERROR;
   302         return 0;
   303     }
   305     //
   306     // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
   307     //                         RBBIDataHeader).  This swap also conveniently gets us
   308     //                         the size of the ICU d.h., which lets us locate the start
   309     //                         of the RBBI specific data.
   310     //
   311     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
   314     //
   315     // Get the RRBI Data Header, and check that it appears to be OK.
   316     //
   317     //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually 
   318     //           an int32_t with a value of 1.  Starting with ICU 3.4,
   319     //           RBBI's fDataFormat matches the dataFormat field from the
   320     //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
   321     //
   322     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
   323     RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
   324     if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || 
   325         rbbiDH->fFormatVersion[0] != 3 ||
   326         ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader)) 
   327     {
   328         udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
   329         *status=U_UNSUPPORTED_ERROR;
   330         return 0;
   331     }
   333     //
   334     // Prefight operation?  Just return the size
   335     //
   336     int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
   337     int32_t totalSize = headerSize + breakDataLength;
   338     if (length < 0) {
   339         return totalSize;
   340     }
   342     //
   343     // Check that length passed in is consistent with length from RBBI data header.
   344     //
   345     if (length < totalSize) {
   346         udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
   347                             breakDataLength);
   348         *status=U_INDEX_OUTOFBOUNDS_ERROR;
   349         return 0;
   350         }
   353     //
   354     // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
   355     //                 we need to reference the header to locate the data, and an
   356     //                 inplace swap of the header leaves it unusable.
   357     //
   358     uint8_t         *outBytes = (uint8_t *)outData + headerSize;
   359     RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
   361     int32_t   tableStartOffset;
   362     int32_t   tableLength;
   364     //
   365     // If not swapping in place, zero out the output buffer before starting.
   366     //    Individual tables and other data items within are aligned to 8 byte boundaries
   367     //    when originally created.  Any unused space between items needs to be zero.
   368     //
   369     if (inBytes != outBytes) {
   370         uprv_memset(outBytes, 0, breakDataLength);
   371     }
   373     //
   374     // Each state table begins with several 32 bit fields.  Calculate the size
   375     //   in bytes of these.
   376     //
   377     int32_t         topSize = offsetof(RBBIStateTable, fTableData);
   379     // Forward state table.  
   380     tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
   381     tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
   383     if (tableLength > 0) {
   384         ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
   385                             outBytes+tableStartOffset, status);
   386         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
   387                             outBytes+tableStartOffset+topSize, status);
   388     }
   390     // Reverse state table.  Same layout as forward table, above.
   391     tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
   392     tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
   394     if (tableLength > 0) {
   395         ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
   396                             outBytes+tableStartOffset, status);
   397         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
   398                             outBytes+tableStartOffset+topSize, status);
   399     }
   401     // Safe Forward state table.  Same layout as forward table, above.
   402     tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
   403     tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
   405     if (tableLength > 0) {
   406         ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
   407                             outBytes+tableStartOffset, status);
   408         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
   409                             outBytes+tableStartOffset+topSize, status);
   410     }
   412     // Safe Reverse state table.  Same layout as forward table, above.
   413     tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
   414     tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
   416     if (tableLength > 0) {
   417         ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
   418                             outBytes+tableStartOffset, status);
   419         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
   420                             outBytes+tableStartOffset+topSize, status);
   421     }
   423     // Trie table for character categories
   424     utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
   425                             outBytes+ds->readUInt32(rbbiDH->fTrie), status);
   427     // Source Rules Text.  It's UChar data
   428     ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
   429                         outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
   431     // Table of rule status values.  It's all int_32 values
   432     ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
   433                         outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
   435     // And, last, the header.
   436     //   It is all int32_t values except for fFormataVersion, which is an array of four bytes.
   437     //   Swap the whole thing as int32_t, then re-swap the one field.
   438     //
   439     ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
   440     ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
   442     return totalSize;
   443 }
   446 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial