intl/icu/source/common/ucharstrie.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 *******************************************************************************
     3 *   Copyright (C) 2010-2011, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 *******************************************************************************
     6 *   file name:  ucharstrie.h
     7 *   encoding:   US-ASCII
     8 *   tab size:   8 (not used)
     9 *   indentation:4
    10 *
    11 *   created on: 2010nov14
    12 *   created by: Markus W. Scherer
    13 */
    15 #include "unicode/utypes.h"
    16 #include "unicode/appendable.h"
    17 #include "unicode/ucharstrie.h"
    18 #include "unicode/uobject.h"
    19 #include "unicode/utf16.h"
    20 #include "cmemory.h"
    21 #include "uassert.h"
    23 U_NAMESPACE_BEGIN
    25 UCharsTrie::~UCharsTrie() {
    26     uprv_free(ownedArray_);
    27 }
    29 UStringTrieResult
    30 UCharsTrie::current() const {
    31     const UChar *pos=pos_;
    32     if(pos==NULL) {
    33         return USTRINGTRIE_NO_MATCH;
    34     } else {
    35         int32_t node;
    36         return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
    37                 valueResult(node) : USTRINGTRIE_NO_VALUE;
    38     }
    39 }
    41 UStringTrieResult
    42 UCharsTrie::firstForCodePoint(UChar32 cp) {
    43     return cp<=0xffff ?
    44         first(cp) :
    45         (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
    46             next(U16_TRAIL(cp)) :
    47             USTRINGTRIE_NO_MATCH);
    48 }
    50 UStringTrieResult
    51 UCharsTrie::nextForCodePoint(UChar32 cp) {
    52     return cp<=0xffff ?
    53         next(cp) :
    54         (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
    55             next(U16_TRAIL(cp)) :
    56             USTRINGTRIE_NO_MATCH);
    57 }
    59 UStringTrieResult
    60 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
    61     // Branch according to the current unit.
    62     if(length==0) {
    63         length=*pos++;
    64     }
    65     ++length;
    66     // The length of the branch is the number of units to select from.
    67     // The data structure encodes a binary search.
    68     while(length>kMaxBranchLinearSubNodeLength) {
    69         if(uchar<*pos++) {
    70             length>>=1;
    71             pos=jumpByDelta(pos);
    72         } else {
    73             length=length-(length>>1);
    74             pos=skipDelta(pos);
    75         }
    76     }
    77     // Drop down to linear search for the last few units.
    78     // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
    79     // and divides length by 2.
    80     do {
    81         if(uchar==*pos++) {
    82             UStringTrieResult result;
    83             int32_t node=*pos;
    84             if(node&kValueIsFinal) {
    85                 // Leave the final value for getValue() to read.
    86                 result=USTRINGTRIE_FINAL_VALUE;
    87             } else {
    88                 // Use the non-final value as the jump delta.
    89                 ++pos;
    90                 // int32_t delta=readValue(pos, node);
    91                 int32_t delta;
    92                 if(node<kMinTwoUnitValueLead) {
    93                     delta=node;
    94                 } else if(node<kThreeUnitValueLead) {
    95                     delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
    96                 } else {
    97                     delta=(pos[0]<<16)|pos[1];
    98                     pos+=2;
    99                 }
   100                 // end readValue()
   101                 pos+=delta;
   102                 node=*pos;
   103                 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
   104             }
   105             pos_=pos;
   106             return result;
   107         }
   108         --length;
   109         pos=skipValue(pos);
   110     } while(length>1);
   111     if(uchar==*pos++) {
   112         pos_=pos;
   113         int32_t node=*pos;
   114         return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
   115     } else {
   116         stop();
   117         return USTRINGTRIE_NO_MATCH;
   118     }
   119 }
   121 UStringTrieResult
   122 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) {
   123     int32_t node=*pos++;
   124     for(;;) {
   125         if(node<kMinLinearMatch) {
   126             return branchNext(pos, node, uchar);
   127         } else if(node<kMinValueLead) {
   128             // Match the first of length+1 units.
   129             int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
   130             if(uchar==*pos++) {
   131                 remainingMatchLength_=--length;
   132                 pos_=pos;
   133                 return (length<0 && (node=*pos)>=kMinValueLead) ?
   134                         valueResult(node) : USTRINGTRIE_NO_VALUE;
   135             } else {
   136                 // No match.
   137                 break;
   138             }
   139         } else if(node&kValueIsFinal) {
   140             // No further matching units.
   141             break;
   142         } else {
   143             // Skip intermediate value.
   144             pos=skipNodeValue(pos, node);
   145             node&=kNodeTypeMask;
   146         }
   147     }
   148     stop();
   149     return USTRINGTRIE_NO_MATCH;
   150 }
   152 UStringTrieResult
   153 UCharsTrie::next(int32_t uchar) {
   154     const UChar *pos=pos_;
   155     if(pos==NULL) {
   156         return USTRINGTRIE_NO_MATCH;
   157     }
   158     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
   159     if(length>=0) {
   160         // Remaining part of a linear-match node.
   161         if(uchar==*pos++) {
   162             remainingMatchLength_=--length;
   163             pos_=pos;
   164             int32_t node;
   165             return (length<0 && (node=*pos)>=kMinValueLead) ?
   166                     valueResult(node) : USTRINGTRIE_NO_VALUE;
   167         } else {
   168             stop();
   169             return USTRINGTRIE_NO_MATCH;
   170         }
   171     }
   172     return nextImpl(pos, uchar);
   173 }
   175 UStringTrieResult
   176 UCharsTrie::next(const UChar *s, int32_t sLength) {
   177     if(sLength<0 ? *s==0 : sLength==0) {
   178         // Empty input.
   179         return current();
   180     }
   181     const UChar *pos=pos_;
   182     if(pos==NULL) {
   183         return USTRINGTRIE_NO_MATCH;
   184     }
   185     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
   186     for(;;) {
   187         // Fetch the next input unit, if there is one.
   188         // Continue a linear-match node without rechecking sLength<0.
   189         int32_t uchar;
   190         if(sLength<0) {
   191             for(;;) {
   192                 if((uchar=*s++)==0) {
   193                     remainingMatchLength_=length;
   194                     pos_=pos;
   195                     int32_t node;
   196                     return (length<0 && (node=*pos)>=kMinValueLead) ?
   197                             valueResult(node) : USTRINGTRIE_NO_VALUE;
   198                 }
   199                 if(length<0) {
   200                     remainingMatchLength_=length;
   201                     break;
   202                 }
   203                 if(uchar!=*pos) {
   204                     stop();
   205                     return USTRINGTRIE_NO_MATCH;
   206                 }
   207                 ++pos;
   208                 --length;
   209             }
   210         } else {
   211             for(;;) {
   212                 if(sLength==0) {
   213                     remainingMatchLength_=length;
   214                     pos_=pos;
   215                     int32_t node;
   216                     return (length<0 && (node=*pos)>=kMinValueLead) ?
   217                             valueResult(node) : USTRINGTRIE_NO_VALUE;
   218                 }
   219                 uchar=*s++;
   220                 --sLength;
   221                 if(length<0) {
   222                     remainingMatchLength_=length;
   223                     break;
   224                 }
   225                 if(uchar!=*pos) {
   226                     stop();
   227                     return USTRINGTRIE_NO_MATCH;
   228                 }
   229                 ++pos;
   230                 --length;
   231             }
   232         }
   233         int32_t node=*pos++;
   234         for(;;) {
   235             if(node<kMinLinearMatch) {
   236                 UStringTrieResult result=branchNext(pos, node, uchar);
   237                 if(result==USTRINGTRIE_NO_MATCH) {
   238                     return USTRINGTRIE_NO_MATCH;
   239                 }
   240                 // Fetch the next input unit, if there is one.
   241                 if(sLength<0) {
   242                     if((uchar=*s++)==0) {
   243                         return result;
   244                     }
   245                 } else {
   246                     if(sLength==0) {
   247                         return result;
   248                     }
   249                     uchar=*s++;
   250                     --sLength;
   251                 }
   252                 if(result==USTRINGTRIE_FINAL_VALUE) {
   253                     // No further matching units.
   254                     stop();
   255                     return USTRINGTRIE_NO_MATCH;
   256                 }
   257                 pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
   258                 node=*pos++;
   259             } else if(node<kMinValueLead) {
   260                 // Match length+1 units.
   261                 length=node-kMinLinearMatch;  // Actual match length minus 1.
   262                 if(uchar!=*pos) {
   263                     stop();
   264                     return USTRINGTRIE_NO_MATCH;
   265                 }
   266                 ++pos;
   267                 --length;
   268                 break;
   269             } else if(node&kValueIsFinal) {
   270                 // No further matching units.
   271                 stop();
   272                 return USTRINGTRIE_NO_MATCH;
   273             } else {
   274                 // Skip intermediate value.
   275                 pos=skipNodeValue(pos, node);
   276                 node&=kNodeTypeMask;
   277             }
   278         }
   279     }
   280 }
   282 const UChar *
   283 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
   284                                       UBool haveUniqueValue, int32_t &uniqueValue) {
   285     while(length>kMaxBranchLinearSubNodeLength) {
   286         ++pos;  // ignore the comparison unit
   287         if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
   288             return NULL;
   289         }
   290         length=length-(length>>1);
   291         pos=skipDelta(pos);
   292     }
   293     do {
   294         ++pos;  // ignore a comparison unit
   295         // handle its value
   296         int32_t node=*pos++;
   297         UBool isFinal=(UBool)(node>>15);
   298         node&=0x7fff;
   299         int32_t value=readValue(pos, node);
   300         pos=skipValue(pos, node);
   301         if(isFinal) {
   302             if(haveUniqueValue) {
   303                 if(value!=uniqueValue) {
   304                     return NULL;
   305                 }
   306             } else {
   307                 uniqueValue=value;
   308                 haveUniqueValue=TRUE;
   309             }
   310         } else {
   311             if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
   312                 return NULL;
   313             }
   314             haveUniqueValue=TRUE;
   315         }
   316     } while(--length>1);
   317     return pos+1;  // ignore the last comparison unit
   318 }
   320 UBool
   321 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
   322     int32_t node=*pos++;
   323     for(;;) {
   324         if(node<kMinLinearMatch) {
   325             if(node==0) {
   326                 node=*pos++;
   327             }
   328             pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
   329             if(pos==NULL) {
   330                 return FALSE;
   331             }
   332             haveUniqueValue=TRUE;
   333             node=*pos++;
   334         } else if(node<kMinValueLead) {
   335             // linear-match node
   336             pos+=node-kMinLinearMatch+1;  // Ignore the match units.
   337             node=*pos++;
   338         } else {
   339             UBool isFinal=(UBool)(node>>15);
   340             int32_t value;
   341             if(isFinal) {
   342                 value=readValue(pos, node&0x7fff);
   343             } else {
   344                 value=readNodeValue(pos, node);
   345             }
   346             if(haveUniqueValue) {
   347                 if(value!=uniqueValue) {
   348                     return FALSE;
   349                 }
   350             } else {
   351                 uniqueValue=value;
   352                 haveUniqueValue=TRUE;
   353             }
   354             if(isFinal) {
   355                 return TRUE;
   356             }
   357             pos=skipNodeValue(pos, node);
   358             node&=kNodeTypeMask;
   359         }
   360     }
   361 }
   363 int32_t
   364 UCharsTrie::getNextUChars(Appendable &out) const {
   365     const UChar *pos=pos_;
   366     if(pos==NULL) {
   367         return 0;
   368     }
   369     if(remainingMatchLength_>=0) {
   370         out.appendCodeUnit(*pos);  // Next unit of a pending linear-match node.
   371         return 1;
   372     }
   373     int32_t node=*pos++;
   374     if(node>=kMinValueLead) {
   375         if(node&kValueIsFinal) {
   376             return 0;
   377         } else {
   378             pos=skipNodeValue(pos, node);
   379             node&=kNodeTypeMask;
   380         }
   381     }
   382     if(node<kMinLinearMatch) {
   383         if(node==0) {
   384             node=*pos++;
   385         }
   386         out.reserveAppendCapacity(++node);
   387         getNextBranchUChars(pos, node, out);
   388         return node;
   389     } else {
   390         // First unit of the linear-match node.
   391         out.appendCodeUnit(*pos);
   392         return 1;
   393     }
   394 }
   396 void
   397 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
   398     while(length>kMaxBranchLinearSubNodeLength) {
   399         ++pos;  // ignore the comparison unit
   400         getNextBranchUChars(jumpByDelta(pos), length>>1, out);
   401         length=length-(length>>1);
   402         pos=skipDelta(pos);
   403     }
   404     do {
   405         out.appendCodeUnit(*pos++);
   406         pos=skipValue(pos);
   407     } while(--length>1);
   408     out.appendCodeUnit(*pos);
   409 }
   411 U_NAMESPACE_END

mercurial