intl/icu/source/i18n/ucol_wgt.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*  
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 1999-2011, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  ucol_wgt.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2001mar08
    14 *   created by: Markus W. Scherer
    15 *
    16 *   This file contains code for allocating n collation element weights
    17 *   between two exclusive limits.
    18 *   It is used only internally by ucol_bld.
    19 */
    21 #include "unicode/utypes.h"
    23 #if !UCONFIG_NO_COLLATION
    25 #include "ucol_imp.h"
    26 #include "ucol_wgt.h"
    27 #include "cmemory.h"
    28 #include "uarrsort.h"
    30 #ifdef UCOL_DEBUG
    31 #   include <stdio.h>
    32 #endif
    34 /* collation element weight allocation -------------------------------------- */
    36 /* helper functions for CE weights */
    38 static inline int32_t
    39 lengthOfWeight(uint32_t weight) {
    40     if((weight&0xffffff)==0) {
    41         return 1;
    42     } else if((weight&0xffff)==0) {
    43         return 2;
    44     } else if((weight&0xff)==0) {
    45         return 3;
    46     } else {
    47         return 4;
    48     }
    49 }
    51 static inline uint32_t
    52 getWeightTrail(uint32_t weight, int32_t length) {
    53     return (uint32_t)(weight>>(8*(4-length)))&0xff;
    54 }
    56 static inline uint32_t
    57 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
    58     length=8*(4-length);
    59     return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
    60 }
    62 static inline uint32_t
    63 getWeightByte(uint32_t weight, int32_t idx) {
    64     return getWeightTrail(weight, idx); /* same calculation */
    65 }
    67 static inline uint32_t
    68 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
    69     uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
    71     idx*=8;
    72     if(idx<32) {
    73         mask=((uint32_t)0xffffffff)>>idx;
    74     } else {
    75         // Do not use uint32_t>>32 because on some platforms that does not shift at all
    76         // while we need it to become 0.
    77         // PowerPC: 0xffffffff>>32 = 0           (wanted)
    78         // x86:     0xffffffff>>32 = 0xffffffff  (not wanted)
    79         //
    80         // ANSI C99 6.5.7 Bitwise shift operators:
    81         // "If the value of the right operand is negative
    82         // or is greater than or equal to the width of the promoted left operand,
    83         // the behavior is undefined."
    84         mask=0;
    85     }
    86     idx=32-idx;
    87     mask|=0xffffff00<<idx;
    88     return (uint32_t)((weight&mask)|(byte<<idx));
    89 }
    91 static inline uint32_t
    92 truncateWeight(uint32_t weight, int32_t length) {
    93     return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
    94 }
    96 static inline uint32_t
    97 incWeightTrail(uint32_t weight, int32_t length) {
    98     return (uint32_t)(weight+(1UL<<(8*(4-length))));
    99 }
   101 static inline uint32_t
   102 decWeightTrail(uint32_t weight, int32_t length) {
   103     return (uint32_t)(weight-(1UL<<(8*(4-length))));
   104 }
   106 static inline uint32_t
   107 incWeight(uint32_t weight, int32_t length, uint32_t maxByte) {
   108     uint32_t byte;
   110     for(;;) {
   111         byte=getWeightByte(weight, length);
   112         if(byte<maxByte) {
   113             return setWeightByte(weight, length, byte+1);
   114         } else {
   115             /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */
   116             weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED);
   117             --length;
   118         }
   119     }
   120 }
   122 static inline int32_t
   123 lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) {
   124     int32_t length;
   126     length=range->length2+1;
   127     range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED);
   128     range->end=setWeightTrail(range->end, length, maxByte);
   129     range->count2*=countBytes;
   130     range->length2=length;
   131     return length;
   132 }
   134 /* for uprv_sortArray: sort ranges in weight order */
   135 static int32_t U_CALLCONV
   136 compareRanges(const void * /*context*/, const void *left, const void *right) {
   137     uint32_t l, r;
   139     l=((const WeightRange *)left)->start;
   140     r=((const WeightRange *)right)->start;
   141     if(l<r) {
   142         return -1;
   143     } else if(l>r) {
   144         return 1;
   145     } else {
   146         return 0;
   147     }
   148 }
   150 /*
   151  * take two CE weights and calculate the
   152  * possible ranges of weights between the two limits, excluding them
   153  * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
   154  */
   155 static inline int32_t
   156 getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit,
   157                 uint32_t maxByte, uint32_t countBytes,
   158                 WeightRange ranges[7]) {
   159     WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
   160     uint32_t weight, trail;
   161     int32_t length, lowerLength, upperLength, rangeCount;
   163     /* assume that both lowerLimit & upperLimit are not 0 */
   165     /* get the lengths of the limits */
   166     lowerLength=lengthOfWeight(lowerLimit);
   167     upperLength=lengthOfWeight(upperLimit);
   169 #ifdef UCOL_DEBUG
   170     printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
   171     printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
   172 #endif
   174     if(lowerLimit>=upperLimit) {
   175 #ifdef UCOL_DEBUG
   176         printf("error: no space between lower & upper limits\n");
   177 #endif
   178         return 0;
   179     }
   181     /* check that neither is a prefix of the other */
   182     if(lowerLength<upperLength) {
   183         if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
   184 #ifdef UCOL_DEBUG
   185             printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
   186 #endif
   187             return 0;
   188         }
   189     }
   190     /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
   192     /* reset local variables */
   193     uprv_memset(lower, 0, sizeof(lower));
   194     uprv_memset(&middle, 0, sizeof(middle));
   195     uprv_memset(upper, 0, sizeof(upper));
   197     /*
   198      * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
   199      * range     minimum length
   200      * lower[4]  4
   201      * lower[3]  3
   202      * lower[2]  2
   203      * middle    1
   204      * upper[2]  2
   205      * upper[3]  3
   206      * upper[4]  4
   207      *
   208      * We are now going to calculate up to 7 ranges.
   209      * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
   210      */
   211     weight=lowerLimit;
   212     for(length=lowerLength; length>=2; --length) {
   213         trail=getWeightTrail(weight, length);
   214         if(trail<maxByte) {
   215             lower[length].start=incWeightTrail(weight, length);
   216             lower[length].end=setWeightTrail(weight, length, maxByte);
   217             lower[length].length=length;
   218             lower[length].count=maxByte-trail;
   219         }
   220         weight=truncateWeight(weight, length-1);
   221     }
   222     middle.start=incWeightTrail(weight, 1);
   224     weight=upperLimit;
   225     for(length=upperLength; length>=2; --length) {
   226         trail=getWeightTrail(weight, length);
   227         if(trail>UCOL_BYTE_FIRST_TAILORED) {
   228             upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED);
   229             upper[length].end=decWeightTrail(weight, length);
   230             upper[length].length=length;
   231             upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED;
   232         }
   233         weight=truncateWeight(weight, length-1);
   234     }
   235     middle.end=decWeightTrail(weight, 1);
   237     /* set the middle range */
   238     middle.length=1;
   239     if(middle.end>=middle.start) {
   240         middle.count=(int32_t)((middle.end-middle.start)>>24)+1;
   241     } else {
   242         /* eliminate overlaps */
   243         uint32_t start, end;
   245         /* remove the middle range */
   246         middle.count=0;
   248         /* reduce or remove the lower ranges that go beyond upperLimit */
   249         for(length=4; length>=2; --length) {
   250             if(lower[length].count>0 && upper[length].count>0) {
   251                 start=upper[length].start;
   252                 end=lower[length].end;
   254                 if(end>=start || incWeight(end, length, maxByte)==start) {
   255                     /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
   256                     start=lower[length].start;
   257                     end=lower[length].end=upper[length].end;
   258                     /*
   259                      * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
   260                      * it may result in a range with count>countBytes
   261                      */
   262                     lower[length].count=
   263                         (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
   264                                   countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
   265                     upper[length].count=0;
   266                     while(--length>=2) {
   267                         lower[length].count=upper[length].count=0;
   268                     }
   269                     break;
   270                 }
   271             }
   272         }
   273     }
   275 #ifdef UCOL_DEBUG
   276     /* print ranges */
   277     for(length=4; length>=2; --length) {
   278         if(lower[length].count>0) {
   279             printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
   280         }
   281     }
   282     if(middle.count>0) {
   283         printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
   284     }
   285     for(length=2; length<=4; ++length) {
   286         if(upper[length].count>0) {
   287             printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
   288         }
   289     }
   290 #endif
   292     /* copy the ranges, shortest first, into the result array */
   293     rangeCount=0;
   294     if(middle.count>0) {
   295         uprv_memcpy(ranges, &middle, sizeof(WeightRange));
   296         rangeCount=1;
   297     }
   298     for(length=2; length<=4; ++length) {
   299         /* copy upper first so that later the middle range is more likely the first one to use */
   300         if(upper[length].count>0) {
   301             uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
   302             ++rangeCount;
   303         }
   304         if(lower[length].count>0) {
   305             uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
   306             ++rangeCount;
   307         }
   308     }
   309     return rangeCount;
   310 }
   312 /*
   313  * call getWeightRanges and then determine heuristically
   314  * which ranges to use for a given number of weights between (excluding)
   315  * two limits
   316  */
   317 U_CFUNC int32_t
   318 ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
   319                   uint32_t n,
   320                   uint32_t maxByte,
   321                   WeightRange ranges[7]) {
   322     /* number of usable byte values 3..maxByte */
   323     uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1;
   325     uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */
   326     uint32_t maxCount;
   327     int32_t i, rangeCount, minLength/*, maxLength*/;
   329     /* countBytes to the power of index */
   330     uint32_t powers[5];
   331     /* gcc requires explicit initialization */
   332     powers[0] = 1;
   333     powers[1] = countBytes;
   334     powers[2] = countBytes*countBytes;
   335     powers[3] = countBytes*countBytes*countBytes;
   336     powers[4] = countBytes*countBytes*countBytes*countBytes;
   338 #ifdef UCOL_DEBUG
   339     puts("");
   340 #endif
   342     rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges);
   343     if(rangeCount<=0) {
   344 #ifdef UCOL_DEBUG
   345         printf("error: unable to get Weight ranges\n");
   346 #endif
   347         return 0;
   348     }
   350     /* what is the maximum number of weights with these ranges? */
   351     maxCount=0;
   352     for(i=0; i<rangeCount; ++i) {
   353         maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length];
   354     }
   355     if(maxCount>=n) {
   356 #ifdef UCOL_DEBUG
   357         printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n);
   358 #endif
   359     } else {
   360 #ifdef UCOL_DEBUG
   361         printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n);
   362 #endif
   363         return 0;
   364     }
   366     /* set the length2 and count2 fields */
   367     for(i=0; i<rangeCount; ++i) {
   368         ranges[i].length2=ranges[i].length;
   369         ranges[i].count2=(uint32_t)ranges[i].count;
   370     }
   372     /* try until we find suitably large ranges */
   373     for(;;) {
   374         /* get the smallest number of bytes in a range */
   375         minLength=ranges[0].length2;
   377         /* sum up the number of elements that fit into ranges of each byte length */
   378         uprv_memset(lengthCounts, 0, sizeof(lengthCounts));
   379         for(i=0; i<rangeCount; ++i) {
   380             lengthCounts[ranges[i].length2]+=ranges[i].count2;
   381         }
   383         /* now try to allocate n elements in the available short ranges */
   384         if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) {
   385             /* trivial cases, use the first few ranges */
   386             maxCount=0;
   387             rangeCount=0;
   388             do {
   389                 maxCount+=ranges[rangeCount].count2;
   390                 ++rangeCount;
   391             } while(n>maxCount);
   392 #ifdef UCOL_DEBUG
   393             printf("take first %ld ranges\n", rangeCount);
   394 #endif
   395             break;
   396         } else if(n<=ranges[0].count2*countBytes) {
   397             /* easy case, just make this one range large enough by lengthening it once more, possibly split it */
   398             uint32_t count1, count2, power_1, power;
   400             /*maxLength=minLength+1;*/
   402             /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */
   403             power_1=powers[minLength-ranges[0].length];
   404             power=power_1*countBytes;
   405             count2=(n+power-1)/power;
   406             count1=ranges[0].count-count2;
   408             /* split the range */
   409 #ifdef UCOL_DEBUG
   410             printf("split the first range %ld:%ld\n", count1, count2);
   411 #endif
   412             if(count1<1) {
   413                 rangeCount=1;
   415                 /* lengthen the entire range to maxLength */
   416                 lengthenRange(ranges, maxByte, countBytes);
   417             } else {
   418                 /* really split the range */
   419                 uint32_t byte;
   421                 /* create a new range with the end and initial and current length of the old one */
   422                 rangeCount=2;
   423                 ranges[1].end=ranges[0].end;
   424                 ranges[1].length=ranges[0].length;
   425                 ranges[1].length2=minLength;
   427                 /* set the end of the first range according to count1 */
   428                 i=ranges[0].length;
   429                 byte=getWeightByte(ranges[0].start, i)+count1-1;
   431                 /*
   432                  * ranges[0].count and count1 may be >countBytes
   433                  * from merging adjacent ranges;
   434                  * byte>maxByte is possible
   435                  */
   436                 if(byte<=maxByte) {
   437                     ranges[0].end=setWeightByte(ranges[0].start, i, byte);
   438                 } else /* byte>maxByte */ {
   439                     ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes);
   440                 }
   442                 /* set the bytes in the end weight at length+1..length2 to maxByte */
   443                 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */
   444                 ranges[0].end=truncateWeight(ranges[0].end, i)|
   445                               ((byte>>(8*i))&(byte<<(8*(4-minLength))));
   447                 /* set the start of the second range to immediately follow the end of the first one */
   448                 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte);
   450                 /* set the count values (informational) */
   451                 ranges[0].count=count1;
   452                 ranges[1].count=count2;
   454                 ranges[0].count2=count1*power_1;
   455                 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */
   457                 /* lengthen the second range to maxLength */
   458                 lengthenRange(ranges+1, maxByte, countBytes);
   459             }
   460             break;
   461         }
   463         /* no good match, lengthen all minLength ranges and iterate */
   464 #ifdef UCOL_DEBUG
   465         printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
   466 #endif
   467         for(i=0; ranges[i].length2==minLength; ++i) {
   468             lengthenRange(ranges+i, maxByte, countBytes);
   469         }
   470     }
   472     if(rangeCount>1) {
   473         /* sort the ranges by weight values */
   474         UErrorCode errorCode=U_ZERO_ERROR;
   475         uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), compareRanges, NULL, FALSE, &errorCode);
   476         /* ignore error code: we know that the internal sort function will not fail here */
   477     }
   479 #ifdef UCOL_DEBUG
   480     puts("final ranges:");
   481     for(i=0; i<rangeCount; ++i) {
   482         printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n",
   483                i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2);
   484     }
   485 #endif
   487     /* set maxByte in ranges[0] for ucol_nextWeight() */
   488     ranges[0].count=maxByte;
   490     return rangeCount;
   491 }
   493 /*
   494  * given a set of ranges calculated by ucol_allocWeights(),
   495  * iterate through the weights
   496  */
   497 U_CFUNC uint32_t
   498 ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
   499     if(*pRangeCount<=0) {
   500         return 0xffffffff;
   501     } else {
   502         uint32_t weight, maxByte;
   504         /* get maxByte from the .count field */
   505         maxByte=ranges[0].count;
   507         /* get the next weight */
   508         weight=ranges[0].start;
   509         if(weight==ranges[0].end) {
   510             /* this range is finished, remove it and move the following ones up */
   511             if(--*pRangeCount>0) {
   512                 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange));
   513                 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */
   514             }
   515         } else {
   516             /* increment the weight for the next value */
   517             ranges[0].start=incWeight(weight, ranges[0].length2, maxByte);
   518         }
   520         return weight;
   521     }
   522 }
   524 #if 0 // #ifdef UCOL_DEBUG
   526 static void
   527 testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
   528     WeightRange ranges[8];
   529     int32_t rangeCount;
   531     rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges);
   532     if(enumerate) {
   533         uint32_t weight;
   535         while(n>0) {
   536             weight=ucol_nextWeight(ranges, &rangeCount);
   537             if(weight==0xffffffff) {
   538                 printf("error: 0xffffffff with %lu more weights to go\n", n);
   539                 break;
   540             }
   541             printf("    0x%08lx\n", weight);
   542             --n;
   543         }
   544     }
   545 }
   547 extern int
   548 main(int argc, const char *argv[]) {
   549 #if 0
   550 #endif
   551     testAlloc(0x364214fc, 0x44b87d23, 5, FALSE);
   552     testAlloc(0x36421500, 0x44b87d23, 5, FALSE);
   553     testAlloc(0x36421500, 0x44b87d23, 20, FALSE);
   554     testAlloc(0x36421500, 0x44b87d23, 13700, FALSE);
   555     testAlloc(0x36421500, 0x38b87d23, 1, FALSE);
   556     testAlloc(0x36421500, 0x38b87d23, 20, FALSE);
   557     testAlloc(0x36421500, 0x38b87d23, 200, TRUE);
   558     testAlloc(0x36421500, 0x38b87d23, 13700, FALSE);
   559     testAlloc(0x36421500, 0x37b87d23, 13700, FALSE);
   560     testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE);
   561     testAlloc(0x36421500, 0x36b87d23, 13700, FALSE);
   562     testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE);
   563     testAlloc(0x49000000, 0x4a600000, 13700, FALSE);
   564     testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE);
   565     testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE);
   566     testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE);
   567     testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE);
   568     testAlloc(0xa0000000, 0xa0030000, 40000, FALSE);
   569     testAlloc(0xa0031100, 0xa0030000, 40000, FALSE);
   570 #if 0
   571 #endif
   572     return 0;
   573 }
   575 #endif
   577 #endif /* #if !UCONFIG_NO_COLLATION */

mercurial