intl/icu/source/i18n/bocsu.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/bocsu.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,154 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*   Copyright (C) 2001-2011, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +*******************************************************************************
     1.9 +*   file name:  bocsu.cpp
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   Author: Markus W. Scherer
    1.15 +*
    1.16 +*   Modification history:
    1.17 +*   05/18/2001  weiv    Made into separate module
    1.18 +*/
    1.19 +
    1.20 +
    1.21 +#include "unicode/utypes.h"
    1.22 +
    1.23 +#if !UCONFIG_NO_COLLATION
    1.24 +
    1.25 +#include "unicode/bytestream.h"
    1.26 +#include "unicode/utf16.h"
    1.27 +#include "bocsu.h"
    1.28 +
    1.29 +/*
    1.30 + * encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
    1.31 + * preserving lexical order
    1.32 + */
    1.33 +U_CFUNC uint8_t *
    1.34 +u_writeDiff(int32_t diff, uint8_t *p) { // writes the encoded bytes for diff starting at p; returns the position just past them
    1.35 +    if(diff>=SLOPE_REACH_NEG_1) { // one-byte range, or above it (the *_POS_* multi-byte forms)
    1.36 +        if(diff<=SLOPE_REACH_POS_1) { // one byte: diff stored as an offset from SLOPE_MIDDLE
    1.37 +            *p++=(uint8_t)(SLOPE_MIDDLE+diff);
    1.38 +        } else if(diff<=SLOPE_REACH_POS_2) { // two bytes: lead byte from the quotient, trail byte from the remainder
    1.39 +            *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT));
    1.40 +            *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
    1.41 +        } else if(diff<=SLOPE_REACH_POS_3) { // three bytes, filled from the last byte backwards
    1.42 +            p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); // lowest digit in base SLOPE_TAIL_COUNT
    1.43 +            diff/=SLOPE_TAIL_COUNT;
    1.44 +            p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); // next digit
    1.45 +            *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT)); // lead byte from the remaining quotient
    1.46 +            p+=3;
    1.47 +        } else { // above SLOPE_REACH_POS_3: four bytes
    1.48 +            p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
    1.49 +            diff/=SLOPE_TAIL_COUNT;
    1.50 +            p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
    1.51 +            diff/=SLOPE_TAIL_COUNT;
    1.52 +            p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
    1.53 +            *p=SLOPE_MAX; // fixed lead byte for the four-byte positive form; no quotient is stored in it
    1.54 +            p+=4;
    1.55 +        }
    1.56 +    } else { // diff<SLOPE_REACH_NEG_1: the multi-byte *_NEG_* forms
    1.57 +        int32_t m; // set by NEGDIVMOD, then used as the trail-byte digit
    1.58 +
    1.59 +        if(diff>=SLOPE_REACH_NEG_2) { // two bytes
    1.60 +            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); // macro defined outside this file; as used here it appears to leave the quotient in diff and the remainder in m - confirm in bocsu.h
    1.61 +            *p++=(uint8_t)(SLOPE_START_NEG_2+diff); // lead byte from the updated diff
    1.62 +            *p++=(uint8_t)(SLOPE_MIN+m);
    1.63 +        } else if(diff>=SLOPE_REACH_NEG_3) { // three bytes, filled from the last byte backwards
    1.64 +            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
    1.65 +            p[2]=(uint8_t)(SLOPE_MIN+m);
    1.66 +            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
    1.67 +            p[1]=(uint8_t)(SLOPE_MIN+m);
    1.68 +            *p=(uint8_t)(SLOPE_START_NEG_3+diff); // lead byte from the updated diff
    1.69 +            p+=3;
    1.70 +        } else { // below SLOPE_REACH_NEG_3: four bytes
    1.71 +            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
    1.72 +            p[3]=(uint8_t)(SLOPE_MIN+m);
    1.73 +            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
    1.74 +            p[2]=(uint8_t)(SLOPE_MIN+m);
    1.75 +            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
    1.76 +            p[1]=(uint8_t)(SLOPE_MIN+m);
    1.77 +            *p=SLOPE_MIN; // fixed lead byte for the four-byte negative form
    1.78 +            p+=4;
    1.79 +        }
    1.80 +    }
    1.81 +    return p; // one past the last byte written (p advanced by 1, 2, 3 or 4)
    1.82 +}
    1.83 +
    1.84 +/*
    1.85 + * Encode the code points of a string as
    1.86 + * a sequence of byte-encoded differences (slope detection),
    1.87 + * preserving lexical order.
    1.88 + *
    1.89 + * Optimize the difference-taking for runs of Unicode text within
    1.90 + * small scripts:
    1.91 + *
    1.92 + * Most small scripts are allocated within aligned 128-blocks of Unicode
    1.93 + * code points. Lexical order is preserved if "prev" is always moved
    1.94 + * into the middle of such a block.
    1.95 + *
    1.96 + * Additionally, "prev" is moved from anywhere in the Unihan
    1.97 + * area into the middle of that area.
    1.98 + * Note that the identical-level run in a sort key is generated from
    1.99 + * NFD text - there are never Hangul characters included.
   1.100 + */
   1.101 +U_CFUNC void
   1.102 +u_writeIdenticalLevelRun(const UChar *s, int32_t length, icu::ByteSink &sink) { // encodes the code points of s[0..length) as differences and appends the bytes to sink
   1.103 +    char scratch[64]; // fallback buffer, used when the sink offers fewer than 16 bytes
   1.104 +    int32_t capacity; // set by GetAppendBuffer(); replaced by sizeof(scratch) on fallback
   1.105 +
   1.106 +    UChar32 prev=0; // previous code point; the run starts relative to 0
   1.107 +    int32_t i=0; // index of the next UTF-16 code unit to read from s
   1.108 +    while(i<length) { // each pass fills one buffer and appends it to the sink
   1.109 +        char *buffer=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity); // min capacity 1, hint length*2
   1.110 +        uint8_t *p;
   1.111 +        // We must have capacity>=SLOPE_MAX_BYTES in case u_writeDiff() writes that much,
   1.112 +        // but we do not want to force the sink.GetAppendBuffer() to allocate
   1.113 +        // for a large min_capacity because we might actually only write one byte.
   1.114 +        if(capacity<16) { // too little room offered: encode into scratch instead
   1.115 +            buffer=scratch;
   1.116 +            capacity=(int32_t)sizeof(scratch);
   1.117 +        }
   1.118 +        p=reinterpret_cast<uint8_t *>(buffer); // write cursor into the chosen buffer
   1.119 +        uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES; // last cursor position that still leaves SLOPE_MAX_BYTES of room
   1.120 +        while(i<length && p<=lastSafe) { // stop when input is exhausted or a full-size write might not fit
   1.121 +            if(prev<0x4e00 || prev>=0xa000) { // prev is outside U+4e00..U+9fff
   1.122 +                prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; // start of prev's aligned 128-block, shifted by SLOPE_REACH_NEG_1
   1.123 +            } else {
   1.124 +                /*
   1.125 +                 * Unihan U+4e00..U+9fa5:
   1.126 +                 * double-bytes down from the upper end
   1.127 +                 */
   1.128 +                prev=0x9fff-SLOPE_REACH_POS_2; // one fixed base value for every prev in this range
   1.129 +            }
   1.130 +
   1.131 +            UChar32 c;
   1.132 +            U16_NEXT(s, i, length, c); // reads the next code point into c and advances i
   1.133 +            p=u_writeDiff(c-prev, p); // encode the difference to the adjusted prev
   1.134 +            prev=c; // the next difference is taken relative to this code point (after adjustment)
   1.135 +        }
   1.136 +        sink.Append(buffer, (int32_t)(p-reinterpret_cast<uint8_t *>(buffer))); // append exactly the bytes written in this pass
   1.137 +    }
   1.138 +}
   1.139 +
   1.140 +U_CFUNC int32_t
   1.141 +u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) { // encodes second relative to first at p; returns the number of bytes written
   1.142 +    uint8_t *p0 = p; // remember the start so the byte count can be computed
   1.143 +    if(first<0x4e00 || first>=0xa000) { // first is outside U+4e00..U+9fff
   1.144 +        first=(first&~0x7f)-SLOPE_REACH_NEG_1; // start of first's aligned 128-block, shifted by SLOPE_REACH_NEG_1
   1.145 +    } else {
   1.146 +        /*
   1.147 +         * Unihan U+4e00..U+9fa5:
   1.148 +         * double-bytes down from the upper end
   1.149 +         */
   1.150 +        first=0x9fff-SLOPE_REACH_POS_2; // one fixed base value for every first in this range
   1.151 +    }
   1.152 +
   1.153 +    p=u_writeDiff(second-first, p); // a single difference: second minus the adjusted first
   1.154 +    return (int32_t)(p-p0); // bytes written by u_writeDiff()
   1.155 +}
   1.156 +
   1.157 +#endif /* #if !UCONFIG_NO_COLLATION */

mercurial