intl/icu/source/i18n/bocsu.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/bocsu.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,165 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*   Copyright (C) 2001-2011, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +*******************************************************************************
     1.9 +*   file name:  bocsu.c
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   Author: Markus W. Scherer
    1.15 +*
    1.16 +*   Modification history:
    1.17 +*   05/18/2001  weiv    Made into separate module
    1.18 +*/
    1.19 +
    1.20 +#ifndef BOCSU_H
    1.21 +#define BOCSU_H
    1.22 +
    1.23 +#include "unicode/utypes.h"
    1.24 +
    1.25 +#if !UCONFIG_NO_COLLATION
    1.26 +
    1.27 +U_NAMESPACE_BEGIN
    1.28 +
    1.29 +class ByteSink;
    1.30 +
    1.31 +U_NAMESPACE_END
    1.32 +
    1.33 +/*
    1.34 + * "BOCSU"
    1.35 + * Binary Ordered Compression Scheme for Unicode
    1.36 + *
    1.37 + * Specific application:
    1.38 + *
    1.39 + * Encode a Unicode string for the identical level of a sort key.
    1.40 + * Restrictions:
    1.41 + * - byte stream (unsigned 8-bit bytes)
    1.42 + * - lexical order of the identical-level run must be
    1.43 + *   the same as code point order for the string
    1.44 + * - avoid byte values 0, 1, 2
    1.45 + *
    1.46 + * Method: Slope Detection
    1.47 + * Remember the previous code point (initial 0).
    1.48 + * For each cp in the string, encode the difference to the previous one.
    1.49 + *
    1.50 + * With a compact encoding of differences, this yields good results for
    1.51 + * small scripts and UTF-like results otherwise.
    1.52 + *
    1.53 + * Encoding of differences:
    1.54 + * - Similar to a UTF, encoding the length of the byte sequence in the lead bytes.
    1.55 + * - Does not need to be friendly for decoding or random access
    1.56 + *   (trail byte values may overlap with lead/single byte values).
    1.57 + * - The signedness must be encoded as the most significant part.
    1.58 + *
    1.59 + * We encode differences with few bytes if their absolute values are small.
    1.60 + * For correct ordering, we must treat the entire value range -10ffff..+10ffff
    1.61 + * in ascending order, which forbids encoding the sign and the absolute value separately.
    1.62 + * Instead, we split the lead byte range in the middle and encode non-negative values
    1.63 + * going up and negative values going down.
    1.64 + *
    1.65 + * For very small absolute values, the difference is added to a middle byte value
    1.66 + * for single-byte encoded differences.
    1.67 + * For somewhat larger absolute values, the difference is divided by the number
    1.68 + * of byte values available, the modulo is used for one trail byte, and the remainder
    1.69 + * is added to a lead byte avoiding the single-byte range.
    1.70 + * For large absolute values, the difference is similarly encoded in three bytes.
    1.71 + *
    1.72 + * This encoding does not use byte values 0, 1, 2, but uses all other byte values
    1.73 + * for lead/single bytes so that the middle range of single bytes is as large
    1.74 + * as possible.
    1.75 + * Note that the lead byte ranges overlap some, but that the sequences as a whole
    1.76 + * are well ordered. I.e., even if the lead byte is the same for sequences of different
    1.77 + * lengths, the trail bytes establish correct order.
    1.78 + * It would be possible to encode slightly larger ranges for each length (>1) by
    1.79 + * subtracting the lower bound of the range. However, that would also slow down the
    1.80 + * calculation.
    1.81 + *
    1.82 + * For the actual string encoding, an optimization moves the previous code point value
    1.83 + * to the middle of its Unicode script block to minimize the differences in
    1.84 + * same-script text runs.
    1.85 + */
    1.86 +
    1.87 +/* Do not use byte values 0, 1, 2 because they are separators in sort keys. */
    1.88 +#define SLOPE_MIN           3
    1.89 +#define SLOPE_MAX           0xff
    1.90 +#define SLOPE_MIDDLE        0x81
    1.91 +
    1.92 +#define SLOPE_TAIL_COUNT    (SLOPE_MAX-SLOPE_MIN+1)
    1.93 +
    1.94 +#define SLOPE_MAX_BYTES     4
    1.95 +
    1.96 +/*
    1.97 + * Number of lead bytes:
    1.98 + * 1        middle byte for 0
    1.99 + * 2*80=160 single bytes for !=0
   1.100 + * 2*42=84  for double-byte values
   1.101 + * 2*3=6    for 3-byte values
   1.102 + * 2*1=2    for 4-byte values
   1.103 + *
   1.104 + * The sum must be <=SLOPE_TAIL_COUNT.
   1.105 + *
   1.106 + * Why these numbers?
   1.107 + * - There should be >=128 single-byte values to cover 128-blocks
   1.108 + *   with small scripts.
   1.109 + * - There should be >=20902 single/double-byte values to cover Unihan.
   1.110 + * - It helps CJK Extension B some if there are 3-byte values that cover
   1.111 + *   the distance between them and Unihan.
   1.112 + *   This also helps to jump among distant places in the BMP.
   1.113 + * - Four-byte values are necessary to cover the rest of Unicode.
   1.114 + *
   1.115 + * Symmetrical lead byte counts are for convenience.
   1.116 + * With an equal distribution of even and odd differences there is also
   1.117 + * no advantage to asymmetrical lead byte counts.
   1.118 + */
   1.119 +#define SLOPE_SINGLE        80
   1.120 +#define SLOPE_LEAD_2        42
   1.121 +#define SLOPE_LEAD_3        3
   1.122 +#define SLOPE_LEAD_4        1
   1.123 +
   1.124 +/* The difference value range for single-byters. */
   1.125 +#define SLOPE_REACH_POS_1   SLOPE_SINGLE
   1.126 +#define SLOPE_REACH_NEG_1   (-SLOPE_SINGLE)
   1.127 +
   1.128 +/* The difference value range for double-byters. */
   1.129 +#define SLOPE_REACH_POS_2   (SLOPE_LEAD_2*SLOPE_TAIL_COUNT+(SLOPE_LEAD_2-1))
   1.130 +#define SLOPE_REACH_NEG_2   (-SLOPE_REACH_POS_2-1)
   1.131 +
   1.132 +/* The difference value range for 3-byters. */
   1.133 +#define SLOPE_REACH_POS_3   (SLOPE_LEAD_3*SLOPE_TAIL_COUNT*SLOPE_TAIL_COUNT+(SLOPE_LEAD_3-1)*SLOPE_TAIL_COUNT+(SLOPE_TAIL_COUNT-1))
   1.134 +#define SLOPE_REACH_NEG_3   (-SLOPE_REACH_POS_3-1)
   1.135 +
   1.136 +/* The lead byte start values. */
   1.137 +#define SLOPE_START_POS_2   (SLOPE_MIDDLE+SLOPE_SINGLE+1)
   1.138 +#define SLOPE_START_POS_3   (SLOPE_START_POS_2+SLOPE_LEAD_2)
   1.139 +
   1.140 +#define SLOPE_START_NEG_2   (SLOPE_MIDDLE+SLOPE_REACH_NEG_1)
   1.141 +#define SLOPE_START_NEG_3   (SLOPE_START_NEG_2-SLOPE_LEAD_2)
   1.142 +
   1.143 +/*
   1.144 + * Integer division and modulo with negative numerators
   1.145 + * yields negative modulo results and quotients that are one more than
   1.146 + * what we need here.
   1.147 + */
   1.148 +#define NEGDIVMOD(n, d, m) { \
   1.149 +    (m)=(n)%(d); \
   1.150 +    (n)/=(d); \
   1.151 +    if((m)<0) { \
   1.152 +        --(n); \
   1.153 +        (m)+=(d); \
   1.154 +    } \
   1.155 +}
   1.156 +
   1.157 +U_CFUNC void
   1.158 +u_writeIdenticalLevelRun(const UChar *s, int32_t length, icu::ByteSink &sink);
   1.159 +
   1.160 +U_CFUNC int32_t
   1.161 +u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p);
   1.162 +
   1.163 +U_CFUNC uint8_t *
   1.164 +u_writeDiff(int32_t diff, uint8_t *p);
   1.165 +
   1.166 +#endif /* #if !UCONFIG_NO_COLLATION */
   1.167 +
   1.168 +#endif

mercurial