|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2001-2011, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: bocsu.cpp |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * Author: Markus W. Scherer |
|
12 * |
|
13 * Modification history: |
|
14 * 05/18/2001 weiv Made into separate module |
|
15 */ |
|
16 |
|
17 |
|
18 #include "unicode/utypes.h" |
|
19 |
|
20 #if !UCONFIG_NO_COLLATION |
|
21 |
|
22 #include "unicode/bytestream.h" |
|
23 #include "unicode/utf16.h" |
|
24 #include "bocsu.h" |
|
25 |
|
/*
 * encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order
 */
|
30 U_CFUNC uint8_t * |
|
31 u_writeDiff(int32_t diff, uint8_t *p) { |
|
32 if(diff>=SLOPE_REACH_NEG_1) { |
|
33 if(diff<=SLOPE_REACH_POS_1) { |
|
34 *p++=(uint8_t)(SLOPE_MIDDLE+diff); |
|
35 } else if(diff<=SLOPE_REACH_POS_2) { |
|
36 *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT)); |
|
37 *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
|
38 } else if(diff<=SLOPE_REACH_POS_3) { |
|
39 p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
|
40 diff/=SLOPE_TAIL_COUNT; |
|
41 p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
|
42 *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT)); |
|
43 p+=3; |
|
44 } else { |
|
45 p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
|
46 diff/=SLOPE_TAIL_COUNT; |
|
47 p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
|
48 diff/=SLOPE_TAIL_COUNT; |
|
49 p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
|
50 *p=SLOPE_MAX; |
|
51 p+=4; |
|
52 } |
|
53 } else { |
|
54 int32_t m; |
|
55 |
|
56 if(diff>=SLOPE_REACH_NEG_2) { |
|
57 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
|
58 *p++=(uint8_t)(SLOPE_START_NEG_2+diff); |
|
59 *p++=(uint8_t)(SLOPE_MIN+m); |
|
60 } else if(diff>=SLOPE_REACH_NEG_3) { |
|
61 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
|
62 p[2]=(uint8_t)(SLOPE_MIN+m); |
|
63 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
|
64 p[1]=(uint8_t)(SLOPE_MIN+m); |
|
65 *p=(uint8_t)(SLOPE_START_NEG_3+diff); |
|
66 p+=3; |
|
67 } else { |
|
68 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
|
69 p[3]=(uint8_t)(SLOPE_MIN+m); |
|
70 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
|
71 p[2]=(uint8_t)(SLOPE_MIN+m); |
|
72 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
|
73 p[1]=(uint8_t)(SLOPE_MIN+m); |
|
74 *p=SLOPE_MIN; |
|
75 p+=4; |
|
76 } |
|
77 } |
|
78 return p; |
|
79 } |
|
80 |
|
81 /* |
|
82 * Encode the code points of a string as |
|
83 * a sequence of byte-encoded differences (slope detection), |
|
84 * preserving lexical order. |
|
85 * |
|
86 * Optimize the difference-taking for runs of Unicode text within |
|
87 * small scripts: |
|
88 * |
|
89 * Most small scripts are allocated within aligned 128-blocks of Unicode |
|
90 * code points. Lexical order is preserved if "prev" is always moved |
|
91 * into the middle of such a block. |
|
92 * |
|
93 * Additionally, "prev" is moved from anywhere in the Unihan |
|
94 * area into the middle of that area. |
|
95 * Note that the identical-level run in a sort key is generated from |
|
96 * NFD text - there are never Hangul characters included. |
|
97 */ |
|
98 U_CFUNC void |
|
99 u_writeIdenticalLevelRun(const UChar *s, int32_t length, icu::ByteSink &sink) { |
|
100 char scratch[64]; |
|
101 int32_t capacity; |
|
102 |
|
103 UChar32 prev=0; |
|
104 int32_t i=0; |
|
105 while(i<length) { |
|
106 char *buffer=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity); |
|
107 uint8_t *p; |
|
108 // We must have capacity>=SLOPE_MAX_BYTES in case u_writeDiff() writes that much, |
|
109 // but we do not want to force the sink.GetAppendBuffer() to allocate |
|
110 // for a large min_capacity because we might actually only write one byte. |
|
111 if(capacity<16) { |
|
112 buffer=scratch; |
|
113 capacity=(int32_t)sizeof(scratch); |
|
114 } |
|
115 p=reinterpret_cast<uint8_t *>(buffer); |
|
116 uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES; |
|
117 while(i<length && p<=lastSafe) { |
|
118 if(prev<0x4e00 || prev>=0xa000) { |
|
119 prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; |
|
120 } else { |
|
121 /* |
|
122 * Unihan U+4e00..U+9fa5: |
|
123 * double-bytes down from the upper end |
|
124 */ |
|
125 prev=0x9fff-SLOPE_REACH_POS_2; |
|
126 } |
|
127 |
|
128 UChar32 c; |
|
129 U16_NEXT(s, i, length, c); |
|
130 p=u_writeDiff(c-prev, p); |
|
131 prev=c; |
|
132 } |
|
133 sink.Append(buffer, (int32_t)(p-reinterpret_cast<uint8_t *>(buffer))); |
|
134 } |
|
135 } |
|
136 |
|
137 U_CFUNC int32_t |
|
138 u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) { |
|
139 uint8_t *p0 = p; |
|
140 if(first<0x4e00 || first>=0xa000) { |
|
141 first=(first&~0x7f)-SLOPE_REACH_NEG_1; |
|
142 } else { |
|
143 /* |
|
144 * Unihan U+4e00..U+9fa5: |
|
145 * double-bytes down from the upper end |
|
146 */ |
|
147 first=0x9fff-SLOPE_REACH_POS_2; |
|
148 } |
|
149 |
|
150 p=u_writeDiff(second-first, p); |
|
151 return (int32_t)(p-p0); |
|
152 } |
|
153 |
|
154 #endif /* #if !UCONFIG_NO_COLLATION */ |