1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/bocsu.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,154 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2001-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: bocsu.cpp 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* Author: Markus W. Scherer 1.15 +* 1.16 +* Modification history: 1.17 +* 05/18/2001 weiv Made into separate module 1.18 +*/ 1.19 + 1.20 + 1.21 +#include "unicode/utypes.h" 1.22 + 1.23 +#if !UCONFIG_NO_COLLATION 1.24 + 1.25 +#include "unicode/bytestream.h" 1.26 +#include "unicode/utf16.h" 1.27 +#include "bocsu.h" 1.28 + 1.29 +/* 1.30 + * encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes, 1.31 + * preserving lexical order 1.32 + */ 1.33 +U_CFUNC uint8_t * 1.34 +u_writeDiff(int32_t diff, uint8_t *p) { 1.35 + if(diff>=SLOPE_REACH_NEG_1) { 1.36 + if(diff<=SLOPE_REACH_POS_1) { 1.37 + *p++=(uint8_t)(SLOPE_MIDDLE+diff); 1.38 + } else if(diff<=SLOPE_REACH_POS_2) { 1.39 + *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT)); 1.40 + *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 1.41 + } else if(diff<=SLOPE_REACH_POS_3) { 1.42 + p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 1.43 + diff/=SLOPE_TAIL_COUNT; 1.44 + p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 1.45 + *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT)); 1.46 + p+=3; 1.47 + } else { 1.48 + p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 1.49 + diff/=SLOPE_TAIL_COUNT; 1.50 + p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 1.51 + diff/=SLOPE_TAIL_COUNT; 1.52 + p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 1.53 + *p=SLOPE_MAX; 1.54 + p+=4; 1.55 + } 1.56 + } else { 1.57 + int32_t m; 1.58 + 1.59 + if(diff>=SLOPE_REACH_NEG_2) { 1.60 + NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 1.61 + *p++=(uint8_t)(SLOPE_START_NEG_2+diff); 1.62 + *p++=(uint8_t)(SLOPE_MIN+m); 1.63 + } else if(diff>=SLOPE_REACH_NEG_3) { 1.64 + NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 1.65 + p[2]=(uint8_t)(SLOPE_MIN+m); 1.66 + NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 1.67 + p[1]=(uint8_t)(SLOPE_MIN+m); 1.68 + *p=(uint8_t)(SLOPE_START_NEG_3+diff); 1.69 + p+=3; 1.70 + } else { 1.71 + NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 1.72 + p[3]=(uint8_t)(SLOPE_MIN+m); 1.73 + NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 1.74 + p[2]=(uint8_t)(SLOPE_MIN+m); 1.75 + NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 1.76 + p[1]=(uint8_t)(SLOPE_MIN+m); 1.77 + *p=SLOPE_MIN; 1.78 + p+=4; 1.79 + } 1.80 + } 1.81 + return p; 1.82 +} 1.83 + 1.84 +/* 1.85 + * Encode the code points of a string as 1.86 + * a sequence of byte-encoded differences (slope detection), 1.87 + * preserving lexical order. 1.88 + * 1.89 + * Optimize the difference-taking for runs of Unicode text within 1.90 + * small scripts: 1.91 + * 1.92 + * Most small scripts are allocated within aligned 128-blocks of Unicode 1.93 + * code points. Lexical order is preserved if "prev" is always moved 1.94 + * into the middle of such a block. 1.95 + * 1.96 + * Additionally, "prev" is moved from anywhere in the Unihan 1.97 + * area into the middle of that area. 1.98 + * Note that the identical-level run in a sort key is generated from 1.99 + * NFD text - there are never Hangul characters included. 1.100 + */ 1.101 +U_CFUNC void 1.102 +u_writeIdenticalLevelRun(const UChar *s, int32_t length, icu::ByteSink &sink) { 1.103 + char scratch[64]; 1.104 + int32_t capacity; 1.105 + 1.106 + UChar32 prev=0; 1.107 + int32_t i=0; 1.108 + while(i<length) { 1.109 + char *buffer=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity); 1.110 + uint8_t *p; 1.111 + // We must have capacity>=SLOPE_MAX_BYTES in case u_writeDiff() writes that much, 1.112 + // but we do not want to force the sink.GetAppendBuffer() to allocate 1.113 + // for a large min_capacity because we might actually only write one byte. 1.114 + if(capacity<16) { 1.115 + buffer=scratch; 1.116 + capacity=(int32_t)sizeof(scratch); 1.117 + } 1.118 + p=reinterpret_cast<uint8_t *>(buffer); 1.119 + uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES; 1.120 + while(i<length && p<=lastSafe) { 1.121 + if(prev<0x4e00 || prev>=0xa000) { 1.122 + prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; 1.123 + } else { 1.124 + /* 1.125 + * Unihan U+4e00..U+9fa5: 1.126 + * double-bytes down from the upper end 1.127 + */ 1.128 + prev=0x9fff-SLOPE_REACH_POS_2; 1.129 + } 1.130 + 1.131 + UChar32 c; 1.132 + U16_NEXT(s, i, length, c); 1.133 + p=u_writeDiff(c-prev, p); 1.134 + prev=c; 1.135 + } 1.136 + sink.Append(buffer, (int32_t)(p-reinterpret_cast<uint8_t *>(buffer))); 1.137 + } 1.138 +} 1.139 + 1.140 +U_CFUNC int32_t 1.141 +u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) { 1.142 + uint8_t *p0 = p; 1.143 + if(first<0x4e00 || first>=0xa000) { 1.144 + first=(first&~0x7f)-SLOPE_REACH_NEG_1; 1.145 + } else { 1.146 + /* 1.147 + * Unihan U+4e00..U+9fa5: 1.148 + * double-bytes down from the upper end 1.149 + */ 1.150 + first=0x9fff-SLOPE_REACH_POS_2; 1.151 + } 1.152 + 1.153 + p=u_writeDiff(second-first, p); 1.154 + return (int32_t)(p-p0); 1.155 +} 1.156 + 1.157 +#endif /* #if !UCONFIG_NO_COLLATION */