michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2001-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: * file name: bocsu.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * Author: Markus W. Scherer michael@0: * michael@0: * Modification history: michael@0: * 05/18/2001 weiv Made into separate module michael@0: */ michael@0: michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_COLLATION michael@0: michael@0: #include "unicode/bytestream.h" michael@0: #include "unicode/utf16.h" michael@0: #include "bocsu.h" michael@0: michael@0: /* michael@0: * encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes, michael@0: * preserving lexical order michael@0: */ michael@0: U_CFUNC uint8_t * michael@0: u_writeDiff(int32_t diff, uint8_t *p) { michael@0: if(diff>=SLOPE_REACH_NEG_1) { michael@0: if(diff<=SLOPE_REACH_POS_1) { michael@0: *p++=(uint8_t)(SLOPE_MIDDLE+diff); michael@0: } else if(diff<=SLOPE_REACH_POS_2) { michael@0: *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT)); michael@0: *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); michael@0: } else if(diff<=SLOPE_REACH_POS_3) { michael@0: p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); michael@0: diff/=SLOPE_TAIL_COUNT; michael@0: p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); michael@0: *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT)); michael@0: p+=3; michael@0: } else { michael@0: p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); michael@0: diff/=SLOPE_TAIL_COUNT; michael@0: p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); michael@0: diff/=SLOPE_TAIL_COUNT; michael@0: p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); michael@0: *p=SLOPE_MAX; michael@0: p+=4; michael@0: } michael@0: } else { michael@0: int32_t m; michael@0: michael@0: if(diff>=SLOPE_REACH_NEG_2) { michael@0: NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); michael@0: *p++=(uint8_t)(SLOPE_START_NEG_2+diff); michael@0: *p++=(uint8_t)(SLOPE_MIN+m); michael@0: } else if(diff>=SLOPE_REACH_NEG_3) { michael@0: NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); michael@0: p[2]=(uint8_t)(SLOPE_MIN+m); michael@0: NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); michael@0: p[1]=(uint8_t)(SLOPE_MIN+m); michael@0: *p=(uint8_t)(SLOPE_START_NEG_3+diff); michael@0: p+=3; michael@0: } else { michael@0: NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); michael@0: p[3]=(uint8_t)(SLOPE_MIN+m); michael@0: NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); michael@0: p[2]=(uint8_t)(SLOPE_MIN+m); michael@0: NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); michael@0: p[1]=(uint8_t)(SLOPE_MIN+m); michael@0: *p=SLOPE_MIN; michael@0: p+=4; michael@0: } michael@0: } michael@0: return p; michael@0: } michael@0: michael@0: /* michael@0: * Encode the code points of a string as michael@0: * a sequence of byte-encoded differences (slope detection), michael@0: * preserving lexical order. michael@0: * michael@0: * Optimize the difference-taking for runs of Unicode text within michael@0: * small scripts: michael@0: * michael@0: * Most small scripts are allocated within aligned 128-blocks of Unicode michael@0: * code points. Lexical order is preserved if "prev" is always moved michael@0: * into the middle of such a block. michael@0: * michael@0: * Additionally, "prev" is moved from anywhere in the Unihan michael@0: * area into the middle of that area. michael@0: * Note that the identical-level run in a sort key is generated from michael@0: * NFD text - there are never Hangul characters included. michael@0: */ michael@0: U_CFUNC void michael@0: u_writeIdenticalLevelRun(const UChar *s, int32_t length, icu::ByteSink &sink) { michael@0: char scratch[64]; michael@0: int32_t capacity; michael@0: michael@0: UChar32 prev=0; michael@0: int32_t i=0; michael@0: while(i=SLOPE_MAX_BYTES in case u_writeDiff() writes that much, michael@0: // but we do not want to force the sink.GetAppendBuffer() to allocate michael@0: // for a large min_capacity because we might actually only write one byte. michael@0: if(capacity<16) { michael@0: buffer=scratch; michael@0: capacity=(int32_t)sizeof(scratch); michael@0: } michael@0: p=reinterpret_cast(buffer); michael@0: uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES; michael@0: while(i=0xa000) { michael@0: prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; michael@0: } else { michael@0: /* michael@0: * Unihan U+4e00..U+9fa5: michael@0: * double-bytes down from the upper end michael@0: */ michael@0: prev=0x9fff-SLOPE_REACH_POS_2; michael@0: } michael@0: michael@0: UChar32 c; michael@0: U16_NEXT(s, i, length, c); michael@0: p=u_writeDiff(c-prev, p); michael@0: prev=c; michael@0: } michael@0: sink.Append(buffer, (int32_t)(p-reinterpret_cast(buffer))); michael@0: } michael@0: } michael@0: michael@0: U_CFUNC int32_t michael@0: u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) { michael@0: uint8_t *p0 = p; michael@0: if(first<0x4e00 || first>=0xa000) { michael@0: first=(first&~0x7f)-SLOPE_REACH_NEG_1; michael@0: } else { michael@0: /* michael@0: * Unihan U+4e00..U+9fa5: michael@0: * double-bytes down from the upper end michael@0: */ michael@0: first=0x9fff-SLOPE_REACH_POS_2; michael@0: } michael@0: michael@0: p=u_writeDiff(second-first, p); michael@0: return (int32_t)(p-p0); michael@0: } michael@0: michael@0: #endif /* #if !UCONFIG_NO_COLLATION */