intl/icu/source/common/utf_impl.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/utf_impl.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,326 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 1999-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************
    1.11 +*   file name:  utf_impl.c
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 1999sep13
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   This file provides implementation functions for macros in the utfXX.h
    1.20 +*   that would otherwise be too long as macros.
    1.21 +*/
    1.22 +
    1.23 +/* set import/export definitions */
    1.24 +#ifndef U_UTF8_IMPL
    1.25 +#   define U_UTF8_IMPL
    1.26 +#endif
    1.27 +
    1.28 +#include "unicode/utypes.h"
    1.29 +#include "unicode/utf.h"
    1.30 +#include "unicode/utf8.h"
    1.31 +#include "unicode/utf_old.h"
    1.32 +#include "uassert.h"
    1.33 +
    1.34 +/*
    1.35 + * This table could be replaced on many machines by
    1.36 + * a few lines of assembler code using an
    1.37 + * "index of first 0-bit from msb" instruction and
    1.38 + * one or two more integer instructions.
    1.39 + *
    1.40 + * For example, on an i386, do something like
    1.41 + * - MOV AL, leadByte
    1.42 + * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
    1.43 + * - MOV AH, 0
    1.44 + * - BSR BX, AX     (16-bit)
    1.45 + * - MOV AX, 6      (result)
    1.46 + * - JZ finish      (ZF==1 if leadByte==0xff)
    1.47 + * - SUB AX, BX (result)
    1.48 + * -finish:
    1.49 + * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
    1.50 + *
    1.51 + * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
    1.52 + * lead bytes above 0xf4 are illegal.
    1.53 + * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
    1.54 + */
    1.55 +U_EXPORT const uint8_t 
    1.56 +utf8_countTrailBytes[256]={
    1.57 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.58 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.59 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.60 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.61 +
    1.62 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.63 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.64 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.65 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.66 +
    1.67 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.68 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.69 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.70 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.71 +
    1.72 +    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1.73 +    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1.74 +
    1.75 +    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1.76 +    3, 3, 3, 3, 3,
    1.77 +    3, 3, 3,    /* illegal in Unicode */
    1.78 +    4, 4, 4, 4, /* illegal in Unicode */
    1.79 +    5, 5,       /* illegal in Unicode */
    1.80 +    0, 0        /* illegal bytes 0xfe and 0xff */
    1.81 +};
    1.82 +
    1.83 +static const UChar32
    1.84 +utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
    1.85 +
    1.86 +static const UChar32
    1.87 +utf8_errorValue[6]={
    1.88 +    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
    1.89 +    0x3ffffff, 0x7fffffff
    1.90 +};
    1.91 +
    1.92 +static UChar32
    1.93 +errorValue(int32_t count, int8_t strict) {
    1.94 +    if(strict>=0) {
    1.95 +        return utf8_errorValue[count];
    1.96 +    } else if(strict==-3) {
    1.97 +        return 0xfffd;
    1.98 +    } else {
    1.99 +        return U_SENTINEL;
   1.100 +    }
   1.101 +}
   1.102 +
   1.103 +/*
   1.104 + * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
   1.105 + * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
   1.106 + *
   1.107 + * U8_NEXT() supports NUL-terminated strings indicated via length<0.
   1.108 + *
   1.109 + * The "strict" parameter controls the error behavior:
   1.110 + * <0  "Safe" behavior of U8_NEXT():
   1.111 + *     -1: All illegal byte sequences yield U_SENTINEL=-1.
   1.112 + *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
   1.113 + *         Some implementations use this for roundtripping of
   1.114 + *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
   1.115 + *         contain unpaired surrogates.
   1.116 + *     -3: All illegal byte sequences yield U+FFFD.
   1.117 + *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
   1.118 + *     All illegal byte sequences yield a positive code point such that this
   1.119 + *     result code point would be encoded with the same number of bytes as
   1.120 + *     the illegal sequence.
   1.121 + * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
   1.122 + *     Same as the obsolete "safe" behavior, but non-characters are also treated
   1.123 + *     like illegal sequences.
   1.124 + *
   1.125 + * Note that a UBool is the same as an int8_t.
   1.126 + */
   1.127 +U_CAPI UChar32 U_EXPORT2
   1.128 +utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
   1.129 +    int32_t i=*pi;
   1.130 +    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
   1.131 +    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
   1.132 +    if(i+count<=length || length<0) {
   1.133 +        uint8_t trail;
   1.134 +
   1.135 +        U8_MASK_LEAD_BYTE(c, count);
   1.136 +        /* support NUL-terminated strings: do not read beyond the first non-trail byte */
   1.137 +        switch(count) {
   1.138 +        /* each branch falls through to the next one */
   1.139 +        case 0:
   1.140 +            /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
   1.141 +        case 5:
   1.142 +        case 4:
   1.143 +            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
   1.144 +            break;
   1.145 +        case 3:
   1.146 +            trail=s[i++]-0x80;
   1.147 +            c=(c<<6)|trail;
   1.148 +            /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
   1.149 +            if(c>=0x110 || trail>0x3f) { break; }
   1.150 +        case 2:
   1.151 +            trail=s[i++]-0x80;
   1.152 +            c=(c<<6)|trail;
   1.153 +            /*
   1.154 +             * test for a surrogate d800..dfff unless we are lenient:
   1.155 +             * before the last (c<<6), a surrogate is c=360..37f
   1.156 +             */
   1.157 +            if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
   1.158 +        case 1:
   1.159 +            trail=s[i++]-0x80;
   1.160 +            c=(c<<6)|trail;
   1.161 +            if(trail>0x3f) { break; }
   1.162 +            /* correct sequence - all trail bytes have (b7..b6)==(10) */
   1.163 +            if(c>=utf8_minLegal[count] &&
   1.164 +                    /* strict: forbid non-characters like U+fffe */
   1.165 +                    (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
   1.166 +                *pi=i;
   1.167 +                return c;
   1.168 +            }
   1.169 +        /* no default branch to optimize switch()  - all values are covered */
   1.170 +        }
   1.171 +    } else {
   1.172 +        /* too few bytes left */
   1.173 +        count=length-i;
   1.174 +    }
   1.175 +
   1.176 +    /* error handling */
   1.177 +    i=*pi;
   1.178 +    while(count>0 && U8_IS_TRAIL(s[i])) {
   1.179 +        ++i;
   1.180 +        --count;
   1.181 +    }
   1.182 +    c=errorValue(i-*pi, strict);
   1.183 +    *pi=i;
   1.184 +    return c;
   1.185 +}
   1.186 +
   1.187 +U_CAPI int32_t U_EXPORT2
   1.188 +utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
   1.189 +    if((uint32_t)(c)<=0x7ff) {
   1.190 +        if((i)+1<(length)) {
   1.191 +            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
   1.192 +            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
   1.193 +            return i;
   1.194 +        }
   1.195 +    } else if((uint32_t)(c)<=0xffff) {
   1.196 +        /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
   1.197 +        if((i)+2<(length) && !U_IS_SURROGATE(c)) {
   1.198 +            (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
   1.199 +            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
   1.200 +            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
   1.201 +            return i;
   1.202 +        }
   1.203 +    } else if((uint32_t)(c)<=0x10ffff) {
   1.204 +        if((i)+3<(length)) {
   1.205 +            (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
   1.206 +            (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
   1.207 +            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
   1.208 +            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
   1.209 +            return i;
   1.210 +        }
   1.211 +    }
   1.212 +    /* c>0x10ffff or not enough space, write an error value */
   1.213 +    if(pIsError!=NULL) {
   1.214 +        *pIsError=TRUE;
   1.215 +    } else {
   1.216 +        length-=i;
   1.217 +        if(length>0) {
   1.218 +            int32_t offset;
   1.219 +            if(length>3) {
   1.220 +                length=3;
   1.221 +            }
   1.222 +            s+=i;
   1.223 +            offset=0;
   1.224 +            c=utf8_errorValue[length-1];
   1.225 +            UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
   1.226 +            i=i+offset;
   1.227 +        }
   1.228 +    }
   1.229 +    return i;
   1.230 +}
   1.231 +
   1.232 +U_CAPI UChar32 U_EXPORT2
   1.233 +utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
   1.234 +    int32_t i=*pi;
   1.235 +    uint8_t b, count=1, shift=6;
   1.236 +
   1.237 +    if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
   1.238 +
   1.239 +    /* extract value bits from the last trail byte */
   1.240 +    c&=0x3f;
   1.241 +
   1.242 +    for(;;) {
   1.243 +        if(i<=start) {
   1.244 +            /* no lead byte at all */
   1.245 +            return errorValue(0, strict);
   1.246 +        }
   1.247 +
   1.248 +        /* read another previous byte */
   1.249 +        b=s[--i];
   1.250 +        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
   1.251 +            if(b&0x40) {
   1.252 +                /* lead byte, this will always end the loop */
   1.253 +                uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
   1.254 +
   1.255 +                if(count==shouldCount) {
   1.256 +                    /* set the new position */
   1.257 +                    *pi=i;
   1.258 +                    U8_MASK_LEAD_BYTE(b, count);
   1.259 +                    c|=(UChar32)b<<shift;
   1.260 +                    if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
   1.261 +                        /* illegal sequence or (strict and non-character) */
   1.262 +                        if(count>=4) {
   1.263 +                            count=3;
   1.264 +                        }
   1.265 +                        c=errorValue(count, strict);
   1.266 +                    } else {
   1.267 +                        /* exit with correct c */
   1.268 +                    }
   1.269 +                } else {
   1.270 +                    /* the lead byte does not match the number of trail bytes */
   1.271 +                    /* only set the position to the lead byte if it would
   1.272 +                       include the trail byte that we started with */
   1.273 +                    if(count<shouldCount) {
   1.274 +                        *pi=i;
   1.275 +                        c=errorValue(count, strict);
   1.276 +                    } else {
   1.277 +                        c=errorValue(0, strict);
   1.278 +                    }
   1.279 +                }
   1.280 +                break;
   1.281 +            } else if(count<5) {
   1.282 +                /* trail byte */
   1.283 +                c|=(UChar32)(b&0x3f)<<shift;
   1.284 +                ++count;
   1.285 +                shift+=6;
   1.286 +            } else {
   1.287 +                /* more than 5 trail bytes is illegal */
   1.288 +                c=errorValue(0, strict);
   1.289 +                break;
   1.290 +            }
   1.291 +        } else {
   1.292 +            /* single-byte character precedes trailing bytes */
   1.293 +            c=errorValue(0, strict);
   1.294 +            break;
   1.295 +        }
   1.296 +    }
   1.297 +    return c;
   1.298 +}
   1.299 +
   1.300 +U_CAPI int32_t U_EXPORT2
   1.301 +utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
   1.302 +    /* i had been decremented once before the function call */
   1.303 +    int32_t I=i, Z;
   1.304 +    uint8_t b;
   1.305 +
   1.306 +    /* read at most the 6 bytes s[Z] to s[i], inclusively */
   1.307 +    if(I-5>start) {
   1.308 +        Z=I-5;
   1.309 +    } else {
   1.310 +        Z=start;
   1.311 +    }
   1.312 +
   1.313 +    /* return I if the sequence starting there is long enough to include i */
   1.314 +    do {
   1.315 +        b=s[I];
   1.316 +        if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
   1.317 +            break;
   1.318 +        } else if(b>=0xc0) {
   1.319 +            if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
   1.320 +                return I;
   1.321 +            } else {
   1.322 +                break;
   1.323 +            }
   1.324 +        }
   1.325 +    } while(Z<=--I);
   1.326 +
   1.327 +    /* return i itself to be consistent with the FWD_1 macro */
   1.328 +    return i;
   1.329 +}

mercurial