intl/icu/source/common/bmpset.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2007, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  bmpset.h
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2007jan29
    14 *   created by: Markus W. Scherer
    15 */
    17 #ifndef __BMPSET_H__
    18 #define __BMPSET_H__
    20 #include "unicode/utypes.h"
    21 #include "unicode/uniset.h"
    23 U_NAMESPACE_BEGIN
    25 /*
    26  * Helper class for frozen UnicodeSets, implements contains() and span()
    27  * optimized for BMP code points. Structured to be UTF-8-friendly.
    28  *
    29  * ASCII: Look up bytes.
    30  * 2-byte characters: Bits organized vertically.
    31  * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
    32  *                    with mixed for illegal ranges.
    33  * Supplementary characters: Call contains() on the parent set.
    34  */
    35 class BMPSet : public UMemory {
    36 public:
    37     BMPSet(const int32_t *parentList, int32_t parentListLength);
    38     BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);
    39     virtual ~BMPSet();
    41     virtual UBool contains(UChar32 c) const;
    43     /*
    44      * Span the initial substring for which each character c has spanCondition==contains(c).
    45      * It must be s<limit and spanCondition==0 or 1.
    46      * @return The string pointer which limits the span.
    47      */
    48     const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
    50     /*
    51      * Span the trailing substring for which each character c has spanCondition==contains(c).
    52      * It must be s<limit and spanCondition==0 or 1.
    53      * @return The string pointer which starts the span.
    54      */
    55     const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
    57     /*
    58      * Span the initial substring for which each character c has spanCondition==contains(c).
    59      * It must be length>0 and spanCondition==0 or 1.
    60      * @return The string pointer which limits the span.
    61      */
    62     const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
    64     /*
    65      * Span the trailing substring for which each character c has spanCondition==contains(c).
    66      * It must be length>0 and spanCondition==0 or 1.
    67      * @return The start of the span.
    68      */
    69     int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
    71 private:
    72     void initBits();
    73     void overrideIllegal();
    75     /**
    76      * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
    77      * binary search is restricted for finding code points in a certain range.
    78      *
    79      * For restricting the search for finding in the range start..end,
    80      * pass in
    81      *   lo=findCodePoint(start) and
    82      *   hi=findCodePoint(end)
    83      * with 0<=lo<=hi<len.
    84      * findCodePoint(c) defaults to lo=0 and hi=len-1.
    85      *
    86      * @param c a character in a subrange of MIN_VALUE..MAX_VALUE
    87      * @param lo The lowest index to be returned.
    88      * @param hi The highest index to be returned.
    89      * @return the smallest integer i in the range lo..hi,
    90      *         inclusive, such that c < list[i]
    91      */
    92     int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;
    94     inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
    96     /*
    97      * One byte per ASCII character, or trail byte in lead position.
    98      * 0 or 1 for ASCII characters.
    99      * The value for trail bytes is the result of contains(FFFD)
   100      * for faster validity checking at runtime.
   101      */
   102     UBool asciiBytes[0xc0];
   104     /*
   105      * One bit per code point from U+0000..U+07FF.
   106      * The bits are organized vertically; consecutive code points
   107      * correspond to the same bit positions in consecutive table words.
   108      * With code point parts
   109      *   lead=c{10..6}
   110      *   trail=c{5..0}
   111      * it is set.contains(c)==(table7FF[trail] bit lead)
   112      *
   113      * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
   114      * for faster validity checking at runtime.
   115      */
   116     uint32_t table7FF[64];
   118     /*
   119      * One bit per 64 BMP code points.
   120      * The bits are organized vertically; consecutive 64-code point blocks
   121      * correspond to the same bit position in consecutive table words.
   122      * With code point parts
   123      *   lead=c{15..12}
   124      *   t1=c{11..6}
   125      * test bits (lead+16) and lead in bmpBlockBits[t1].
   126      * If the upper bit is 0, then the lower bit indicates if contains(c)
   127      * for all code points in the 64-block.
   128      * If the upper bit is 1, then the block is mixed and set.contains(c)
   129      * must be called.
   130      *
   131      * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
   132      * the result of contains(FFFD) for faster validity checking at runtime.
   133      */
   134     uint32_t bmpBlockBits[64];
   136     /*
   137      * Inversion list indexes for restricted binary searches in
   138      * findCodePoint(), from
   139      * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
   140      * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
   141      * always looked up in the bit tables.
   142      * The last pair of indexes is for finding supplementary code points.
   143      */
   144     int32_t list4kStarts[18];
   146     /*
   147      * The inversion list of the parent set, for the slower contains() implementation
   148      * for mixed BMP blocks and for supplementary code points.
   149      * The list is terminated with list[listLength-1]=0x110000.
   150      */
   151     const int32_t *list;
   152     int32_t listLength;
   153 };
   155 inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const {
   156     return (UBool)(findCodePoint(c, lo, hi) & 1);
   157 }
   159 U_NAMESPACE_END
   161 #endif

mercurial