intl/icu/source/common/bmpset.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2007, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 * file name: bmpset.h
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2007jan29
michael@0 14 * created by: Markus W. Scherer
michael@0 15 */
michael@0 16
michael@0 17 #ifndef __BMPSET_H__
michael@0 18 #define __BMPSET_H__
michael@0 19
michael@0 20 #include "unicode/utypes.h"
michael@0 21 #include "unicode/uniset.h"
michael@0 22
michael@0 23 U_NAMESPACE_BEGIN
michael@0 24
michael@0 25 /*
michael@0 26 * Helper class for frozen UnicodeSets, implements contains() and span()
michael@0 27 * optimized for BMP code points. Structured to be UTF-8-friendly.
michael@0 28 *
michael@0 29 * ASCII: Look up bytes.
michael@0 30 * 2-byte characters: Bits organized vertically.
michael@0 31 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
michael@0 32 * with mixed for illegal ranges.
michael@0 33 * Supplementary characters: Call contains() on the parent set.
michael@0 34 */
michael@0 35 class BMPSet : public UMemory {
michael@0 36 public:
michael@0 37 BMPSet(const int32_t *parentList, int32_t parentListLength); // NOTE(review): presumably parentList/parentListLength are the parent set's inversion list kept in list/listLength -- confirm in bmpset.cpp.
michael@0 38 BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength); // NOTE(review): presumably copies otherBMPSet's tables but refers to the new list -- confirm in bmpset.cpp.
michael@0 39 virtual ~BMPSet();
michael@0 40
michael@0 41 virtual UBool contains(UChar32 c) const; // Membership test for a single code point c.
michael@0 42
michael@0 43 /*
michael@0 44 * Span the initial substring for which each character c has spanCondition==contains(c).
michael@0 45 * Preconditions: s<limit, and spanCondition is 0 or 1.
michael@0 46 * @return The string pointer which limits the span.
michael@0 47 */
michael@0 48 const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
michael@0 49
michael@0 50 /*
michael@0 51 * Span the trailing substring for which each character c has spanCondition==contains(c).
michael@0 52 * Preconditions: s<limit, and spanCondition is 0 or 1.
michael@0 53 * @return The string pointer which starts the span.
michael@0 54 */
michael@0 55 const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
michael@0 56
michael@0 57 /*
michael@0 58 * UTF-8 variant of span(): span the initial substring for which each character c has spanCondition==contains(c).
michael@0 59 * Preconditions: length>0, and spanCondition is 0 or 1.
michael@0 60 * @return The string pointer which limits the span.
michael@0 61 */
michael@0 62 const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
michael@0 63
michael@0 64 /*
michael@0 65 * UTF-8 variant of spanBack(): span the trailing substring for which each character c has spanCondition==contains(c).
michael@0 66 * Preconditions: length>0, and spanCondition is 0 or 1.
michael@0 67 * @return The start of the span, as an int32_t rather than a pointer (presumably an offset from s -- confirm in bmpset.cpp).
michael@0 68 */
michael@0 69 int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
michael@0 70
michael@0 71 private:
michael@0 72 void initBits(); // NOTE(review): presumably fills the lookup tables declared below from list -- confirm in bmpset.cpp.
michael@0 73 void overrideIllegal(); // NOTE(review): presumably applies the contains(FFFD) overrides described in the table comments below -- confirm in bmpset.cpp.
michael@0 74
michael@0 75 /**
michael@0 76 * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
michael@0 77 * binary search is restricted for finding code points in a certain range.
michael@0 78 *
michael@0 79 * To restrict the search to code points in the range start..end,
michael@0 80 * pass in
michael@0 81 * lo=findCodePoint(start) and
michael@0 82 * hi=findCodePoint(end)
michael@0 83 * with 0<=lo<=hi<len.
michael@0 84 * findCodePoint(c) defaults to lo=0 and hi=len-1.
michael@0 85 *
michael@0 86 * @param c a character in a subrange of MIN_VALUE..MAX_VALUE
michael@0 87 * @param lo The lowest index to be returned.
michael@0 88 * @param hi The highest index to be returned.
michael@0 89 * @return the smallest integer i in the range lo..hi,
michael@0 90 * inclusive, such that c < list[i]
michael@0 91 */
michael@0 92 int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;
michael@0 93
michael@0 94 inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const; // Defined inline after the class: low bit of findCodePoint(c, lo, hi).
michael@0 95
michael@0 96 /*
michael@0 97 * One byte per ASCII character, or trail byte in lead position (0xc0 entries, i.e. indexes 0x00..0xBF).
michael@0 98 * 0 or 1 for ASCII characters.
michael@0 99 * The value for trail bytes is the result of contains(FFFD)
michael@0 100 * for faster validity checking at runtime.
michael@0 101 */
michael@0 102 UBool asciiBytes[0xc0];
michael@0 103
michael@0 104 /*
michael@0 105 * One bit per code point from U+0000..U+07FF (64 words of 32 bits = 2048 bits).
michael@0 106 * The bits are organized vertically; consecutive code points
michael@0 107 * correspond to the same bit positions in consecutive table words.
michael@0 108 * With code point parts
michael@0 109 * lead=c{10..6} (5 bits, selects the bit within a word)
michael@0 110 * trail=c{5..0} (6 bits, selects the table word)
michael@0 111 * it is set.contains(c)==(table7FF[trail] bit lead)
michael@0 112 *
michael@0 113 * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
michael@0 114 * for faster validity checking at runtime.
michael@0 115 */
michael@0 116 uint32_t table7FF[64];
michael@0 117
michael@0 118 /*
michael@0 119 * Two bits per block of 64 BMP code points: a "mixed" flag (upper bit) and an all-or-nothing "contains" flag (lower bit).
michael@0 120 * The bits are organized vertically; consecutive 64-code point blocks
michael@0 121 * correspond to the same bit position in consecutive table words.
michael@0 122 * With code point parts
michael@0 123 * lead=c{15..12} (4 bits, so the lower bits occupy 0..15 and the upper bits 16..31)
michael@0 124 * t1=c{11..6} (6 bits, selects the table word)
michael@0 125 * test bits (lead+16) and lead in bmpBlockBits[t1].
michael@0 126 * If the upper bit is 0, then the lower bit indicates if contains(c)
michael@0 127 * for all code points in the 64-block.
michael@0 128 * If the upper bit is 1, then the block is mixed and set.contains(c)
michael@0 129 * must be called.
michael@0 130 *
michael@0 131 * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
michael@0 132 * the result of contains(FFFD) for faster validity checking at runtime.
michael@0 133 */
michael@0 134 uint32_t bmpBlockBits[64];
michael@0 135
michael@0 136 /*
michael@0 137 * Inversion list indexes for restricted binary searches in
michael@0 138 * findCodePoint(), from
michael@0 139 * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
michael@0 140 * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
michael@0 141 * always looked up in the bit tables.
michael@0 142 * The last pair of indexes is for finding supplementary code points. NOTE(review): the boundaries listed above number 17 but the array has 18 slots; presumably the final slot holds the end-of-list index -- confirm in bmpset.cpp.
michael@0 143 */
michael@0 144 int32_t list4kStarts[18];
michael@0 145
michael@0 146 /*
michael@0 147 * The inversion list of the parent set, for the slower contains() implementation
michael@0 148 * for mixed BMP blocks and for supplementary code points.
michael@0 149 * The list is terminated with list[listLength-1]=0x110000. Ownership of the pointed-to array is not visible from this header.
michael@0 150 */
michael@0 151 const int32_t *list;
michael@0 152 int32_t listLength;
michael@0 153 };
michael@0 154
michael@0 155 inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const { // Membership test via findCodePoint(), the binary search restricted to indexes lo..hi.
michael@0 156 return (UBool)(findCodePoint(c, lo, hi) & 1); // Keeps only the low bit of the returned index: an odd index yields 1, an even index yields 0.
michael@0 157 }
michael@0 158
michael@0 159 U_NAMESPACE_END
michael@0 160
michael@0 161 #endif

mercurial