Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | ****************************************************************************** |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2007, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ****************************************************************************** |
michael@0 | 8 | * file name: bmpset.h |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2007jan29 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #ifndef __BMPSET_H__ |
michael@0 | 18 | #define __BMPSET_H__ |
michael@0 | 19 | |
michael@0 | 20 | #include "unicode/utypes.h" |
michael@0 | 21 | #include "unicode/uniset.h" |
michael@0 | 22 | |
michael@0 | 23 | U_NAMESPACE_BEGIN |
michael@0 | 24 | |
michael@0 | 25 | /* |
michael@0 | 26 | * Helper class for frozen UnicodeSets: implements contains() and the span functions, |
michael@0 | 27 | * optimized for BMP code points. The tables are laid out to be UTF-8-friendly. |
michael@0 | 28 | * |
michael@0 | 29 | * ASCII: Look up one byte per character (asciiBytes). |
michael@0 | 30 | * 2-byte UTF-8 characters (up to U+07FF): Bits organized vertically (table7FF). |
michael@0 | 31 | * 3-byte UTF-8 characters: Use zero/one/mixed data per 64-code point block in U+0000..U+FFFF |
michael@0 | 32 | * (bmpBlockBits), with mixed for illegal ranges. |
michael@0 | 33 | * Supplementary characters: Use the slower lookup in the parent set's inversion list (list). |
michael@0 | 34 | */ |
michael@0 | 35 | class BMPSet : public UMemory { |
michael@0 | 36 | public: |
michael@0 | 37 | BMPSet(const int32_t *parentList, int32_t parentListLength); /* Takes the parent set's inversion list and its length. */ |
michael@0 | 38 | BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength); /* Builds from otherBMPSet and a new parent inversion list (presumably for cloning the parent set -- confirm). */ |
michael@0 | 39 | virtual ~BMPSet(); |
michael@0 | 40 | |
michael@0 | 41 | virtual UBool contains(UChar32 c) const; /* Whether the set contains the code point c. */ |
michael@0 | 42 | |
michael@0 | 43 | /* |
michael@0 | 44 | * Span the initial substring of s..limit for which each character c has spanCondition==contains(c). |
michael@0 | 45 | * Preconditions: s<limit and spanCondition==0 or 1. |
michael@0 | 46 | * @return The limit (exclusive end) pointer of the span. |
michael@0 | 47 | */ |
michael@0 | 48 | const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const; |
michael@0 | 49 | |
michael@0 | 50 | /* |
michael@0 | 51 | * Span the trailing substring of s..limit for which each character c has spanCondition==contains(c). |
michael@0 | 52 | * Preconditions: s<limit and spanCondition==0 or 1. |
michael@0 | 53 | * @return The pointer to the start of the span. |
michael@0 | 54 | */ |
michael@0 | 55 | const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const; |
michael@0 | 56 | |
michael@0 | 57 | /* |
michael@0 | 58 | * UTF-8 version of span(): span the initial substring for which each character c has spanCondition==contains(c). |
michael@0 | 59 | * Preconditions: length>0 and spanCondition==0 or 1. |
michael@0 | 60 | * @return The limit (exclusive end) pointer of the span. |
michael@0 | 61 | */ |
michael@0 | 62 | const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const; |
michael@0 | 63 | |
michael@0 | 64 | /* |
michael@0 | 65 | * UTF-8 version of spanBack(): span the trailing substring for which each character c has spanCondition==contains(c). |
michael@0 | 66 | * Preconditions: length>0 and spanCondition==0 or 1. |
michael@0 | 67 | * @return The start of the span as an int32_t, not a pointer (presumably a byte offset from s -- confirm). |
michael@0 | 68 | */ |
michael@0 | 69 | int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const; |
michael@0 | 70 | |
michael@0 | 71 | private: |
michael@0 | 72 | void initBits(); /* NOTE(review): presumably fills the lookup tables below from list -- confirm in the implementation. */ |
michael@0 | 73 | void overrideIllegal(); /* NOTE(review): presumably applies the contains(FFFD) overrides described for the tables below -- confirm. */ |
michael@0 | 74 | |
michael@0 | 75 | /** |
michael@0 | 76 | * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the |
michael@0 | 77 | * binary search is restricted to the inversion list indexes lo..hi. |
michael@0 | 78 | * |
michael@0 | 79 | * To restrict the search to code points in the range start..end, |
michael@0 | 80 | * pass in |
michael@0 | 81 | * lo=findCodePoint(start) and |
michael@0 | 82 | * hi=findCodePoint(end) |
michael@0 | 83 | * with 0<=lo<=hi<len. |
michael@0 | 84 | * findCodePoint(c) defaults to lo=0 and hi=len-1. |
michael@0 | 85 | * |
michael@0 | 86 | * @param c a character in a subrange of MIN_VALUE..MAX_VALUE |
michael@0 | 87 | * @param lo The lowest index to be returned. |
michael@0 | 88 | * @param hi The highest index to be returned. |
michael@0 | 89 | * @return the smallest integer i in the range lo..hi, |
michael@0 | 90 | * inclusive, such that c < list[i] |
michael@0 | 91 | */ |
michael@0 | 92 | int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const; |
michael@0 | 93 | |
michael@0 | 94 | inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const; /* Defined after this class: returns findCodePoint(c, lo, hi) & 1. */ |
michael@0 | 95 | |
michael@0 | 96 | /* |
michael@0 | 97 | * One byte per ASCII character (0..0x7F), or per UTF-8 trail byte (0x80..0xBF) in lead position. |
michael@0 | 98 | * 0 or 1 for ASCII characters. |
michael@0 | 99 | * The value for trail bytes is the result of contains(FFFD) |
michael@0 | 100 | * for faster validity checking at runtime. |
michael@0 | 101 | */ |
michael@0 | 102 | UBool asciiBytes[0xc0]; |
michael@0 | 103 | |
michael@0 | 104 | /* |
michael@0 | 105 | * One bit per code point from U+0000..U+07FF (64 words of 32 bits). |
michael@0 | 106 | * The bits are organized vertically; consecutive code points |
michael@0 | 107 | * correspond to the same bit positions in consecutive table words. |
michael@0 | 108 | * With code point parts |
michael@0 | 109 | * lead=c{10..6} |
michael@0 | 110 | * trail=c{5..0} |
michael@0 | 111 | * it is set.contains(c)==(table7FF[trail] bit lead) |
michael@0 | 112 | * |
michael@0 | 113 | * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) |
michael@0 | 114 | * for faster validity checking at runtime. |
michael@0 | 115 | */ |
michael@0 | 116 | uint32_t table7FF[64]; |
michael@0 | 117 | |
michael@0 | 118 | /* |
michael@0 | 119 | * Two bits per block of 64 BMP code points (a value bit and a "mixed" bit). |
michael@0 | 120 | * The bits are organized vertically; consecutive 64-code point blocks |
michael@0 | 121 | * correspond to the same bit position in consecutive table words. |
michael@0 | 122 | * With code point parts |
michael@0 | 123 | * lead=c{15..12} |
michael@0 | 124 | * t1=c{11..6} |
michael@0 | 125 | * test bits (lead+16) and lead in bmpBlockBits[t1]. |
michael@0 | 126 | * If the upper bit is 0, then the lower bit indicates if contains(c) |
michael@0 | 127 | * for all code points in the 64-block. |
michael@0 | 128 | * If the upper bit is 1, then the block is mixed and set.contains(c) |
michael@0 | 129 | * must be called. |
michael@0 | 130 | * |
michael@0 | 131 | * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to |
michael@0 | 132 | * the result of contains(FFFD) for faster validity checking at runtime. |
michael@0 | 133 | */ |
michael@0 | 134 | uint32_t bmpBlockBits[64]; |
michael@0 | 135 | |
michael@0 | 136 | /* |
michael@0 | 137 | * Inversion list indexes for restricted binary searches in findCodePoint(), from |
michael@0 | 138 | * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000). |
michael@0 | 139 | * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are |
michael@0 | 140 | * always looked up in the bit tables. |
michael@0 | 141 | * The last pair of indexes is for finding supplementary code points. |
michael@0 | 142 | * NOTE(review): 17 code points are listed for 18 entries; the 18th is presumably the upper search bound for supplementary code points -- confirm. |
michael@0 | 143 | */ |
michael@0 | 144 | int32_t list4kStarts[18]; |
michael@0 | 145 | |
michael@0 | 146 | /* |
michael@0 | 147 | * The inversion list of the parent set, for the slower contains() implementation |
michael@0 | 148 | * for mixed BMP blocks and for supplementary code points. |
michael@0 | 149 | * The list is terminated with list[listLength-1]=0x110000. |
michael@0 | 150 | */ |
michael@0 | 151 | const int32_t *list; |
michael@0 | 152 | int32_t listLength; /* Number of entries in list, including the 0x110000 terminator. */ |
michael@0 | 153 | }; |
michael@0 | 154 | /* Slower lookup in the inversion list, searched only between indexes lo and hi: true if findCodePoint() returns an odd index. */ |
michael@0 | 155 | inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const { |
michael@0 | 156 | return (UBool)(findCodePoint(c, lo, hi) & 1); |
michael@0 | 157 | } |
michael@0 | 158 | |
michael@0 | 159 | U_NAMESPACE_END |
michael@0 | 160 | |
michael@0 | 161 | #endif |