Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * Copyright (C) 2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ******************************************************************************* |
michael@0 | 6 | * dictionarydata.cpp |
michael@0 | 7 | * |
michael@0 | 8 | * created on: 2012may31 |
michael@0 | 9 | * created by: Markus W. Scherer & Maxime Serrano |
michael@0 | 10 | */ |
michael@0 | 11 | |
michael@0 | 12 | #include "dictionarydata.h" |
michael@0 | 13 | #include "unicode/ucharstrie.h" |
michael@0 | 14 | #include "unicode/bytestrie.h" |
michael@0 | 15 | #include "unicode/udata.h" |
michael@0 | 16 | #include "cmemory.h" |
michael@0 | 17 | |
michael@0 | 18 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 19 | |
michael@0 | 20 | U_NAMESPACE_BEGIN |
michael@0 | 21 | |
michael@0 | 22 | const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; |
michael@0 | 23 | const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; |
michael@0 | 24 | const int32_t DictionaryData::TRIE_TYPE_MASK = 7; |
michael@0 | 25 | const int32_t DictionaryData::TRIE_HAS_VALUES = 8; |
michael@0 | 26 | |
michael@0 | 27 | const int32_t DictionaryData::TRANSFORM_NONE = 0; |
michael@0 | 28 | const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; |
michael@0 | 29 | const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; |
michael@0 | 30 | const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; |
michael@0 | 31 | |
michael@0 | 32 | DictionaryMatcher::~DictionaryMatcher() { |
michael@0 | 33 | } |
michael@0 | 34 | |
michael@0 | 35 | UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { |
michael@0 | 36 | udata_close(file); |
michael@0 | 37 | } |
michael@0 | 38 | |
michael@0 | 39 | int32_t UCharsDictionaryMatcher::getType() const { |
michael@0 | 40 | return DictionaryData::TRIE_TYPE_UCHARS; |
michael@0 | 41 | } |
michael@0 | 42 | |
michael@0 | 43 | int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { |
michael@0 | 44 | UCharsTrie uct(characters); |
michael@0 | 45 | UChar32 c = utext_next32(text); |
michael@0 | 46 | if (c < 0) { |
michael@0 | 47 | return 0; |
michael@0 | 48 | } |
michael@0 | 49 | UStringTrieResult result = uct.first(c); |
michael@0 | 50 | int32_t numChars = 1; |
michael@0 | 51 | count = 0; |
michael@0 | 52 | for (;;) { |
michael@0 | 53 | if (USTRINGTRIE_HAS_VALUE(result)) { |
michael@0 | 54 | if (count < limit) { |
michael@0 | 55 | if (values != NULL) { |
michael@0 | 56 | values[count] = uct.getValue(); |
michael@0 | 57 | } |
michael@0 | 58 | lengths[count++] = numChars; |
michael@0 | 59 | } |
michael@0 | 60 | if (result == USTRINGTRIE_FINAL_VALUE) { |
michael@0 | 61 | break; |
michael@0 | 62 | } |
michael@0 | 63 | } |
michael@0 | 64 | else if (result == USTRINGTRIE_NO_MATCH) { |
michael@0 | 65 | break; |
michael@0 | 66 | } |
michael@0 | 67 | |
michael@0 | 68 | // TODO: why do we have a text limit if the UText knows its length? |
michael@0 | 69 | if (numChars >= maxLength) { |
michael@0 | 70 | break; |
michael@0 | 71 | } |
michael@0 | 72 | |
michael@0 | 73 | c = utext_next32(text); |
michael@0 | 74 | if (c < 0) { |
michael@0 | 75 | break; |
michael@0 | 76 | } |
michael@0 | 77 | ++numChars; |
michael@0 | 78 | result = uct.next(c); |
michael@0 | 79 | } |
michael@0 | 80 | return numChars; |
michael@0 | 81 | } |
michael@0 | 82 | |
michael@0 | 83 | BytesDictionaryMatcher::~BytesDictionaryMatcher() { |
michael@0 | 84 | udata_close(file); |
michael@0 | 85 | } |
michael@0 | 86 | |
michael@0 | 87 | UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { |
michael@0 | 88 | if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { |
michael@0 | 89 | if (c == 0x200D) { |
michael@0 | 90 | return 0xFF; |
michael@0 | 91 | } else if (c == 0x200C) { |
michael@0 | 92 | return 0xFE; |
michael@0 | 93 | } |
michael@0 | 94 | int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); |
michael@0 | 95 | if (delta < 0 || 0xFD < delta) { |
michael@0 | 96 | return U_SENTINEL; |
michael@0 | 97 | } |
michael@0 | 98 | return (UChar32)delta; |
michael@0 | 99 | } |
michael@0 | 100 | return c; |
michael@0 | 101 | } |
michael@0 | 102 | |
michael@0 | 103 | int32_t BytesDictionaryMatcher::getType() const { |
michael@0 | 104 | return DictionaryData::TRIE_TYPE_BYTES; |
michael@0 | 105 | } |
michael@0 | 106 | |
michael@0 | 107 | int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { |
michael@0 | 108 | BytesTrie bt(characters); |
michael@0 | 109 | UChar32 c = utext_next32(text); |
michael@0 | 110 | if (c < 0) { |
michael@0 | 111 | return 0; |
michael@0 | 112 | } |
michael@0 | 113 | UStringTrieResult result = bt.first(transform(c)); |
michael@0 | 114 | int32_t numChars = 1; |
michael@0 | 115 | count = 0; |
michael@0 | 116 | for (;;) { |
michael@0 | 117 | if (USTRINGTRIE_HAS_VALUE(result)) { |
michael@0 | 118 | if (count < limit) { |
michael@0 | 119 | if (values != NULL) { |
michael@0 | 120 | values[count] = bt.getValue(); |
michael@0 | 121 | } |
michael@0 | 122 | lengths[count++] = numChars; |
michael@0 | 123 | } |
michael@0 | 124 | if (result == USTRINGTRIE_FINAL_VALUE) { |
michael@0 | 125 | break; |
michael@0 | 126 | } |
michael@0 | 127 | } |
michael@0 | 128 | else if (result == USTRINGTRIE_NO_MATCH) { |
michael@0 | 129 | break; |
michael@0 | 130 | } |
michael@0 | 131 | |
michael@0 | 132 | // TODO: why do we have a text limit if the UText knows its length? |
michael@0 | 133 | if (numChars >= maxLength) { |
michael@0 | 134 | break; |
michael@0 | 135 | } |
michael@0 | 136 | |
michael@0 | 137 | c = utext_next32(text); |
michael@0 | 138 | if (c < 0) { |
michael@0 | 139 | break; |
michael@0 | 140 | } |
michael@0 | 141 | ++numChars; |
michael@0 | 142 | result = bt.next(transform(c)); |
michael@0 | 143 | } |
michael@0 | 144 | return numChars; |
michael@0 | 145 | } |
michael@0 | 146 | |
michael@0 | 147 | |
michael@0 | 148 | U_NAMESPACE_END |
michael@0 | 149 | |
michael@0 | 150 | U_NAMESPACE_USE |
michael@0 | 151 | |
michael@0 | 152 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 153 | udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, |
michael@0 | 154 | void *outData, UErrorCode *pErrorCode) { |
michael@0 | 155 | const UDataInfo *pInfo; |
michael@0 | 156 | int32_t headerSize; |
michael@0 | 157 | const uint8_t *inBytes; |
michael@0 | 158 | uint8_t *outBytes; |
michael@0 | 159 | const int32_t *inIndexes; |
michael@0 | 160 | int32_t indexes[DictionaryData::IX_COUNT]; |
michael@0 | 161 | int32_t i, offset, size; |
michael@0 | 162 | |
michael@0 | 163 | headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); |
michael@0 | 164 | if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; |
michael@0 | 165 | pInfo = (const UDataInfo *)((const char *)inData + 4); |
michael@0 | 166 | if (!(pInfo->dataFormat[0] == 0x44 && |
michael@0 | 167 | pInfo->dataFormat[1] == 0x69 && |
michael@0 | 168 | pInfo->dataFormat[2] == 0x63 && |
michael@0 | 169 | pInfo->dataFormat[3] == 0x74 && |
michael@0 | 170 | pInfo->formatVersion[0] == 1)) { |
michael@0 | 171 | udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", |
michael@0 | 172 | pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); |
michael@0 | 173 | *pErrorCode = U_UNSUPPORTED_ERROR; |
michael@0 | 174 | return 0; |
michael@0 | 175 | } |
michael@0 | 176 | |
michael@0 | 177 | inBytes = (const uint8_t *)inData + headerSize; |
michael@0 | 178 | outBytes = (uint8_t *)outData + headerSize; |
michael@0 | 179 | |
michael@0 | 180 | inIndexes = (const int32_t *)inBytes; |
michael@0 | 181 | if (length >= 0) { |
michael@0 | 182 | length -= headerSize; |
michael@0 | 183 | if (length < (int32_t)(sizeof(indexes))) { |
michael@0 | 184 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); |
michael@0 | 185 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 186 | return 0; |
michael@0 | 187 | } |
michael@0 | 188 | } |
michael@0 | 189 | |
michael@0 | 190 | for (i = 0; i < DictionaryData::IX_COUNT; i++) { |
michael@0 | 191 | indexes[i] = udata_readInt32(ds, inIndexes[i]); |
michael@0 | 192 | } |
michael@0 | 193 | |
michael@0 | 194 | size = indexes[DictionaryData::IX_TOTAL_SIZE]; |
michael@0 | 195 | |
michael@0 | 196 | if (length >= 0) { |
michael@0 | 197 | if (length < size) { |
michael@0 | 198 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); |
michael@0 | 199 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 200 | return 0; |
michael@0 | 201 | } |
michael@0 | 202 | |
michael@0 | 203 | if (inBytes != outBytes) { |
michael@0 | 204 | uprv_memcpy(outBytes, inBytes, size); |
michael@0 | 205 | } |
michael@0 | 206 | |
michael@0 | 207 | offset = 0; |
michael@0 | 208 | ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); |
michael@0 | 209 | offset = (int32_t)sizeof(indexes); |
michael@0 | 210 | int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; |
michael@0 | 211 | int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; |
michael@0 | 212 | |
michael@0 | 213 | if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { |
michael@0 | 214 | ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); |
michael@0 | 215 | } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { |
michael@0 | 216 | // nothing to do |
michael@0 | 217 | } else { |
michael@0 | 218 | udata_printError(ds, "udict_swap(): unknown trie type!\n"); |
michael@0 | 219 | *pErrorCode = U_UNSUPPORTED_ERROR; |
michael@0 | 220 | return 0; |
michael@0 | 221 | } |
michael@0 | 222 | |
michael@0 | 223 | // these next two sections are empty in the current format, |
michael@0 | 224 | // but may be used later. |
michael@0 | 225 | offset = nextOffset; |
michael@0 | 226 | nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; |
michael@0 | 227 | offset = nextOffset; |
michael@0 | 228 | nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; |
michael@0 | 229 | offset = nextOffset; |
michael@0 | 230 | } |
michael@0 | 231 | return headerSize + size; |
michael@0 | 232 | } |
michael@0 | 233 | #endif |