intl/icu/source/common/dictionarydata.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 * dictionarydata.cpp
michael@0 7 *
michael@0 8 * created on: 2012may31
michael@0 9 * created by: Markus W. Scherer & Maxime Serrano
michael@0 10 */
michael@0 11
michael@0 12 #include "dictionarydata.h"
michael@0 13 #include "unicode/ucharstrie.h"
michael@0 14 #include "unicode/bytestrie.h"
michael@0 15 #include "unicode/udata.h"
michael@0 16 #include "cmemory.h"
michael@0 17
michael@0 18 #if !UCONFIG_NO_BREAK_ITERATION
michael@0 19
michael@0 20 U_NAMESPACE_BEGIN
michael@0 21
michael@0 22 const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
michael@0 23 const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
michael@0 24 const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
michael@0 25 const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
michael@0 26
michael@0 27 const int32_t DictionaryData::TRANSFORM_NONE = 0;
michael@0 28 const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
michael@0 29 const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
michael@0 30 const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
michael@0 31
michael@0 32 DictionaryMatcher::~DictionaryMatcher() {
michael@0 33 }
michael@0 34
michael@0 35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
michael@0 36 udata_close(file);
michael@0 37 }
michael@0 38
michael@0 39 int32_t UCharsDictionaryMatcher::getType() const {
michael@0 40 return DictionaryData::TRIE_TYPE_UCHARS;
michael@0 41 }
michael@0 42
michael@0 43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
michael@0 44 UCharsTrie uct(characters);
michael@0 45 UChar32 c = utext_next32(text);
michael@0 46 if (c < 0) {
michael@0 47 return 0;
michael@0 48 }
michael@0 49 UStringTrieResult result = uct.first(c);
michael@0 50 int32_t numChars = 1;
michael@0 51 count = 0;
michael@0 52 for (;;) {
michael@0 53 if (USTRINGTRIE_HAS_VALUE(result)) {
michael@0 54 if (count < limit) {
michael@0 55 if (values != NULL) {
michael@0 56 values[count] = uct.getValue();
michael@0 57 }
michael@0 58 lengths[count++] = numChars;
michael@0 59 }
michael@0 60 if (result == USTRINGTRIE_FINAL_VALUE) {
michael@0 61 break;
michael@0 62 }
michael@0 63 }
michael@0 64 else if (result == USTRINGTRIE_NO_MATCH) {
michael@0 65 break;
michael@0 66 }
michael@0 67
michael@0 68 // TODO: why do we have a text limit if the UText knows its length?
michael@0 69 if (numChars >= maxLength) {
michael@0 70 break;
michael@0 71 }
michael@0 72
michael@0 73 c = utext_next32(text);
michael@0 74 if (c < 0) {
michael@0 75 break;
michael@0 76 }
michael@0 77 ++numChars;
michael@0 78 result = uct.next(c);
michael@0 79 }
michael@0 80 return numChars;
michael@0 81 }
michael@0 82
michael@0 83 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
michael@0 84 udata_close(file);
michael@0 85 }
michael@0 86
michael@0 87 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
michael@0 88 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
michael@0 89 if (c == 0x200D) {
michael@0 90 return 0xFF;
michael@0 91 } else if (c == 0x200C) {
michael@0 92 return 0xFE;
michael@0 93 }
michael@0 94 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
michael@0 95 if (delta < 0 || 0xFD < delta) {
michael@0 96 return U_SENTINEL;
michael@0 97 }
michael@0 98 return (UChar32)delta;
michael@0 99 }
michael@0 100 return c;
michael@0 101 }
michael@0 102
michael@0 103 int32_t BytesDictionaryMatcher::getType() const {
michael@0 104 return DictionaryData::TRIE_TYPE_BYTES;
michael@0 105 }
michael@0 106
michael@0 107 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
michael@0 108 BytesTrie bt(characters);
michael@0 109 UChar32 c = utext_next32(text);
michael@0 110 if (c < 0) {
michael@0 111 return 0;
michael@0 112 }
michael@0 113 UStringTrieResult result = bt.first(transform(c));
michael@0 114 int32_t numChars = 1;
michael@0 115 count = 0;
michael@0 116 for (;;) {
michael@0 117 if (USTRINGTRIE_HAS_VALUE(result)) {
michael@0 118 if (count < limit) {
michael@0 119 if (values != NULL) {
michael@0 120 values[count] = bt.getValue();
michael@0 121 }
michael@0 122 lengths[count++] = numChars;
michael@0 123 }
michael@0 124 if (result == USTRINGTRIE_FINAL_VALUE) {
michael@0 125 break;
michael@0 126 }
michael@0 127 }
michael@0 128 else if (result == USTRINGTRIE_NO_MATCH) {
michael@0 129 break;
michael@0 130 }
michael@0 131
michael@0 132 // TODO: why do we have a text limit if the UText knows its length?
michael@0 133 if (numChars >= maxLength) {
michael@0 134 break;
michael@0 135 }
michael@0 136
michael@0 137 c = utext_next32(text);
michael@0 138 if (c < 0) {
michael@0 139 break;
michael@0 140 }
michael@0 141 ++numChars;
michael@0 142 result = bt.next(transform(c));
michael@0 143 }
michael@0 144 return numChars;
michael@0 145 }
michael@0 146
michael@0 147
michael@0 148 U_NAMESPACE_END
michael@0 149
michael@0 150 U_NAMESPACE_USE
michael@0 151
michael@0 152 U_CAPI int32_t U_EXPORT2
michael@0 153 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
michael@0 154 void *outData, UErrorCode *pErrorCode) {
michael@0 155 const UDataInfo *pInfo;
michael@0 156 int32_t headerSize;
michael@0 157 const uint8_t *inBytes;
michael@0 158 uint8_t *outBytes;
michael@0 159 const int32_t *inIndexes;
michael@0 160 int32_t indexes[DictionaryData::IX_COUNT];
michael@0 161 int32_t i, offset, size;
michael@0 162
michael@0 163 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
michael@0 164 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
michael@0 165 pInfo = (const UDataInfo *)((const char *)inData + 4);
michael@0 166 if (!(pInfo->dataFormat[0] == 0x44 &&
michael@0 167 pInfo->dataFormat[1] == 0x69 &&
michael@0 168 pInfo->dataFormat[2] == 0x63 &&
michael@0 169 pInfo->dataFormat[3] == 0x74 &&
michael@0 170 pInfo->formatVersion[0] == 1)) {
michael@0 171 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
michael@0 172 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
michael@0 173 *pErrorCode = U_UNSUPPORTED_ERROR;
michael@0 174 return 0;
michael@0 175 }
michael@0 176
michael@0 177 inBytes = (const uint8_t *)inData + headerSize;
michael@0 178 outBytes = (uint8_t *)outData + headerSize;
michael@0 179
michael@0 180 inIndexes = (const int32_t *)inBytes;
michael@0 181 if (length >= 0) {
michael@0 182 length -= headerSize;
michael@0 183 if (length < (int32_t)(sizeof(indexes))) {
michael@0 184 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
michael@0 185 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 186 return 0;
michael@0 187 }
michael@0 188 }
michael@0 189
michael@0 190 for (i = 0; i < DictionaryData::IX_COUNT; i++) {
michael@0 191 indexes[i] = udata_readInt32(ds, inIndexes[i]);
michael@0 192 }
michael@0 193
michael@0 194 size = indexes[DictionaryData::IX_TOTAL_SIZE];
michael@0 195
michael@0 196 if (length >= 0) {
michael@0 197 if (length < size) {
michael@0 198 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
michael@0 199 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 200 return 0;
michael@0 201 }
michael@0 202
michael@0 203 if (inBytes != outBytes) {
michael@0 204 uprv_memcpy(outBytes, inBytes, size);
michael@0 205 }
michael@0 206
michael@0 207 offset = 0;
michael@0 208 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
michael@0 209 offset = (int32_t)sizeof(indexes);
michael@0 210 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
michael@0 211 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
michael@0 212
michael@0 213 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
michael@0 214 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
michael@0 215 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
michael@0 216 // nothing to do
michael@0 217 } else {
michael@0 218 udata_printError(ds, "udict_swap(): unknown trie type!\n");
michael@0 219 *pErrorCode = U_UNSUPPORTED_ERROR;
michael@0 220 return 0;
michael@0 221 }
michael@0 222
michael@0 223 // these next two sections are empty in the current format,
michael@0 224 // but may be used later.
michael@0 225 offset = nextOffset;
michael@0 226 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
michael@0 227 offset = nextOffset;
michael@0 228 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
michael@0 229 offset = nextOffset;
michael@0 230 }
michael@0 231 return headerSize + size;
michael@0 232 }
michael@0 233 #endif

mercurial