1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/dictionarydata.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,233 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2013, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* dictionarydata.h 1.10 +* 1.11 +* created on: 2012may31 1.12 +* created by: Markus W. Scherer & Maxime Serrano 1.13 +*/ 1.14 + 1.15 +#include "dictionarydata.h" 1.16 +#include "unicode/ucharstrie.h" 1.17 +#include "unicode/bytestrie.h" 1.18 +#include "unicode/udata.h" 1.19 +#include "cmemory.h" 1.20 + 1.21 +#if !UCONFIG_NO_BREAK_ITERATION 1.22 + 1.23 +U_NAMESPACE_BEGIN 1.24 + 1.25 +const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; 1.26 +const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; 1.27 +const int32_t DictionaryData::TRIE_TYPE_MASK = 7; 1.28 +const int32_t DictionaryData::TRIE_HAS_VALUES = 8; 1.29 + 1.30 +const int32_t DictionaryData::TRANSFORM_NONE = 0; 1.31 +const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; 1.32 +const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; 1.33 +const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; 1.34 + 1.35 +DictionaryMatcher::~DictionaryMatcher() { 1.36 +} 1.37 + 1.38 +UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { 1.39 + udata_close(file); 1.40 +} 1.41 + 1.42 +int32_t UCharsDictionaryMatcher::getType() const { 1.43 + return DictionaryData::TRIE_TYPE_UCHARS; 1.44 +} 1.45 + 1.46 +int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { 1.47 + UCharsTrie uct(characters); 1.48 + UChar32 c = utext_next32(text); 1.49 + if (c < 0) { 1.50 + return 0; 1.51 + } 1.52 + UStringTrieResult result = uct.first(c); 1.53 + int32_t numChars = 1; 1.54 + count = 0; 1.55 + for (;;) { 1.56 + if (USTRINGTRIE_HAS_VALUE(result)) { 1.57 + if (count < limit) { 1.58 + if (values != NULL) { 1.59 + values[count] = uct.getValue(); 1.60 + } 1.61 + lengths[count++] = numChars; 1.62 + } 1.63 + if (result == USTRINGTRIE_FINAL_VALUE) { 1.64 + break; 1.65 + } 1.66 + } 1.67 + else if (result == USTRINGTRIE_NO_MATCH) { 1.68 + break; 1.69 + } 1.70 + 1.71 + // TODO: why do we have a text limit if the UText knows its length? 1.72 + if (numChars >= maxLength) { 1.73 + break; 1.74 + } 1.75 + 1.76 + c = utext_next32(text); 1.77 + if (c < 0) { 1.78 + break; 1.79 + } 1.80 + ++numChars; 1.81 + result = uct.next(c); 1.82 + } 1.83 + return numChars; 1.84 +} 1.85 + 1.86 +BytesDictionaryMatcher::~BytesDictionaryMatcher() { 1.87 + udata_close(file); 1.88 +} 1.89 + 1.90 +UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { 1.91 + if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { 1.92 + if (c == 0x200D) { 1.93 + return 0xFF; 1.94 + } else if (c == 0x200C) { 1.95 + return 0xFE; 1.96 + } 1.97 + int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); 1.98 + if (delta < 0 || 0xFD < delta) { 1.99 + return U_SENTINEL; 1.100 + } 1.101 + return (UChar32)delta; 1.102 + } 1.103 + return c; 1.104 +} 1.105 + 1.106 +int32_t BytesDictionaryMatcher::getType() const { 1.107 + return DictionaryData::TRIE_TYPE_BYTES; 1.108 +} 1.109 + 1.110 +int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { 1.111 + BytesTrie bt(characters); 1.112 + UChar32 c = utext_next32(text); 1.113 + if (c < 0) { 1.114 + return 0; 1.115 + } 1.116 + UStringTrieResult result = bt.first(transform(c)); 1.117 + int32_t numChars = 1; 1.118 + count = 0; 1.119 + for (;;) { 1.120 + if (USTRINGTRIE_HAS_VALUE(result)) { 1.121 + if (count < limit) { 1.122 + if (values != NULL) { 1.123 + values[count] = bt.getValue(); 1.124 + } 1.125 + lengths[count++] = numChars; 1.126 + } 1.127 + if (result == USTRINGTRIE_FINAL_VALUE) { 1.128 + break; 1.129 + } 1.130 + } 1.131 + else if (result == USTRINGTRIE_NO_MATCH) { 1.132 + break; 1.133 + } 1.134 + 1.135 + // TODO: why do we have a text limit if the UText knows its length? 1.136 + if (numChars >= maxLength) { 1.137 + break; 1.138 + } 1.139 + 1.140 + c = utext_next32(text); 1.141 + if (c < 0) { 1.142 + break; 1.143 + } 1.144 + ++numChars; 1.145 + result = bt.next(transform(c)); 1.146 + } 1.147 + return numChars; 1.148 +} 1.149 + 1.150 + 1.151 +U_NAMESPACE_END 1.152 + 1.153 +U_NAMESPACE_USE 1.154 + 1.155 +U_CAPI int32_t U_EXPORT2 1.156 +udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, 1.157 + void *outData, UErrorCode *pErrorCode) { 1.158 + const UDataInfo *pInfo; 1.159 + int32_t headerSize; 1.160 + const uint8_t *inBytes; 1.161 + uint8_t *outBytes; 1.162 + const int32_t *inIndexes; 1.163 + int32_t indexes[DictionaryData::IX_COUNT]; 1.164 + int32_t i, offset, size; 1.165 + 1.166 + headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 1.167 + if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; 1.168 + pInfo = (const UDataInfo *)((const char *)inData + 4); 1.169 + if (!(pInfo->dataFormat[0] == 0x44 && 1.170 + pInfo->dataFormat[1] == 0x69 && 1.171 + pInfo->dataFormat[2] == 0x63 && 1.172 + pInfo->dataFormat[3] == 0x74 && 1.173 + pInfo->formatVersion[0] == 1)) { 1.174 + udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", 1.175 + pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); 1.176 + *pErrorCode = U_UNSUPPORTED_ERROR; 1.177 + return 0; 1.178 + } 1.179 + 1.180 + inBytes = (const uint8_t *)inData + headerSize; 1.181 + outBytes = (uint8_t *)outData + headerSize; 1.182 + 1.183 + inIndexes = (const int32_t *)inBytes; 1.184 + if (length >= 0) { 1.185 + length -= headerSize; 1.186 + if (length < (int32_t)(sizeof(indexes))) { 1.187 + udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); 1.188 + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 1.189 + return 0; 1.190 + } 1.191 + } 1.192 + 1.193 + for (i = 0; i < DictionaryData::IX_COUNT; i++) { 1.194 + indexes[i] = udata_readInt32(ds, inIndexes[i]); 1.195 + } 1.196 + 1.197 + size = indexes[DictionaryData::IX_TOTAL_SIZE]; 1.198 + 1.199 + if (length >= 0) { 1.200 + if (length < size) { 1.201 + udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); 1.202 + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 1.203 + return 0; 1.204 + } 1.205 + 1.206 + if (inBytes != outBytes) { 1.207 + uprv_memcpy(outBytes, inBytes, size); 1.208 + } 1.209 + 1.210 + offset = 0; 1.211 + ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); 1.212 + offset = (int32_t)sizeof(indexes); 1.213 + int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 1.214 + int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; 1.215 + 1.216 + if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 1.217 + ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); 1.218 + } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 1.219 + // nothing to do 1.220 + } else { 1.221 + udata_printError(ds, "udict_swap(): unknown trie type!\n"); 1.222 + *pErrorCode = U_UNSUPPORTED_ERROR; 1.223 + return 0; 1.224 + } 1.225 + 1.226 + // these next two sections are empty in the current format, 1.227 + // but may be used later. 1.228 + offset = nextOffset; 1.229 + nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; 1.230 + offset = nextOffset; 1.231 + nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; 1.232 + offset = nextOffset; 1.233 + } 1.234 + return headerSize + size; 1.235 +} 1.236 +#endif