|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * dictionarydata.cpp |
|
7 * |
|
8 * created on: 2012may31 |
|
9 * created by: Markus W. Scherer & Maxime Serrano |
|
10 */ |
|
11 |
|
12 #include "dictionarydata.h" |
|
13 #include "unicode/ucharstrie.h" |
|
14 #include "unicode/bytestrie.h" |
|
15 #include "unicode/udata.h" |
|
16 #include "cmemory.h" |
|
17 |
|
18 #if !UCONFIG_NO_BREAK_ITERATION |
|
19 |
|
20 U_NAMESPACE_BEGIN |
|
21 |
|
22 const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; |
|
23 const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; |
|
24 const int32_t DictionaryData::TRIE_TYPE_MASK = 7; |
|
25 const int32_t DictionaryData::TRIE_HAS_VALUES = 8; |
|
26 |
|
27 const int32_t DictionaryData::TRANSFORM_NONE = 0; |
|
28 const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; |
|
29 const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; |
|
30 const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; |
|
31 |
|
32 DictionaryMatcher::~DictionaryMatcher() { |
|
33 } |
|
34 |
|
35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { |
|
36 udata_close(file); |
|
37 } |
|
38 |
|
39 int32_t UCharsDictionaryMatcher::getType() const { |
|
40 return DictionaryData::TRIE_TYPE_UCHARS; |
|
41 } |
|
42 |
|
43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { |
|
44 UCharsTrie uct(characters); |
|
45 UChar32 c = utext_next32(text); |
|
46 if (c < 0) { |
|
47 return 0; |
|
48 } |
|
49 UStringTrieResult result = uct.first(c); |
|
50 int32_t numChars = 1; |
|
51 count = 0; |
|
52 for (;;) { |
|
53 if (USTRINGTRIE_HAS_VALUE(result)) { |
|
54 if (count < limit) { |
|
55 if (values != NULL) { |
|
56 values[count] = uct.getValue(); |
|
57 } |
|
58 lengths[count++] = numChars; |
|
59 } |
|
60 if (result == USTRINGTRIE_FINAL_VALUE) { |
|
61 break; |
|
62 } |
|
63 } |
|
64 else if (result == USTRINGTRIE_NO_MATCH) { |
|
65 break; |
|
66 } |
|
67 |
|
68 // TODO: why do we have a text limit if the UText knows its length? |
|
69 if (numChars >= maxLength) { |
|
70 break; |
|
71 } |
|
72 |
|
73 c = utext_next32(text); |
|
74 if (c < 0) { |
|
75 break; |
|
76 } |
|
77 ++numChars; |
|
78 result = uct.next(c); |
|
79 } |
|
80 return numChars; |
|
81 } |
|
82 |
|
83 BytesDictionaryMatcher::~BytesDictionaryMatcher() { |
|
84 udata_close(file); |
|
85 } |
|
86 |
|
87 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { |
|
88 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { |
|
89 if (c == 0x200D) { |
|
90 return 0xFF; |
|
91 } else if (c == 0x200C) { |
|
92 return 0xFE; |
|
93 } |
|
94 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); |
|
95 if (delta < 0 || 0xFD < delta) { |
|
96 return U_SENTINEL; |
|
97 } |
|
98 return (UChar32)delta; |
|
99 } |
|
100 return c; |
|
101 } |
|
102 |
|
103 int32_t BytesDictionaryMatcher::getType() const { |
|
104 return DictionaryData::TRIE_TYPE_BYTES; |
|
105 } |
|
106 |
|
107 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { |
|
108 BytesTrie bt(characters); |
|
109 UChar32 c = utext_next32(text); |
|
110 if (c < 0) { |
|
111 return 0; |
|
112 } |
|
113 UStringTrieResult result = bt.first(transform(c)); |
|
114 int32_t numChars = 1; |
|
115 count = 0; |
|
116 for (;;) { |
|
117 if (USTRINGTRIE_HAS_VALUE(result)) { |
|
118 if (count < limit) { |
|
119 if (values != NULL) { |
|
120 values[count] = bt.getValue(); |
|
121 } |
|
122 lengths[count++] = numChars; |
|
123 } |
|
124 if (result == USTRINGTRIE_FINAL_VALUE) { |
|
125 break; |
|
126 } |
|
127 } |
|
128 else if (result == USTRINGTRIE_NO_MATCH) { |
|
129 break; |
|
130 } |
|
131 |
|
132 // TODO: why do we have a text limit if the UText knows its length? |
|
133 if (numChars >= maxLength) { |
|
134 break; |
|
135 } |
|
136 |
|
137 c = utext_next32(text); |
|
138 if (c < 0) { |
|
139 break; |
|
140 } |
|
141 ++numChars; |
|
142 result = bt.next(transform(c)); |
|
143 } |
|
144 return numChars; |
|
145 } |
|
146 |
|
147 |
|
148 U_NAMESPACE_END |
|
149 |
|
150 U_NAMESPACE_USE |
|
151 |
|
152 U_CAPI int32_t U_EXPORT2 |
|
153 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, |
|
154 void *outData, UErrorCode *pErrorCode) { |
|
155 const UDataInfo *pInfo; |
|
156 int32_t headerSize; |
|
157 const uint8_t *inBytes; |
|
158 uint8_t *outBytes; |
|
159 const int32_t *inIndexes; |
|
160 int32_t indexes[DictionaryData::IX_COUNT]; |
|
161 int32_t i, offset, size; |
|
162 |
|
163 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); |
|
164 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; |
|
165 pInfo = (const UDataInfo *)((const char *)inData + 4); |
|
166 if (!(pInfo->dataFormat[0] == 0x44 && |
|
167 pInfo->dataFormat[1] == 0x69 && |
|
168 pInfo->dataFormat[2] == 0x63 && |
|
169 pInfo->dataFormat[3] == 0x74 && |
|
170 pInfo->formatVersion[0] == 1)) { |
|
171 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", |
|
172 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); |
|
173 *pErrorCode = U_UNSUPPORTED_ERROR; |
|
174 return 0; |
|
175 } |
|
176 |
|
177 inBytes = (const uint8_t *)inData + headerSize; |
|
178 outBytes = (uint8_t *)outData + headerSize; |
|
179 |
|
180 inIndexes = (const int32_t *)inBytes; |
|
181 if (length >= 0) { |
|
182 length -= headerSize; |
|
183 if (length < (int32_t)(sizeof(indexes))) { |
|
184 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); |
|
185 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
|
186 return 0; |
|
187 } |
|
188 } |
|
189 |
|
190 for (i = 0; i < DictionaryData::IX_COUNT; i++) { |
|
191 indexes[i] = udata_readInt32(ds, inIndexes[i]); |
|
192 } |
|
193 |
|
194 size = indexes[DictionaryData::IX_TOTAL_SIZE]; |
|
195 |
|
196 if (length >= 0) { |
|
197 if (length < size) { |
|
198 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); |
|
199 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
|
200 return 0; |
|
201 } |
|
202 |
|
203 if (inBytes != outBytes) { |
|
204 uprv_memcpy(outBytes, inBytes, size); |
|
205 } |
|
206 |
|
207 offset = 0; |
|
208 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); |
|
209 offset = (int32_t)sizeof(indexes); |
|
210 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; |
|
211 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; |
|
212 |
|
213 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { |
|
214 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); |
|
215 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { |
|
216 // nothing to do |
|
217 } else { |
|
218 udata_printError(ds, "udict_swap(): unknown trie type!\n"); |
|
219 *pErrorCode = U_UNSUPPORTED_ERROR; |
|
220 return 0; |
|
221 } |
|
222 |
|
223 // these next two sections are empty in the current format, |
|
224 // but may be used later. |
|
225 offset = nextOffset; |
|
226 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; |
|
227 offset = nextOffset; |
|
228 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; |
|
229 offset = nextOffset; |
|
230 } |
|
231 return headerSize + size; |
|
232 } |
|
233 #endif |