|
1 /* |
|
2 *************************************************************************** |
|
3 * Copyright (C) 1999-2010 International Business Machines Corporation * |
|
4 * and others. All rights reserved. * |
|
5 *************************************************************************** |
|
6 */ |
|
7 |
|
8 #include "unicode/utypes.h" |
|
9 |
|
10 #if !UCONFIG_NO_BREAK_ITERATION |
|
11 |
|
12 #include "unicode/utypes.h" |
|
13 #include "rbbidata.h" |
|
14 #include "rbbirb.h" |
|
15 #include "utrie.h" |
|
16 #include "udatamem.h" |
|
17 #include "cmemory.h" |
|
18 #include "cstring.h" |
|
19 #include "umutex.h" |
|
20 |
|
21 #include "uassert.h" |
|
22 |
|
23 |
|
24 //----------------------------------------------------------------------------------- |
|
25 // |
|
26 // Trie access folding function. Copied as-is from properties code in uchar.c |
|
27 // |
|
28 //----------------------------------------------------------------------------------- |
|
29 U_CDECL_BEGIN |
|
30 static int32_t U_CALLCONV |
|
31 getFoldingOffset(uint32_t data) { |
|
32 /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */ |
|
33 if(data&0x8000) { |
|
34 return (int32_t)(data&0x7fff); |
|
35 } else { |
|
36 return 0; |
|
37 } |
|
38 } |
|
39 U_CDECL_END |
|
40 |
|
41 U_NAMESPACE_BEGIN |
|
42 |
|
43 //----------------------------------------------------------------------------- |
|
44 // |
|
45 // Constructors. |
|
46 // |
|
47 //----------------------------------------------------------------------------- |
|
48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) { |
|
49 init(data, status); |
|
50 } |
|
51 |
|
52 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { |
|
53 init(data, status); |
|
54 fDontFreeData = TRUE; |
|
55 } |
|
56 |
|
57 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { |
|
58 const RBBIDataHeader *d = (const RBBIDataHeader *) |
|
59 // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size); |
|
60 // taking into consideration the padding added in by udata_write |
|
61 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); |
|
62 init(d, status); |
|
63 fUDataMem = udm; |
|
64 } |
|
65 |
|
66 //----------------------------------------------------------------------------- |
|
67 // |
|
68 // init(). Does most of the work of construction, shared between the |
|
69 // constructors. |
|
70 // |
|
71 //----------------------------------------------------------------------------- |
|
72 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { |
|
73 if (U_FAILURE(status)) { |
|
74 return; |
|
75 } |
|
76 fHeader = data; |
|
77 if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3) |
|
78 { |
|
79 status = U_INVALID_FORMAT_ERROR; |
|
80 return; |
|
81 } |
|
82 // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 |
|
83 // that is no longer supported. At that time fFormatVersion was |
|
84 // an int32_t field, rather than an array of 4 bytes. |
|
85 |
|
86 fDontFreeData = FALSE; |
|
87 fUDataMem = NULL; |
|
88 fReverseTable = NULL; |
|
89 fSafeFwdTable = NULL; |
|
90 fSafeRevTable = NULL; |
|
91 if (data->fFTableLen != 0) { |
|
92 fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); |
|
93 } |
|
94 if (data->fRTableLen != 0) { |
|
95 fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); |
|
96 } |
|
97 if (data->fSFTableLen != 0) { |
|
98 fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable); |
|
99 } |
|
100 if (data->fSRTableLen != 0) { |
|
101 fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable); |
|
102 } |
|
103 |
|
104 |
|
105 utrie_unserialize(&fTrie, |
|
106 (uint8_t *)data + fHeader->fTrie, |
|
107 fHeader->fTrieLen, |
|
108 &status); |
|
109 if (U_FAILURE(status)) { |
|
110 return; |
|
111 } |
|
112 fTrie.getFoldingOffset=getFoldingOffset; |
|
113 |
|
114 |
|
115 fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); |
|
116 fRuleString.setTo(TRUE, fRuleSource, -1); |
|
117 U_ASSERT(data->fRuleSourceLen > 0); |
|
118 |
|
119 fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); |
|
120 fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); |
|
121 |
|
122 fRefCount = 1; |
|
123 |
|
124 #ifdef RBBI_DEBUG |
|
125 char *debugEnv = getenv("U_RBBIDEBUG"); |
|
126 if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} |
|
127 #endif |
|
128 } |
|
129 |
|
130 |
|
131 //----------------------------------------------------------------------------- |
|
132 // |
|
133 // Destructor. Don't call this - use removeReference() instead. |
|
134 // |
|
135 //----------------------------------------------------------------------------- |
|
136 RBBIDataWrapper::~RBBIDataWrapper() { |
|
137 U_ASSERT(fRefCount == 0); |
|
138 if (fUDataMem) { |
|
139 udata_close(fUDataMem); |
|
140 } else if (!fDontFreeData) { |
|
141 uprv_free((void *)fHeader); |
|
142 } |
|
143 } |
|
144 |
|
145 |
|
146 |
|
147 //----------------------------------------------------------------------------- |
|
148 // |
|
149 // Operator == Consider two RBBIDataWrappers to be equal if they |
|
150 // refer to the same underlying data. Although |
|
151 // the data wrappers are normally shared between |
|
152 // iterator instances, it's possible to independently |
|
153 // open the same data twice, and get two instances, which |
|
154 // should still be ==. |
|
155 // |
|
156 //----------------------------------------------------------------------------- |
|
157 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { |
|
158 if (fHeader == other.fHeader) { |
|
159 return TRUE; |
|
160 } |
|
161 if (fHeader->fLength != other.fHeader->fLength) { |
|
162 return FALSE; |
|
163 } |
|
164 if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { |
|
165 return TRUE; |
|
166 } |
|
167 return FALSE; |
|
168 } |
|
169 |
|
170 int32_t RBBIDataWrapper::hashCode() { |
|
171 return fHeader->fFTableLen; |
|
172 } |
|
173 |
|
174 |
|
175 |
|
176 //----------------------------------------------------------------------------- |
|
177 // |
|
178 // Reference Counting. A single RBBIDataWrapper object is shared among |
|
179 // however many RulesBasedBreakIterator instances are |
|
180 // referencing the same data. |
|
181 // |
|
182 //----------------------------------------------------------------------------- |
|
183 void RBBIDataWrapper::removeReference() { |
|
184 if (umtx_atomic_dec(&fRefCount) == 0) { |
|
185 delete this; |
|
186 } |
|
187 } |
|
188 |
|
189 |
|
190 RBBIDataWrapper *RBBIDataWrapper::addReference() { |
|
191 umtx_atomic_inc(&fRefCount); |
|
192 return this; |
|
193 } |
|
194 |
|
195 |
|
196 |
|
197 //----------------------------------------------------------------------------- |
|
198 // |
|
199 // getRuleSourceString |
|
200 // |
|
201 //----------------------------------------------------------------------------- |
|
202 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { |
|
203 return fRuleString; |
|
204 } |
|
205 |
|
206 |
|
207 //----------------------------------------------------------------------------- |
|
208 // |
|
209 // print - debugging function to dump the runtime data tables. |
|
210 // |
|
211 //----------------------------------------------------------------------------- |
|
#ifdef RBBI_DEBUG
//
//  printTable - debug dump of one state table.
//
//    heading - title printed above the table.
//    table   - the state table to dump; may be NULL, in which case only the
//              heading, the column header and a "null table" line are printed.
//
//  One column is printed per character category (fHeader->fCatCount), one row
//  per state.
//
void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
    const uint32_t catCount = fHeader->fCatCount;
    uint32_t       col;

    RBBIDebugPrintf(" %s\n", heading);

    // Column header line, then a rule of matching width.
    RBBIDebugPrintf("State | Acc LA TagIx");
    for (col = 0; col < catCount; col++) {
        RBBIDebugPrintf("%3d ", col);
    }
    RBBIDebugPrintf("\n------|---------------");
    for (col = 0; col < catCount; col++) {
        RBBIDebugPrintf("----");
    }
    RBBIDebugPrintf("\n");

    if (table == NULL) {
        RBBIDebugPrintf(" N U L L T A B L E\n\n");
        return;
    }

    for (uint32_t state = 0; state < table->fNumStates; state++) {
        // Rows are fRowLen bytes apart, starting at fTableData.
        RBBIStateTableRow *row = (RBBIStateTableRow *)
                                  (table->fTableData + (table->fRowLen * state));
        RBBIDebugPrintf("%4d | %3d %3d %3d ", state, row->fAccepting, row->fLookAhead, row->fTagIdx);
        for (col = 0; col < catCount; col++) {
            RBBIDebugPrintf("%3d ", row->fNextState[col]);
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n");
}
#endif
|
242 |
|
243 |
|
#ifdef RBBI_DEBUG
//
//  printData - debug dump of the whole data image: header summary, the four
//              state tables (via printTable), and the rule source text.
//
void RBBIDataWrapper::printData() {
    const RBBIDataHeader *hdr = fHeader;

    RBBIDebugPrintf("RBBI Data at %p\n", (void *)hdr);
    RBBIDebugPrintf(" Version = {%d %d %d %d}\n",
                    hdr->fFormatVersion[0], hdr->fFormatVersion[1],
                    hdr->fFormatVersion[2], hdr->fFormatVersion[3]);
    RBBIDebugPrintf(" total length of data = %d\n", hdr->fLength);
    RBBIDebugPrintf(" number of character categories = %d\n\n", hdr->fCatCount);

    printTable("Forward State Transition Table",      fForwardTable);
    printTable("Reverse State Transition Table",      fReverseTable);
    printTable("Safe Forward State Transition Table", fSafeFwdTable);
    printTable("Safe Reverse State Transition Table", fSafeRevTable);

    // The rule source is NUL terminated; print it one code unit at a time.
    RBBIDebugPrintf("\nOrignal Rules source:\n");
    for (const UChar *p = fRuleSource; *p != 0; ++p) {
        RBBIDebugPrintf("%c", *p);
    }
    RBBIDebugPrintf("\n\n");
}
#endif
|
264 |
|
265 |
|
266 U_NAMESPACE_END |
|
267 U_NAMESPACE_USE |
|
268 |
|
269 //----------------------------------------------------------------------------- |
|
270 // |
|
271 // ubrk_swap - byte swap and char encoding swap of RBBI data |
|
272 // |
|
273 //----------------------------------------------------------------------------- |
|
274 |
|
275 U_CAPI int32_t U_EXPORT2 |
|
276 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, |
|
277 UErrorCode *status) { |
|
278 |
|
279 if (status == NULL || U_FAILURE(*status)) { |
|
280 return 0; |
|
281 } |
|
282 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
|
283 *status=U_ILLEGAL_ARGUMENT_ERROR; |
|
284 return 0; |
|
285 } |
|
286 |
|
287 // |
|
288 // Check that the data header is for for break data. |
|
289 // (Header contents are defined in genbrk.cpp) |
|
290 // |
|
291 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); |
|
292 if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ |
|
293 pInfo->dataFormat[1]==0x72 && |
|
294 pInfo->dataFormat[2]==0x6b && |
|
295 pInfo->dataFormat[3]==0x20 && |
|
296 pInfo->formatVersion[0]==3 )) { |
|
297 udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", |
|
298 pInfo->dataFormat[0], pInfo->dataFormat[1], |
|
299 pInfo->dataFormat[2], pInfo->dataFormat[3], |
|
300 pInfo->formatVersion[0]); |
|
301 *status=U_UNSUPPORTED_ERROR; |
|
302 return 0; |
|
303 } |
|
304 |
|
305 // |
|
306 // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific |
|
307 // RBBIDataHeader). This swap also conveniently gets us |
|
308 // the size of the ICU d.h., which lets us locate the start |
|
309 // of the RBBI specific data. |
|
310 // |
|
311 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); |
|
312 |
|
313 |
|
314 // |
|
315 // Get the RRBI Data Header, and check that it appears to be OK. |
|
316 // |
|
317 // Note: ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually |
|
318 // an int32_t with a value of 1. Starting with ICU 3.4, |
|
319 // RBBI's fDataFormat matches the dataFormat field from the |
|
320 // UDataInfo header, four int8_t bytes. The value is {3,1,0,0} |
|
321 // |
|
322 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
|
323 RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; |
|
324 if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || |
|
325 rbbiDH->fFormatVersion[0] != 3 || |
|
326 ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) |
|
327 { |
|
328 udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n"); |
|
329 *status=U_UNSUPPORTED_ERROR; |
|
330 return 0; |
|
331 } |
|
332 |
|
333 // |
|
334 // Prefight operation? Just return the size |
|
335 // |
|
336 int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); |
|
337 int32_t totalSize = headerSize + breakDataLength; |
|
338 if (length < 0) { |
|
339 return totalSize; |
|
340 } |
|
341 |
|
342 // |
|
343 // Check that length passed in is consistent with length from RBBI data header. |
|
344 // |
|
345 if (length < totalSize) { |
|
346 udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n", |
|
347 breakDataLength); |
|
348 *status=U_INDEX_OUTOFBOUNDS_ERROR; |
|
349 return 0; |
|
350 } |
|
351 |
|
352 |
|
353 // |
|
354 // Swap the Data. Do the data itself first, then the RBBI Data Header, because |
|
355 // we need to reference the header to locate the data, and an |
|
356 // inplace swap of the header leaves it unusable. |
|
357 // |
|
358 uint8_t *outBytes = (uint8_t *)outData + headerSize; |
|
359 RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; |
|
360 |
|
361 int32_t tableStartOffset; |
|
362 int32_t tableLength; |
|
363 |
|
364 // |
|
365 // If not swapping in place, zero out the output buffer before starting. |
|
366 // Individual tables and other data items within are aligned to 8 byte boundaries |
|
367 // when originally created. Any unused space between items needs to be zero. |
|
368 // |
|
369 if (inBytes != outBytes) { |
|
370 uprv_memset(outBytes, 0, breakDataLength); |
|
371 } |
|
372 |
|
373 // |
|
374 // Each state table begins with several 32 bit fields. Calculate the size |
|
375 // in bytes of these. |
|
376 // |
|
377 int32_t topSize = offsetof(RBBIStateTable, fTableData); |
|
378 |
|
379 // Forward state table. |
|
380 tableStartOffset = ds->readUInt32(rbbiDH->fFTable); |
|
381 tableLength = ds->readUInt32(rbbiDH->fFTableLen); |
|
382 |
|
383 if (tableLength > 0) { |
|
384 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, |
|
385 outBytes+tableStartOffset, status); |
|
386 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, |
|
387 outBytes+tableStartOffset+topSize, status); |
|
388 } |
|
389 |
|
390 // Reverse state table. Same layout as forward table, above. |
|
391 tableStartOffset = ds->readUInt32(rbbiDH->fRTable); |
|
392 tableLength = ds->readUInt32(rbbiDH->fRTableLen); |
|
393 |
|
394 if (tableLength > 0) { |
|
395 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, |
|
396 outBytes+tableStartOffset, status); |
|
397 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, |
|
398 outBytes+tableStartOffset+topSize, status); |
|
399 } |
|
400 |
|
401 // Safe Forward state table. Same layout as forward table, above. |
|
402 tableStartOffset = ds->readUInt32(rbbiDH->fSFTable); |
|
403 tableLength = ds->readUInt32(rbbiDH->fSFTableLen); |
|
404 |
|
405 if (tableLength > 0) { |
|
406 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, |
|
407 outBytes+tableStartOffset, status); |
|
408 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, |
|
409 outBytes+tableStartOffset+topSize, status); |
|
410 } |
|
411 |
|
412 // Safe Reverse state table. Same layout as forward table, above. |
|
413 tableStartOffset = ds->readUInt32(rbbiDH->fSRTable); |
|
414 tableLength = ds->readUInt32(rbbiDH->fSRTableLen); |
|
415 |
|
416 if (tableLength > 0) { |
|
417 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, |
|
418 outBytes+tableStartOffset, status); |
|
419 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, |
|
420 outBytes+tableStartOffset+topSize, status); |
|
421 } |
|
422 |
|
423 // Trie table for character categories |
|
424 utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), |
|
425 outBytes+ds->readUInt32(rbbiDH->fTrie), status); |
|
426 |
|
427 // Source Rules Text. It's UChar data |
|
428 ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), |
|
429 outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); |
|
430 |
|
431 // Table of rule status values. It's all int_32 values |
|
432 ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), |
|
433 outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); |
|
434 |
|
435 // And, last, the header. |
|
436 // It is all int32_t values except for fFormataVersion, which is an array of four bytes. |
|
437 // Swap the whole thing as int32_t, then re-swap the one field. |
|
438 // |
|
439 ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); |
|
440 ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); |
|
441 |
|
442 return totalSize; |
|
443 } |
|
444 |
|
445 |
|
446 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |