|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2003-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: ucmstate.c |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2003oct09 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * This file handles ICU .ucm file state information as part of the ucm module. |
|
17 * Most of this code used to be in makeconv.c. |
|
18 */ |
|
19 |
|
20 #include "unicode/utypes.h" |
|
21 #include "cstring.h" |
|
22 #include "cmemory.h" |
|
23 #include "uarrsort.h" |
|
24 #include "ucnvmbcs.h" |
|
25 #include "ucnv_ext.h" |
|
26 #include "uparse.h" |
|
27 #include "ucm.h" |
|
28 #include <stdio.h> |
|
29 |
|
30 #if !UCONFIG_NO_CONVERSION |
|
31 |
|
32 /* MBCS state handling ------------------------------------------------------ */ |
|
33 |
|
34 /* |
|
35 * state table row grammar (ebnf-style): |
|
36 * (whitespace is allowed between all tokens) |
|
37 * |
|
38 * row=[[firstentry ','] entry (',' entry)*] |
|
39 * firstentry="initial" | "surrogates" |
|
40 * (initial state (default for state 0), output is all surrogate pairs) |
|
41 * entry=range [':' nextstate] ['.' action] |
|
42 * range=number ['-' number] |
|
43 * nextstate=number |
|
44 * (0..7f) |
|
45 * action='u' | 's' | 'p' | 'i' |
|
46 * (unassigned, state change only, surrogate pair, illegal) |
|
47 * number=(1- or 2-digit hexadecimal number) |
|
48 */ |
|
49 static const char * |
|
50 parseState(const char *s, int32_t state[256], uint32_t *pFlags) { |
|
51 const char *t; |
|
52 uint32_t start, end, i; |
|
53 int32_t entry; |
|
54 |
|
55 /* initialize the state: all illegal with U+ffff */ |
|
56 for(i=0; i<256; ++i) { |
|
57 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff); |
|
58 } |
|
59 |
|
60 /* skip leading white space */ |
|
61 s=u_skipWhitespace(s); |
|
62 |
|
63 /* is there an "initial" or "surrogates" directive? */ |
|
64 if(uprv_strncmp("initial", s, 7)==0) { |
|
65 *pFlags=MBCS_STATE_FLAG_DIRECT; |
|
66 s=u_skipWhitespace(s+7); |
|
67 if(*s++!=',') { |
|
68 return s-1; |
|
69 } |
|
70 } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) { |
|
71 *pFlags=MBCS_STATE_FLAG_SURROGATES; |
|
72 s=u_skipWhitespace(s+10); |
|
73 if(*s++!=',') { |
|
74 return s-1; |
|
75 } |
|
76 } else if(*s==0) { |
|
77 /* empty state row: all-illegal */ |
|
78 return NULL; |
|
79 } |
|
80 |
|
81 for(;;) { |
|
82 /* read an entry, the start of the range first */ |
|
83 s=u_skipWhitespace(s); |
|
84 start=uprv_strtoul(s, (char **)&t, 16); |
|
85 if(s==t || 0xff<start) { |
|
86 return s; |
|
87 } |
|
88 s=u_skipWhitespace(t); |
|
89 |
|
90 /* read the end of the range if there is one */ |
|
91 if(*s=='-') { |
|
92 s=u_skipWhitespace(s+1); |
|
93 end=uprv_strtoul(s, (char **)&t, 16); |
|
94 if(s==t || end<start || 0xff<end) { |
|
95 return s; |
|
96 } |
|
97 s=u_skipWhitespace(t); |
|
98 } else { |
|
99 end=start; |
|
100 } |
|
101 |
|
102 /* determine the state entrys for this range */ |
|
103 if(*s!=':' && *s!='.') { |
|
104 /* the default is: final state with valid entries */ |
|
105 entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0); |
|
106 } else { |
|
107 entry=MBCS_ENTRY_TRANSITION(0, 0); |
|
108 if(*s==':') { |
|
109 /* get the next state, default to 0 */ |
|
110 s=u_skipWhitespace(s+1); |
|
111 i=uprv_strtoul(s, (char **)&t, 16); |
|
112 if(s!=t) { |
|
113 if(0x7f<i) { |
|
114 return s; |
|
115 } |
|
116 s=u_skipWhitespace(t); |
|
117 entry=MBCS_ENTRY_SET_STATE(entry, i); |
|
118 } |
|
119 } |
|
120 |
|
121 /* get the state action, default to valid */ |
|
122 if(*s=='.') { |
|
123 /* this is a final state */ |
|
124 entry=MBCS_ENTRY_SET_FINAL(entry); |
|
125 |
|
126 s=u_skipWhitespace(s+1); |
|
127 if(*s=='u') { |
|
128 /* unassigned set U+fffe */ |
|
129 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); |
|
130 s=u_skipWhitespace(s+1); |
|
131 } else if(*s=='p') { |
|
132 if(*pFlags!=MBCS_STATE_FLAG_DIRECT) { |
|
133 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR); |
|
134 } else { |
|
135 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); |
|
136 } |
|
137 s=u_skipWhitespace(s+1); |
|
138 } else if(*s=='s') { |
|
139 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY); |
|
140 s=u_skipWhitespace(s+1); |
|
141 } else if(*s=='i') { |
|
142 /* illegal set U+ffff */ |
|
143 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff); |
|
144 s=u_skipWhitespace(s+1); |
|
145 } else { |
|
146 /* default to valid */ |
|
147 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); |
|
148 } |
|
149 } else { |
|
150 /* this is an intermediate state, nothing to do */ |
|
151 } |
|
152 } |
|
153 |
|
154 /* adjust "final valid" states according to the state flags */ |
|
155 if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) { |
|
156 switch(*pFlags) { |
|
157 case 0: |
|
158 /* no adjustment */ |
|
159 break; |
|
160 case MBCS_STATE_FLAG_DIRECT: |
|
161 /* set the valid-direct code point to "unassigned"==0xfffe */ |
|
162 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe); |
|
163 break; |
|
164 case MBCS_STATE_FLAG_SURROGATES: |
|
165 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0); |
|
166 break; |
|
167 default: |
|
168 break; |
|
169 } |
|
170 } |
|
171 |
|
172 /* set this entry for the range */ |
|
173 for(i=start; i<=end; ++i) { |
|
174 state[i]=entry; |
|
175 } |
|
176 |
|
177 if(*s==',') { |
|
178 ++s; |
|
179 } else { |
|
180 return *s==0 ? NULL : s; |
|
181 } |
|
182 } |
|
183 } |
|
184 |
|
185 U_CAPI void U_EXPORT2 |
|
186 ucm_addState(UCMStates *states, const char *s) { |
|
187 const char *error; |
|
188 |
|
189 if(states->countStates==MBCS_MAX_STATE_COUNT) { |
|
190 fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT); |
|
191 exit(U_INVALID_TABLE_FORMAT); |
|
192 } |
|
193 |
|
194 error=parseState(s, states->stateTable[states->countStates], |
|
195 &states->stateFlags[states->countStates]); |
|
196 if(error!=NULL) { |
|
197 fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error); |
|
198 exit(U_INVALID_TABLE_FORMAT); |
|
199 } |
|
200 |
|
201 ++states->countStates; |
|
202 } |
|
203 |
|
204 U_CAPI UBool U_EXPORT2 |
|
205 ucm_parseHeaderLine(UCMFile *ucm, |
|
206 char *line, char **pKey, char **pValue) { |
|
207 UCMStates *states; |
|
208 char *s, *end; |
|
209 char c; |
|
210 |
|
211 states=&ucm->states; |
|
212 |
|
213 /* remove comments and trailing CR and LF and remove whitespace from the end */ |
|
214 for(end=line; (c=*end)!=0; ++end) { |
|
215 if(c=='#' || c=='\r' || c=='\n') { |
|
216 break; |
|
217 } |
|
218 } |
|
219 while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) { |
|
220 --end; |
|
221 } |
|
222 *end=0; |
|
223 |
|
224 /* skip leading white space and ignore empty lines */ |
|
225 s=(char *)u_skipWhitespace(line); |
|
226 if(*s==0) { |
|
227 return TRUE; |
|
228 } |
|
229 |
|
230 /* stop at the beginning of the mapping section */ |
|
231 if(uprv_memcmp(s, "CHARMAP", 7)==0) { |
|
232 return FALSE; |
|
233 } |
|
234 |
|
235 /* get the key name, bracketed in <> */ |
|
236 if(*s!='<') { |
|
237 fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line); |
|
238 exit(U_INVALID_TABLE_FORMAT); |
|
239 } |
|
240 *pKey=++s; |
|
241 while(*s!='>') { |
|
242 if(*s==0) { |
|
243 fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line); |
|
244 exit(U_INVALID_TABLE_FORMAT); |
|
245 } |
|
246 ++s; |
|
247 } |
|
248 *s=0; |
|
249 |
|
250 /* get the value string, possibly quoted */ |
|
251 s=(char *)u_skipWhitespace(s+1); |
|
252 if(*s!='"') { |
|
253 *pValue=s; |
|
254 } else { |
|
255 /* remove the quotes */ |
|
256 *pValue=s+1; |
|
257 if(end>*pValue && *(end-1)=='"') { |
|
258 *--end=0; |
|
259 } |
|
260 } |
|
261 |
|
262 /* collect the information from the header field, ignore unknown keys */ |
|
263 if(uprv_strcmp(*pKey, "uconv_class")==0) { |
|
264 if(uprv_strcmp(*pValue, "DBCS")==0) { |
|
265 states->conversionType=UCNV_DBCS; |
|
266 } else if(uprv_strcmp(*pValue, "SBCS")==0) { |
|
267 states->conversionType = UCNV_SBCS; |
|
268 } else if(uprv_strcmp(*pValue, "MBCS")==0) { |
|
269 states->conversionType = UCNV_MBCS; |
|
270 } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) { |
|
271 states->conversionType = UCNV_EBCDIC_STATEFUL; |
|
272 } else { |
|
273 fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue); |
|
274 exit(U_INVALID_TABLE_FORMAT); |
|
275 } |
|
276 return TRUE; |
|
277 } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) { |
|
278 c=**pValue; |
|
279 if('1'<=c && c<='4' && (*pValue)[1]==0) { |
|
280 states->maxCharLength=(int8_t)(c-'0'); |
|
281 states->outputType=(int8_t)(states->maxCharLength-1); |
|
282 } else { |
|
283 fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue); |
|
284 exit(U_INVALID_TABLE_FORMAT); |
|
285 } |
|
286 return TRUE; |
|
287 } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) { |
|
288 c=**pValue; |
|
289 if('1'<=c && c<='4' && (*pValue)[1]==0) { |
|
290 states->minCharLength=(int8_t)(c-'0'); |
|
291 } else { |
|
292 fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue); |
|
293 exit(U_INVALID_TABLE_FORMAT); |
|
294 } |
|
295 return TRUE; |
|
296 } else if(uprv_strcmp(*pKey, "icu:state")==0) { |
|
297 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */ |
|
298 switch(states->conversionType) { |
|
299 case UCNV_SBCS: |
|
300 case UCNV_DBCS: |
|
301 case UCNV_EBCDIC_STATEFUL: |
|
302 states->conversionType=UCNV_MBCS; |
|
303 break; |
|
304 case UCNV_MBCS: |
|
305 break; |
|
306 default: |
|
307 fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n"); |
|
308 exit(U_INVALID_TABLE_FORMAT); |
|
309 } |
|
310 |
|
311 if(states->maxCharLength==0) { |
|
312 fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n"); |
|
313 exit(U_INVALID_TABLE_FORMAT); |
|
314 } |
|
315 ucm_addState(states, *pValue); |
|
316 return TRUE; |
|
317 } else if(uprv_strcmp(*pKey, "icu:base")==0) { |
|
318 if(**pValue==0) { |
|
319 fprintf(stderr, "ucm error: <icu:base> without a base table name\n"); |
|
320 exit(U_INVALID_TABLE_FORMAT); |
|
321 } |
|
322 uprv_strcpy(ucm->baseName, *pValue); |
|
323 return TRUE; |
|
324 } |
|
325 |
|
326 return FALSE; |
|
327 } |
|
328 |
|
329 /* post-processing ---------------------------------------------------------- */ |
|
330 |
|
331 static int32_t |
|
332 sumUpStates(UCMStates *states) { |
|
333 int32_t entry, sum, state, cell, count; |
|
334 UBool allStatesReady; |
|
335 |
|
336 /* |
|
337 * Sum up the offsets for all states. |
|
338 * In each final state (where there are only final entries), |
|
339 * the offsets add up directly. |
|
340 * In all other state table rows, for each transition entry to another state, |
|
341 * the offsets sum of that state needs to be added. |
|
342 * This is achieved in at most countStates iterations. |
|
343 */ |
|
344 allStatesReady=FALSE; |
|
345 for(count=states->countStates; !allStatesReady && count>=0; --count) { |
|
346 allStatesReady=TRUE; |
|
347 for(state=states->countStates-1; state>=0; --state) { |
|
348 if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) { |
|
349 allStatesReady=FALSE; |
|
350 sum=0; |
|
351 |
|
352 /* at first, add up only the final delta offsets to keep them <512 */ |
|
353 for(cell=0; cell<256; ++cell) { |
|
354 entry=states->stateTable[state][cell]; |
|
355 if(MBCS_ENTRY_IS_FINAL(entry)) { |
|
356 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { |
|
357 case MBCS_STATE_VALID_16: |
|
358 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); |
|
359 sum+=1; |
|
360 break; |
|
361 case MBCS_STATE_VALID_16_PAIR: |
|
362 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); |
|
363 sum+=2; |
|
364 break; |
|
365 default: |
|
366 /* no addition */ |
|
367 break; |
|
368 } |
|
369 } |
|
370 } |
|
371 |
|
372 /* now, add up the delta offsets for the transitional entries */ |
|
373 for(cell=0; cell<256; ++cell) { |
|
374 entry=states->stateTable[state][cell]; |
|
375 if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|
376 if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) { |
|
377 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum); |
|
378 sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)]; |
|
379 } else { |
|
380 /* that next state does not have a sum yet, we cannot finish the one for this state */ |
|
381 sum=-1; |
|
382 break; |
|
383 } |
|
384 } |
|
385 } |
|
386 |
|
387 if(sum!=-1) { |
|
388 states->stateOffsetSum[state]=sum; |
|
389 states->stateFlags[state]|=MBCS_STATE_FLAG_READY; |
|
390 } |
|
391 } |
|
392 } |
|
393 } |
|
394 |
|
395 if(!allStatesReady) { |
|
396 fprintf(stderr, "ucm error: the state table contains loops\n"); |
|
397 exit(U_INVALID_TABLE_FORMAT); |
|
398 } |
|
399 |
|
400 /* |
|
401 * For all "direct" (i.e., initial) states>0, |
|
402 * the offsets need to be increased by the sum of |
|
403 * the previous initial states. |
|
404 */ |
|
405 sum=states->stateOffsetSum[0]; |
|
406 for(state=1; state<states->countStates; ++state) { |
|
407 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { |
|
408 int32_t sum2=sum; |
|
409 sum+=states->stateOffsetSum[state]; |
|
410 for(cell=0; cell<256; ++cell) { |
|
411 entry=states->stateTable[state][cell]; |
|
412 if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|
413 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2); |
|
414 } |
|
415 } |
|
416 } |
|
417 } |
|
418 |
|
419 /* round up to the next even number to have the following data 32-bit-aligned */ |
|
420 return states->countToUCodeUnits=(sum+1)&~1; |
|
421 } |
|
422 |
|
423 U_CAPI void U_EXPORT2 |
|
424 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) { |
|
425 int32_t entry, state, cell, count; |
|
426 |
|
427 if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) { |
|
428 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); |
|
429 exit(U_INVALID_TABLE_FORMAT); |
|
430 } |
|
431 |
|
432 if(states->countStates==0) { |
|
433 switch(states->conversionType) { |
|
434 case UCNV_SBCS: |
|
435 /* SBCS: use MBCS data structure with a default state table */ |
|
436 if(states->maxCharLength!=1) { |
|
437 fprintf(stderr, "error: SBCS codepage with max B/char!=1\n"); |
|
438 exit(U_INVALID_TABLE_FORMAT); |
|
439 } |
|
440 states->conversionType=UCNV_MBCS; |
|
441 ucm_addState(states, "0-ff"); |
|
442 break; |
|
443 case UCNV_MBCS: |
|
444 fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n"); |
|
445 exit(U_INVALID_TABLE_FORMAT); |
|
446 break; |
|
447 case UCNV_EBCDIC_STATEFUL: |
|
448 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */ |
|
449 if(states->minCharLength!=1 || states->maxCharLength!=2) { |
|
450 fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n"); |
|
451 exit(U_INVALID_TABLE_FORMAT); |
|
452 } |
|
453 states->conversionType=UCNV_MBCS; |
|
454 ucm_addState(states, "0-ff, e:1.s, f:0.s"); |
|
455 ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4"); |
|
456 ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i"); |
|
457 ucm_addState(states, "0-ff:1.i, 40:1."); |
|
458 ucm_addState(states, "0-ff:1.i"); |
|
459 break; |
|
460 case UCNV_DBCS: |
|
461 /* DBCS: use MBCS data structure with a default state table */ |
|
462 if(states->minCharLength!=2 || states->maxCharLength!=2) { |
|
463 fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n"); |
|
464 exit(U_INVALID_TABLE_FORMAT); |
|
465 } |
|
466 states->conversionType = UCNV_MBCS; |
|
467 ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3"); |
|
468 ucm_addState(states, "41-fe"); |
|
469 ucm_addState(states, "40"); |
|
470 ucm_addState(states, ""); |
|
471 break; |
|
472 default: |
|
473 fprintf(stderr, "ucm error: unknown charset structure\n"); |
|
474 exit(U_INVALID_TABLE_FORMAT); |
|
475 break; |
|
476 } |
|
477 } |
|
478 |
|
479 /* |
|
480 * check that the min/max character lengths are reasonable; |
|
481 * to do this right, all paths through the state table would have to be |
|
482 * recursively walked while keeping track of the sequence lengths, |
|
483 * but these simple checks cover most state tables in practice |
|
484 */ |
|
485 if(states->maxCharLength<states->minCharLength) { |
|
486 fprintf(stderr, "ucm error: max B/char < min B/char\n"); |
|
487 exit(U_INVALID_TABLE_FORMAT); |
|
488 } |
|
489 |
|
490 /* count non-direct states and compare with max B/char */ |
|
491 count=0; |
|
492 for(state=0; state<states->countStates; ++state) { |
|
493 if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) { |
|
494 ++count; |
|
495 } |
|
496 } |
|
497 if(states->maxCharLength>count+1) { |
|
498 fprintf(stderr, "ucm error: max B/char too large\n"); |
|
499 exit(U_INVALID_TABLE_FORMAT); |
|
500 } |
|
501 |
|
502 if(states->minCharLength==1) { |
|
503 int32_t action; |
|
504 |
|
505 /* |
|
506 * if there are single-byte characters, |
|
507 * then the initial state must have direct result states |
|
508 */ |
|
509 for(cell=0; cell<256; ++cell) { |
|
510 entry=states->stateTable[0][cell]; |
|
511 if( MBCS_ENTRY_IS_FINAL(entry) && |
|
512 ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 || |
|
513 action==MBCS_STATE_UNASSIGNED) |
|
514 ) { |
|
515 break; |
|
516 } |
|
517 } |
|
518 |
|
519 if(cell==256) { |
|
520 fprintf(stderr, "ucm warning: min B/char too small\n"); |
|
521 } |
|
522 } |
|
523 |
|
524 /* |
|
525 * make sure that all "next state" values are within limits |
|
526 * and that all next states after final ones have the "direct" |
|
527 * flag of initial states |
|
528 */ |
|
529 for(state=states->countStates-1; state>=0; --state) { |
|
530 for(cell=0; cell<256; ++cell) { |
|
531 entry=states->stateTable[state][cell]; |
|
532 if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) { |
|
533 fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n", |
|
534 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); |
|
535 exit(U_INVALID_TABLE_FORMAT); |
|
536 } |
|
537 if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) { |
|
538 fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n", |
|
539 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); |
|
540 exit(U_INVALID_TABLE_FORMAT); |
|
541 } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) { |
|
542 fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n", |
|
543 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); |
|
544 exit(U_INVALID_TABLE_FORMAT); |
|
545 } |
|
546 } |
|
547 } |
|
548 |
|
549 /* is this an SI/SO (like EBCDIC-stateful) state table? */ |
|
550 if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) { |
|
551 if(states->maxCharLength!=2) { |
|
552 fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength); |
|
553 exit(U_INVALID_TABLE_FORMAT); |
|
554 } |
|
555 if(states->countStates<3) { |
|
556 fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates); |
|
557 exit(U_INVALID_TABLE_FORMAT); |
|
558 } |
|
559 /* are the SI/SO all in the right places? */ |
|
560 if( ignoreSISOCheck || |
|
561 (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && |
|
562 states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && |
|
563 states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && |
|
564 states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)) |
|
565 ) { |
|
566 states->outputType=MBCS_OUTPUT_2_SISO; |
|
567 } else { |
|
568 fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n"); |
|
569 exit(U_INVALID_TABLE_FORMAT); |
|
570 } |
|
571 state=2; |
|
572 } else { |
|
573 state=1; |
|
574 } |
|
575 |
|
576 /* check that no unexpected state is a "direct" one */ |
|
577 while(state<states->countStates) { |
|
578 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { |
|
579 fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state); |
|
580 exit(U_INVALID_TABLE_FORMAT); |
|
581 } |
|
582 ++state; |
|
583 } |
|
584 |
|
585 sumUpStates(states); |
|
586 } |
|
587 |
|
588 /* find a fallback for this offset; return the index or -1 if not found */ |
|
589 U_CAPI int32_t U_EXPORT2 |
|
590 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, |
|
591 uint32_t offset) { |
|
592 int32_t i; |
|
593 |
|
594 if(countToUFallbacks==0) { |
|
595 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */ |
|
596 return -1; |
|
597 } |
|
598 |
|
599 /* do a linear search for the fallback mapping (the table is not yet sorted) */ |
|
600 for(i=0; i<countToUFallbacks; ++i) { |
|
601 if(offset==toUFallbacks[i].offset) { |
|
602 return i; |
|
603 } |
|
604 } |
|
605 return -1; |
|
606 } |
|
607 |
|
608 /* |
|
609 * This function tries to compact toUnicode tables for 2-byte codepages |
|
610 * by finding lead bytes with all-unassigned trail bytes and adding another state |
|
611 * for them. |
|
612 */ |
|
613 static void |
|
614 compactToUnicode2(UCMStates *states, |
|
615 uint16_t **pUnicodeCodeUnits, |
|
616 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, |
|
617 UBool verbose) { |
|
618 int32_t (*oldStateTable)[256]; |
|
619 uint16_t count[256]; |
|
620 uint16_t *oldUnicodeCodeUnits; |
|
621 int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum; |
|
622 int32_t i, j, leadState, trailState, newState, fallback; |
|
623 uint16_t unit; |
|
624 |
|
625 /* find the lead state */ |
|
626 if(states->outputType==MBCS_OUTPUT_2_SISO) { |
|
627 /* use the DBCS lead state for SI/SO codepages */ |
|
628 leadState=1; |
|
629 } else { |
|
630 leadState=0; |
|
631 } |
|
632 |
|
633 /* find the main trail state: the most used target state */ |
|
634 uprv_memset(count, 0, sizeof(count)); |
|
635 for(i=0; i<256; ++i) { |
|
636 entry=states->stateTable[leadState][i]; |
|
637 if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|
638 ++count[MBCS_ENTRY_TRANSITION_STATE(entry)]; |
|
639 } |
|
640 } |
|
641 trailState=0; |
|
642 for(i=1; i<states->countStates; ++i) { |
|
643 if(count[i]>count[trailState]) { |
|
644 trailState=i; |
|
645 } |
|
646 } |
|
647 |
|
648 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */ |
|
649 uprv_memset(count, 0, sizeof(count)); |
|
650 savings=0; |
|
651 /* for each lead byte */ |
|
652 for(i=0; i<256; ++i) { |
|
653 entry=states->stateTable[leadState][i]; |
|
654 if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) { |
|
655 /* the offset is different for each lead byte */ |
|
656 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); |
|
657 /* for each trail byte for this lead byte */ |
|
658 for(j=0; j<256; ++j) { |
|
659 entry=states->stateTable[trailState][j]; |
|
660 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { |
|
661 case MBCS_STATE_VALID_16: |
|
662 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); |
|
663 if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { |
|
664 ++count[i]; |
|
665 } else { |
|
666 j=999; /* do not count for this lead byte because there are assignments */ |
|
667 } |
|
668 break; |
|
669 case MBCS_STATE_VALID_16_PAIR: |
|
670 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); |
|
671 if((*pUnicodeCodeUnits)[entry]==0xfffe) { |
|
672 count[i]+=2; |
|
673 } else { |
|
674 j=999; /* do not count for this lead byte because there are assignments */ |
|
675 } |
|
676 break; |
|
677 default: |
|
678 break; |
|
679 } |
|
680 } |
|
681 if(j==256) { |
|
682 /* all trail bytes for this lead byte are unassigned */ |
|
683 savings+=count[i]; |
|
684 } else { |
|
685 count[i]=0; |
|
686 } |
|
687 } |
|
688 } |
|
689 /* subtract from the possible savings the cost of an additional state */ |
|
690 savings=savings*2-1024; /* count bytes, not 16-bit words */ |
|
691 if(savings<=0) { |
|
692 return; |
|
693 } |
|
694 if(verbose) { |
|
695 printf("compacting toUnicode data saves %ld bytes\n", (long)savings); |
|
696 } |
|
697 if(states->countStates>=MBCS_MAX_STATE_COUNT) { |
|
698 fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n"); |
|
699 return; |
|
700 } |
|
701 |
|
702 /* make a copy of the state table */ |
|
703 oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024); |
|
704 if(oldStateTable==NULL) { |
|
705 fprintf(stderr, "cannot compact toUnicode: out of memory\n"); |
|
706 return; |
|
707 } |
|
708 uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024); |
|
709 |
|
710 /* add the new state */ |
|
711 /* |
|
712 * this function does not catch the degenerate case where all lead bytes |
|
713 * have all-unassigned trail bytes and the lead state could be removed |
|
714 */ |
|
715 newState=states->countStates++; |
|
716 states->stateFlags[newState]=0; |
|
717 /* copy the old trail state, turning all assigned states into unassigned ones */ |
|
718 for(i=0; i<256; ++i) { |
|
719 entry=states->stateTable[trailState][i]; |
|
720 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { |
|
721 case MBCS_STATE_VALID_16: |
|
722 case MBCS_STATE_VALID_16_PAIR: |
|
723 states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); |
|
724 break; |
|
725 default: |
|
726 states->stateTable[newState][i]=entry; |
|
727 break; |
|
728 } |
|
729 } |
|
730 |
|
731 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */ |
|
732 for(i=0; i<256; ++i) { |
|
733 if(count[i]>0) { |
|
734 states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState); |
|
735 } |
|
736 } |
|
737 |
|
738 /* sum up the new state table */ |
|
739 for(i=0; i<states->countStates; ++i) { |
|
740 states->stateFlags[i]&=~MBCS_STATE_FLAG_READY; |
|
741 } |
|
742 sum=sumUpStates(states); |
|
743 |
|
744 /* allocate a new, smaller code units array */ |
|
745 oldUnicodeCodeUnits=*pUnicodeCodeUnits; |
|
746 if(sum==0) { |
|
747 *pUnicodeCodeUnits=NULL; |
|
748 if(oldUnicodeCodeUnits!=NULL) { |
|
749 uprv_free(oldUnicodeCodeUnits); |
|
750 } |
|
751 uprv_free(oldStateTable); |
|
752 return; |
|
753 } |
|
754 *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); |
|
755 if(*pUnicodeCodeUnits==NULL) { |
|
756 fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n", |
|
757 (long)sum); |
|
758 /* revert to the old state table */ |
|
759 *pUnicodeCodeUnits=oldUnicodeCodeUnits; |
|
760 --states->countStates; |
|
761 uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024); |
|
762 uprv_free(oldStateTable); |
|
763 return; |
|
764 } |
|
765 for(i=0; i<sum; ++i) { |
|
766 (*pUnicodeCodeUnits)[i]=0xfffe; |
|
767 } |
|
768 |
|
769 /* copy the code units for all assigned characters */ |
|
770 /* |
|
771 * The old state table has the same lead _and_ trail states for assigned characters! |
|
772 * The differences are in the offsets, and in the trail states for some unassigned characters. |
|
773 * For each character with an assigned state in the new table, it was assigned in the old one. |
|
774 * Only still-assigned characters are copied. |
|
775 * Note that fallback mappings need to get their offset values adjusted. |
|
776 */ |
|
777 |
|
778 /* for each initial state */ |
|
779 for(leadState=0; leadState<states->countStates; ++leadState) { |
|
780 if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) { |
|
781 /* for each lead byte from there */ |
|
782 for(i=0; i<256; ++i) { |
|
783 entry=states->stateTable[leadState][i]; |
|
784 if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|
785 trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); |
|
786 /* the new state does not have assigned states */ |
|
787 if(trailState!=newState) { |
|
788 trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry); |
|
789 oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]); |
|
790 /* for each trail byte */ |
|
791 for(j=0; j<256; ++j) { |
|
792 entry=states->stateTable[trailState][j]; |
|
793 /* copy assigned-character code units and adjust fallback offsets */ |
|
794 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { |
|
795 case MBCS_STATE_VALID_16: |
|
796 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); |
|
797 /* find the old offset according to the old state table */ |
|
798 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); |
|
799 unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; |
|
800 if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) { |
|
801 toUFallbacks[fallback].offset=0x80000000|offset; |
|
802 } |
|
803 break; |
|
804 case MBCS_STATE_VALID_16_PAIR: |
|
805 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); |
|
806 /* find the old offset according to the old state table */ |
|
807 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); |
|
808 (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++]; |
|
809 (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; |
|
810 break; |
|
811 default: |
|
812 break; |
|
813 } |
|
814 } |
|
815 } |
|
816 } |
|
817 } |
|
818 } |
|
819 } |
|
820 |
|
821 /* remove temporary flags from fallback offsets that protected them from being modified twice */ |
|
822 for(i=0; i<countToUFallbacks; ++i) { |
|
823 toUFallbacks[i].offset&=0x7fffffff; |
|
824 } |
|
825 |
|
826 /* free temporary memory */ |
|
827 uprv_free(oldUnicodeCodeUnits); |
|
828 uprv_free(oldStateTable); |
|
829 } |
|
830 |
|
831 /* |
|
832 * recursive sub-function of compactToUnicodeHelper() |
|
833 * returns: |
|
834 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved, |
|
835 * if all sequences from this state are unassigned, returns the |
|
836 * <0 there are assignments in unicodeCodeUnits[] |
|
837 * 0 no use of unicodeCodeUnits[] |
|
838 */ |
|
839 static int32_t |
|
840 findUnassigned(UCMStates *states, |
|
841 uint16_t *unicodeCodeUnits, |
|
842 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, |
|
843 int32_t state, int32_t offset, uint32_t b) { |
|
844 int32_t i, entry, savings, localSavings, belowSavings; |
|
845 UBool haveAssigned; |
|
846 |
|
847 localSavings=belowSavings=0; |
|
848 haveAssigned=FALSE; |
|
849 for(i=0; i<256; ++i) { |
|
850 entry=states->stateTable[state][i]; |
|
851 if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|
852 savings=findUnassigned(states, |
|
853 unicodeCodeUnits, |
|
854 toUFallbacks, countToUFallbacks, |
|
855 MBCS_ENTRY_TRANSITION_STATE(entry), |
|
856 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), |
|
857 (b<<8)|(uint32_t)i); |
|
858 if(savings<0) { |
|
859 haveAssigned=TRUE; |
|
860 } else if(savings>0) { |
|
861 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n", |
|
862 (unsigned long)((b<<8)|i), (long)state, (long)savings); |
|
863 belowSavings+=savings; |
|
864 } |
|
865 } else if(!haveAssigned) { |
|
866 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { |
|
867 case MBCS_STATE_VALID_16: |
|
868 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); |
|
869 if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { |
|
870 localSavings+=2; |
|
871 } else { |
|
872 haveAssigned=TRUE; |
|
873 } |
|
874 break; |
|
875 case MBCS_STATE_VALID_16_PAIR: |
|
876 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); |
|
877 if(unicodeCodeUnits[entry]==0xfffe) { |
|
878 localSavings+=4; |
|
879 } else { |
|
880 haveAssigned=TRUE; |
|
881 } |
|
882 break; |
|
883 default: |
|
884 break; |
|
885 } |
|
886 } |
|
887 } |
|
888 if(haveAssigned) { |
|
889 return -1; |
|
890 } else { |
|
891 return localSavings+belowSavings; |
|
892 } |
|
893 } |
|
894 |
|
895 /* helper function for finding compaction opportunities */ |
|
896 static void |
|
897 compactToUnicodeHelper(UCMStates *states, |
|
898 uint16_t *unicodeCodeUnits, |
|
899 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) { |
|
900 int32_t state, savings; |
|
901 |
|
902 /* for each initial state */ |
|
903 for(state=0; state<states->countStates; ++state) { |
|
904 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { |
|
905 savings=findUnassigned(states, |
|
906 unicodeCodeUnits, |
|
907 toUFallbacks, countToUFallbacks, |
|
908 state, 0, 0); |
|
909 if(savings>0) { |
|
910 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n", |
|
911 (long)state, (long)savings); |
|
912 } |
|
913 } |
|
914 } |
|
915 } |
|
916 |
|
917 static int32_t |
|
918 compareFallbacks(const void *context, const void *fb1, const void *fb2) { |
|
919 return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset; |
|
920 } |
|
921 |
|
922 U_CAPI void U_EXPORT2 |
|
923 ucm_optimizeStates(UCMStates *states, |
|
924 uint16_t **pUnicodeCodeUnits, |
|
925 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, |
|
926 UBool verbose) { |
|
927 UErrorCode errorCode; |
|
928 int32_t state, cell, entry; |
|
929 |
|
930 /* test each state table entry */ |
|
931 for(state=0; state<states->countStates; ++state) { |
|
932 for(cell=0; cell<256; ++cell) { |
|
933 entry=states->stateTable[state][cell]; |
|
934 /* |
|
935 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code |
|
936 * and the code point is "unassigned" (0xfffe), then change it to |
|
937 * the "unassigned" action code with bits 26..23 set to zero and U+fffe. |
|
938 */ |
|
939 if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { |
|
940 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED); |
|
941 } |
|
942 } |
|
943 } |
|
944 |
|
945 /* try to compact the toUnicode tables */ |
|
946 if(states->maxCharLength==2) { |
|
947 compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose); |
|
948 } else if(states->maxCharLength>2) { |
|
949 if(verbose) { |
|
950 compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks); |
|
951 } |
|
952 } |
|
953 |
|
954 /* sort toUFallbacks */ |
|
955 /* |
|
956 * It should be safe to sort them before compactToUnicode2() is called, |
|
957 * because it should not change the relative order of the offset values |
|
958 * that it adjusts, but they need to be sorted at some point, and |
|
959 * it is safest here. |
|
960 */ |
|
961 if(countToUFallbacks>0) { |
|
962 errorCode=U_ZERO_ERROR; /* nothing bad will happen... */ |
|
963 uprv_sortArray(toUFallbacks, countToUFallbacks, |
|
964 sizeof(_MBCSToUFallback), |
|
965 compareFallbacks, NULL, FALSE, &errorCode); |
|
966 } |
|
967 } |
|
968 |
|
969 /* use a complete state table ----------------------------------------------- */ |
|
970 |
|
971 U_CAPI int32_t U_EXPORT2 |
|
972 ucm_countChars(UCMStates *states, |
|
973 const uint8_t *bytes, int32_t length) { |
|
974 uint32_t offset; |
|
975 int32_t i, entry, count; |
|
976 uint8_t state; |
|
977 |
|
978 offset=0; |
|
979 count=0; |
|
980 state=0; |
|
981 |
|
982 if(states->countStates==0) { |
|
983 fprintf(stderr, "ucm error: there is no state information!\n"); |
|
984 return -1; |
|
985 } |
|
986 |
|
987 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ |
|
988 if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) { |
|
989 state=1; |
|
990 } |
|
991 |
|
992 /* |
|
993 * Walk down the state table like in conversion, |
|
994 * much like getNextUChar(). |
|
995 * We assume that c<=0x10ffff. |
|
996 */ |
|
997 for(i=0; i<length; ++i) { |
|
998 entry=states->stateTable[state][bytes[i]]; |
|
999 if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|
1000 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); |
|
1001 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); |
|
1002 } else { |
|
1003 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { |
|
1004 case MBCS_STATE_ILLEGAL: |
|
1005 fprintf(stderr, "ucm error: byte sequence ends in illegal state\n"); |
|
1006 return -1; |
|
1007 case MBCS_STATE_CHANGE_ONLY: |
|
1008 fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n"); |
|
1009 return -1; |
|
1010 case MBCS_STATE_UNASSIGNED: |
|
1011 case MBCS_STATE_FALLBACK_DIRECT_16: |
|
1012 case MBCS_STATE_VALID_DIRECT_16: |
|
1013 case MBCS_STATE_FALLBACK_DIRECT_20: |
|
1014 case MBCS_STATE_VALID_DIRECT_20: |
|
1015 case MBCS_STATE_VALID_16: |
|
1016 case MBCS_STATE_VALID_16_PAIR: |
|
1017 /* count a complete character and prepare for a new one */ |
|
1018 ++count; |
|
1019 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); |
|
1020 offset=0; |
|
1021 break; |
|
1022 default: |
|
1023 /* reserved, must never occur */ |
|
1024 fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry); |
|
1025 return -1; |
|
1026 } |
|
1027 } |
|
1028 } |
|
1029 |
|
1030 if(offset!=0) { |
|
1031 fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state); |
|
1032 return -1; |
|
1033 } |
|
1034 |
|
1035 /* |
|
1036 * for SI/SO (like EBCDIC-stateful), multiple-character results |
|
1037 * must consist of only double-byte sequences |
|
1038 */ |
|
1039 if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) { |
|
1040 fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count); |
|
1041 return -1; |
|
1042 } |
|
1043 |
|
1044 return count; |
|
1045 } |
|
1046 #endif |
|
1047 |