/*
**********************************************************************
*   Copyright (C) 1999-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
*   ucnv_cnv.h:
*   Definitions for converter implementations.
*
* Modification History:
*
*   Date        Name        Description
*   05/09/00    helena      Added implementation to handle fallback mappings.
*   06/29/2000  helena      Major rewrite of the callback APIs.
*/
|
16 |
|
17 #ifndef UCNV_CNV_H |
|
18 #define UCNV_CNV_H |
|
19 |
|
20 #include "unicode/utypes.h" |
|
21 |
|
22 #if !UCONFIG_NO_CONVERSION |
|
23 |
|
24 #include "unicode/ucnv.h" |
|
25 #include "unicode/ucnv_err.h" |
|
26 #include "unicode/uset.h" |
|
27 #include "uset_imp.h" |
|
28 |
|
29 U_CDECL_BEGIN |
|
30 |
|
/* this is used in fromUnicode DBCS tables as an "unassigned" marker */
#define missingCharMarker 0xFFFF

/*
 * There is deliberately no corresponding
 *     #define missingUCharMarker 0xfffe
 * because there are actually two values used in toUnicode tables:
 *     U+fffe "unassigned"
 *     U+ffff "illegal"
 */
|
41 |
|
/**
 * Forward declaration, see ucnv_bld.h.
 * Only pointers to UConverterSharedData are used in this header, so the
 * incomplete type is sufficient here.
 */
struct UConverterSharedData;
typedef struct UConverterSharedData UConverterSharedData;
|
45 |
|
46 /* function types for UConverterImpl ---------------------------------------- */ |
|
47 |
|
48 /* struct with arguments for UConverterLoad and ucnv_load() */ |
|
49 typedef struct { |
|
50 int32_t size; /* sizeof(UConverterLoadArgs) */ |
|
51 int32_t nestedLoads; /* count nested ucnv_load() calls */ |
|
52 UBool onlyTestIsLoadable; /* input: don't actually load */ |
|
53 UBool reserved0; /* reserved - for good alignment of the pointers */ |
|
54 int16_t reserved; /* reserved - for good alignment of the pointers */ |
|
55 uint32_t options; |
|
56 const char *pkg, *name, *locale; |
|
57 } UConverterLoadArgs; |
|
58 |
|
59 #define UCNV_LOAD_ARGS_INITIALIZER \ |
|
60 { (int32_t)sizeof(UConverterLoadArgs), 0, FALSE, FALSE, 0, 0, NULL, NULL, NULL } |
|
61 |
|
62 typedef void (*UConverterLoad) (UConverterSharedData *sharedData, |
|
63 UConverterLoadArgs *pArgs, |
|
64 const uint8_t *raw, UErrorCode *pErrorCode); |
|
65 typedef void (*UConverterUnload) (UConverterSharedData *sharedData); |
|
66 |
|
67 typedef void (*UConverterOpen) (UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *pErrorCode); |
|
68 typedef void (*UConverterClose) (UConverter *cnv); |
|
69 |
|
/*
 * Argument for UConverterReset: selects which conversion direction(s)
 * the reset applies to.
 */
typedef enum UConverterResetChoice {
    UCNV_RESET_BOTH,
    UCNV_RESET_TO_UNICODE,
    UCNV_RESET_FROM_UNICODE
} UConverterResetChoice;
|
75 |
|
76 typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice); |
|
77 |
|
78 /* |
|
79 * Converter implementation function(s) for ucnv_toUnicode(). |
|
80 * If the toUnicodeWithOffsets function pointer is NULL, |
|
81 * then the toUnicode function will be used and the offsets will be set to -1. |
|
82 * |
|
83 * Must maintain state across buffers. Use toUBytes[toULength] for partial input |
|
84 * sequences; it will be checked in ucnv.c at the end of the input stream |
|
85 * to detect truncated input. |
|
86 * Some converters may need additional detection and may then set U_TRUNCATED_CHAR_FOUND. |
|
87 * |
|
88 * The toUnicodeWithOffsets must write exactly as many offset values as target |
|
89 * units. Write offset values of -1 for when the source index corresponding to |
|
90 * the output unit is not known (e.g., the character started in an earlier buffer). |
|
91 * The pArgs->offsets pointer need not be moved forward. |
|
92 * |
|
93 * At function return, either one of the following conditions must be true: |
|
94 * - U_BUFFER_OVERFLOW_ERROR and the target is full: target==targetLimit |
|
95 * - another error code with toUBytes[toULength] set to the offending input |
|
96 * - no error, and the source is consumed: source==sourceLimit |
|
97 * |
|
98 * The ucnv.c code will handle the end of the input (reset) |
|
99 * (reset, and truncation detection) and callbacks. |
|
100 */ |
|
101 typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *); |
|
102 |
|
103 /* |
|
104 * Same rules as for UConverterToUnicode. |
|
105 * A lead surrogate is kept in fromUChar32 across buffers, and if an error |
|
106 * occurs, then the offending input code point must be put into fromUChar32 |
|
107 * as well. |
|
108 */ |
|
109 typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *); |
|
110 |
|
111 /* |
|
112 * Converter implementation function for ucnv_convertEx(), for direct conversion |
|
113 * between two charsets without pivoting through UTF-16. |
|
114 * The rules are the same as for UConverterToUnicode and UConverterFromUnicode. |
|
115 * In addition, |
|
116 * - The toUnicode side must behave and keep state exactly like the |
|
117 * UConverterToUnicode implementation for the same source charset. |
|
118 * - A U_USING_DEFAULT_WARNING can be set to request to temporarily fall back |
|
119 * to pivoting. When this function is called, the conversion framework makes |
|
120 * sure that this warning is not set on input. |
|
121 * - Continuing a partial match and flushing the toUnicode replay buffer |
|
122 * are handled by pivoting, using the toUnicode and fromUnicode functions. |
|
123 */ |
|
124 typedef void (*UConverterConvert) (UConverterFromUnicodeArgs *pFromUArgs, |
|
125 UConverterToUnicodeArgs *pToUArgs, |
|
126 UErrorCode *pErrorCode); |
|
127 |
|
128 /* |
|
129 * Converter implementation function for ucnv_getNextUChar(). |
|
130 * If the function pointer is NULL, then the toUnicode function will be used. |
|
131 * |
|
132 * Will be called at a character boundary (toULength==0). |
|
133 * May return with |
|
134 * - U_INDEX_OUTOFBOUNDS_ERROR if there was no output for the input |
|
135 * (the return value will be ignored) |
|
136 * - U_TRUNCATED_CHAR_FOUND or another error code (never U_BUFFER_OVERFLOW_ERROR!) |
|
137 * with toUBytes[toULength] set to the offending input |
|
138 * (the return value will be ignored) |
|
139 * - return UCNV_GET_NEXT_UCHAR_USE_TO_U, without moving the source pointer, |
|
140 * to indicate that the ucnv.c code shall call the toUnicode function instead |
|
141 * - return a real code point result |
|
142 * |
|
143 * Unless UCNV_GET_NEXT_UCHAR_USE_TO_U is returned, the source bytes must be consumed. |
|
144 * |
|
145 * The ucnv.c code will handle the end of the input (reset) |
|
146 * (except for truncation detection!) and callbacks. |
|
147 */ |
|
148 typedef UChar32 (*UConverterGetNextUChar) (UConverterToUnicodeArgs *, UErrorCode *); |
|
149 |
|
150 typedef void (*UConverterGetStarters)(const UConverter* converter, |
|
151 UBool starters[256], |
|
152 UErrorCode *pErrorCode); |
|
153 |
|
154 /* If this function pointer is null or if the function returns null |
|
155 * the name field in static data struct should be returned by |
|
156 * ucnv_getName() API function |
|
157 */ |
|
158 typedef const char * (*UConverterGetName) (const UConverter *cnv); |
|
159 |
|
160 /** |
|
161 * Write the codepage substitution character. |
|
162 * If this function is not set, then ucnv_cbFromUWriteSub() writes |
|
163 * the substitution character from UConverter. |
|
164 * For stateful converters, it is typically necessary to handle this |
|
165 * specificially for the converter in order to properly maintain the state. |
|
166 */ |
|
167 typedef void (*UConverterWriteSub) (UConverterFromUnicodeArgs *pArgs, int32_t offsetIndex, UErrorCode *pErrorCode); |
|
168 |
|
169 /** |
|
170 * For converter-specific safeClone processing |
|
171 * If this function is not set, then ucnv_safeClone assumes that the converter has no private data that changes |
|
172 * after the converter is done opening. |
|
173 * If this function is set, then it is called just after a memcpy() of |
|
174 * converter data to the new, empty converter, and is expected to set up |
|
175 * the initial state of the converter. It is not expected to increment the |
|
176 * reference counts of the standard data types such as the shared data. |
|
177 */ |
|
178 typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv, |
|
179 void *stackBuffer, |
|
180 int32_t *pBufferSize, |
|
181 UErrorCode *status); |
|
182 |
|
/**
 * Filters for some ucnv_getUnicodeSet() implementation code.
 * UCNV_SET_FILTER_COUNT is the last enumerator and therefore equals the
 * number of filter values that precede it.
 */
typedef enum UConverterSetFilter {
    UCNV_SET_FILTER_NONE,
    UCNV_SET_FILTER_DBCS_ONLY,
    UCNV_SET_FILTER_2022_CN,
    UCNV_SET_FILTER_SJIS,
    UCNV_SET_FILTER_GR94DBCS,
    UCNV_SET_FILTER_HZ,
    UCNV_SET_FILTER_COUNT
} UConverterSetFilter;
|
195 |
|
196 /** |
|
197 * Fills the set of Unicode code points that can be converted by an ICU converter. |
|
198 * The API function ucnv_getUnicodeSet() clears the USet before calling |
|
199 * the converter's getUnicodeSet() implementation; the converter should only |
|
200 * add the appropriate code points to allow recursive use. |
|
201 * For example, the ISO-2022-JP converter will call each subconverter's |
|
202 * getUnicodeSet() implementation to consecutively add code points to |
|
203 * the same USet, which will result in a union of the sets of all subconverters. |
|
204 * |
|
205 * For more documentation, see ucnv_getUnicodeSet() in ucnv.h. |
|
206 */ |
|
207 typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv, |
|
208 const USetAdder *sa, |
|
209 UConverterUnicodeSet which, |
|
210 UErrorCode *pErrorCode); |
|
211 |
|
212 UBool CONVERSION_U_SUCCESS (UErrorCode err); |
|
213 |
|
214 /** |
|
215 * UConverterImpl contains all the data and functions for a converter type. |
|
216 * Its function pointers work much like a C++ vtable. |
|
217 * Many converter types need to define only a subset of the functions; |
|
218 * when a function pointer is NULL, then a default action will be performed. |
|
219 * |
|
220 * Every converter type must implement toUnicode, fromUnicode, and getNextUChar, |
|
221 * otherwise the converter may crash. |
|
222 * Every converter type that has variable-length codepage sequences should |
|
223 * also implement toUnicodeWithOffsets and fromUnicodeWithOffsets for |
|
224 * correct offset handling. |
|
225 * All other functions may or may not be implemented - it depends only on |
|
226 * whether the converter type needs them. |
|
227 * |
|
228 * When open() fails, then close() will be called, if present. |
|
229 */ |
|
230 struct UConverterImpl { |
|
231 UConverterType type; |
|
232 |
|
233 UConverterLoad load; |
|
234 UConverterUnload unload; |
|
235 |
|
236 UConverterOpen open; |
|
237 UConverterClose close; |
|
238 UConverterReset reset; |
|
239 |
|
240 UConverterToUnicode toUnicode; |
|
241 UConverterToUnicode toUnicodeWithOffsets; |
|
242 UConverterFromUnicode fromUnicode; |
|
243 UConverterFromUnicode fromUnicodeWithOffsets; |
|
244 UConverterGetNextUChar getNextUChar; |
|
245 |
|
246 UConverterGetStarters getStarters; |
|
247 UConverterGetName getName; |
|
248 UConverterWriteSub writeSub; |
|
249 UConverterSafeClone safeClone; |
|
250 UConverterGetUnicodeSet getUnicodeSet; |
|
251 |
|
252 UConverterConvert toUTF8; |
|
253 UConverterConvert fromUTF8; |
|
254 }; |
|
255 |
|
256 extern const UConverterSharedData |
|
257 _MBCSData, _Latin1Data, |
|
258 _UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData, |
|
259 _ISO2022Data, |
|
260 _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6, |
|
261 _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19, |
|
262 _HZData,_ISCIIData, _SCSUData, _ASCIIData, |
|
263 _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData, _CompoundTextData; |
|
264 |
|
265 U_CDECL_END |
|
266 |
|
/**
 * Always use fallbacks from codepage to Unicode.
 * Both macros ignore their argument and expand to TRUE.
 */
#define TO_U_USE_FALLBACK(useFallback) TRUE
#define UCNV_TO_U_USE_FALLBACK(cnv) TRUE

/**
 * Use fallbacks from Unicode to codepage when cnv->useFallback or for private-use code points.
 *
 * IS_PRIVATE_USE(c) is true for c in U+E000..U+F8FF and U+F0000..U+10FFFF.
 * Subtracting the range start and casting to uint32_t makes values below the
 * range wrap around to large numbers, so one unsigned comparison checks both
 * ends of each range.
 * Note: c is evaluated twice; do not pass an expression with side effects.
 */
#define IS_PRIVATE_USE(c) ((uint32_t)((c)-0xe000)<0x1900 || (uint32_t)((c)-0xf0000)<0x20000)
#define FROM_U_USE_FALLBACK(useFallback, c) ((useFallback) || IS_PRIVATE_USE(c))
#define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c)
|
275 |
|
/**
 * Magic number for ucnv_getNextUChar(), returned by a
 * getNextUChar() implementation to indicate to use the converter's toUnicode()
 * instead of the native function.
 * The negative value is parenthesized so that the macro expands safely
 * in any expression context.
 * @internal
 */
#define UCNV_GET_NEXT_UCHAR_USE_TO_U (-9)
|
283 |
|
284 U_CFUNC void |
|
285 ucnv_getCompleteUnicodeSet(const UConverter *cnv, |
|
286 const USetAdder *sa, |
|
287 UConverterUnicodeSet which, |
|
288 UErrorCode *pErrorCode); |
|
289 |
|
290 U_CFUNC void |
|
291 ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv, |
|
292 const USetAdder *sa, |
|
293 UConverterUnicodeSet which, |
|
294 UErrorCode *pErrorCode); |
|
295 |
|
296 U_CFUNC void |
|
297 ucnv_fromUWriteBytes(UConverter *cnv, |
|
298 const char *bytes, int32_t length, |
|
299 char **target, const char *targetLimit, |
|
300 int32_t **offsets, |
|
301 int32_t sourceIndex, |
|
302 UErrorCode *pErrorCode); |
|
303 U_CFUNC void |
|
304 ucnv_toUWriteUChars(UConverter *cnv, |
|
305 const UChar *uchars, int32_t length, |
|
306 UChar **target, const UChar *targetLimit, |
|
307 int32_t **offsets, |
|
308 int32_t sourceIndex, |
|
309 UErrorCode *pErrorCode); |
|
310 |
|
311 U_CFUNC void |
|
312 ucnv_toUWriteCodePoint(UConverter *cnv, |
|
313 UChar32 c, |
|
314 UChar **target, const UChar *targetLimit, |
|
315 int32_t **offsets, |
|
316 int32_t sourceIndex, |
|
317 UErrorCode *pErrorCode); |
|
318 |
|
319 #endif |
|
320 |
|
321 #endif /* UCNV_CNV */ |