|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2005-2012, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 */ |
|
7 |
|
8 #ifndef __CSRMBCS_H |
|
9 #define __CSRMBCS_H |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 |
|
13 #if !UCONFIG_NO_CONVERSION |
|
14 |
|
15 #include "csrecog.h" |
|
16 |
|
17 U_NAMESPACE_BEGIN |
|
18 |
|
19 // "Character" iterated character class. |
|
20 // Recognizers for specific mbcs encodings make their "characters" available |
|
21 // by providing a nextChar() function that fills in an instance of IteratedChar |
|
22 // with the next char from the input. |
|
23 // The returned characters are not converted to Unicode, but remain as the raw |
|
24 // bytes (concatenated into an int) from the codepage data. |
|
25 // |
|
26 // For Asian charsets, use the raw input rather than the input that has been |
|
27 // stripped of markup. Detection only considers multi-byte chars, effectively |
|
28 // stripping markup anyway, and double byte chars do occur in markup too. |
|
29 // |
|
30 class IteratedChar : public UMemory |
|
31 { |
|
32 public: |
|
33 uint32_t charValue; // 1-4 bytes from the raw input data |
|
34 int32_t index; |
|
35 int32_t nextIndex; |
|
36 UBool error; |
|
37 UBool done; |
|
38 |
|
39 public: |
|
40 IteratedChar(); |
|
41 //void reset(); |
|
42 int32_t nextByte(InputText* det); |
|
43 }; |
|
44 |
|
45 |
|
46 class CharsetRecog_mbcs : public CharsetRecognizer { |
|
47 |
|
48 protected: |
|
49 /** |
|
50 * Test the match of this charset with the input text data |
|
51 * which is obtained via the CharsetDetector object. |
|
52 * |
|
53 * @param det The CharsetDetector, which contains the input text |
|
54 * to be checked for being in this charset. |
|
55 * @return Two values packed into one int (Damn java, anyhow) |
|
56 * <br/> |
|
57 * bits 0-7: the match confidence, ranging from 0-100 |
|
58 * <br/> |
|
59 * bits 8-15: The match reason, an enum-like value. |
|
60 */ |
|
61 int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; |
|
62 |
|
63 public: |
|
64 |
|
65 virtual ~CharsetRecog_mbcs(); |
|
66 |
|
67 /** |
|
68 * Get the IANA name of this charset. |
|
69 * @return the charset name. |
|
70 */ |
|
71 |
|
72 const char *getName() const = 0; |
|
73 const char *getLanguage() const = 0; |
|
74 UBool match(InputText* input, CharsetMatch *results) const = 0; |
|
75 |
|
76 /** |
|
77 * Get the next character (however many bytes it is) from the input data |
|
78 * Subclasses for specific charset encodings must implement this function |
|
79 * to get characters according to the rules of their encoding scheme. |
|
80 * |
|
81 * This function is not a method of class IteratedChar only because |
|
82 * that would require a lot of extra derived classes, which is awkward. |
|
83 * @param it The IteratedChar "struct" into which the returned char is placed. |
|
84 * @param det The charset detector, which is needed to get at the input byte data |
|
85 * being iterated over. |
|
86 * @return True if a character was returned, false at end of input. |
|
87 */ |
|
88 virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; |
|
89 |
|
90 }; |
|
91 |
|
92 |
|
93 /** |
|
94 * Shift-JIS charset recognizer. |
|
95 * |
|
96 */ |
|
97 class CharsetRecog_sjis : public CharsetRecog_mbcs { |
|
98 public: |
|
99 virtual ~CharsetRecog_sjis(); |
|
100 |
|
101 UBool nextChar(IteratedChar *it, InputText *det) const; |
|
102 |
|
103 UBool match(InputText* input, CharsetMatch *results) const; |
|
104 |
|
105 const char *getName() const; |
|
106 const char *getLanguage() const; |
|
107 |
|
108 }; |
|
109 |
|
110 |
|
111 /** |
|
112 * EUC charset recognizers. One abstract class that provides the common function |
|
113 * for getting the next character according to the EUC encoding scheme, |
|
114 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. |
|
115 * |
|
116 */ |
|
117 class CharsetRecog_euc : public CharsetRecog_mbcs |
|
118 { |
|
119 public: |
|
120 virtual ~CharsetRecog_euc(); |
|
121 |
|
122 const char *getName() const = 0; |
|
123 const char *getLanguage() const = 0; |
|
124 |
|
125 UBool match(InputText* input, CharsetMatch *results) const = 0; |
|
126 /* |
|
127 * (non-Javadoc) |
|
128 * Get the next character value for EUC based encodings. |
|
129 * Character "value" is simply the raw bytes that make up the character |
|
130 * packed into an int. |
|
131 */ |
|
132 UBool nextChar(IteratedChar *it, InputText *det) const; |
|
133 }; |
|
134 |
|
135 /** |
|
136 * The charset recognize for EUC-JP. A singleton instance of this class |
|
137 * is created and kept by the public CharsetDetector class |
|
138 */ |
|
139 class CharsetRecog_euc_jp : public CharsetRecog_euc |
|
140 { |
|
141 public: |
|
142 virtual ~CharsetRecog_euc_jp(); |
|
143 |
|
144 const char *getName() const; |
|
145 const char *getLanguage() const; |
|
146 |
|
147 UBool match(InputText* input, CharsetMatch *results) const; |
|
148 }; |
|
149 |
|
150 /** |
|
151 * The charset recognize for EUC-KR. A singleton instance of this class |
|
152 * is created and kept by the public CharsetDetector class |
|
153 */ |
|
154 class CharsetRecog_euc_kr : public CharsetRecog_euc |
|
155 { |
|
156 public: |
|
157 virtual ~CharsetRecog_euc_kr(); |
|
158 |
|
159 const char *getName() const; |
|
160 const char *getLanguage() const; |
|
161 |
|
162 UBool match(InputText* input, CharsetMatch *results) const; |
|
163 }; |
|
164 |
|
165 /** |
|
166 * |
|
167 * Big5 charset recognizer. |
|
168 * |
|
169 */ |
|
170 class CharsetRecog_big5 : public CharsetRecog_mbcs |
|
171 { |
|
172 public: |
|
173 virtual ~CharsetRecog_big5(); |
|
174 |
|
175 UBool nextChar(IteratedChar* it, InputText* det) const; |
|
176 |
|
177 const char *getName() const; |
|
178 const char *getLanguage() const; |
|
179 |
|
180 UBool match(InputText* input, CharsetMatch *results) const; |
|
181 }; |
|
182 |
|
183 |
|
184 /** |
|
185 * |
|
186 * GB-18030 recognizer. Uses simplified Chinese statistics. |
|
187 * |
|
188 */ |
|
189 class CharsetRecog_gb_18030 : public CharsetRecog_mbcs |
|
190 { |
|
191 public: |
|
192 virtual ~CharsetRecog_gb_18030(); |
|
193 |
|
194 UBool nextChar(IteratedChar* it, InputText* det) const; |
|
195 |
|
196 const char *getName() const; |
|
197 const char *getLanguage() const; |
|
198 |
|
199 UBool match(InputText* input, CharsetMatch *results) const; |
|
200 }; |
|
201 |
|
202 U_NAMESPACE_END |
|
203 |
|
204 #endif |
|
205 #endif /* __CSRMBCS_H */ |