|
/*
 **********************************************************************
 *   Copyright (C) 2005-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
|
7 |
|
8 #ifndef __CSRSBCS_H |
|
9 #define __CSRSBCS_H |
|
10 |
|
11 #include "unicode/uobject.h" |
|
12 |
|
13 #if !UCONFIG_NO_CONVERSION |
|
14 |
|
15 #include "csrecog.h" |
|
16 |
|
17 U_NAMESPACE_BEGIN |
|
18 |
|
19 class NGramParser : public UMemory |
|
20 { |
|
21 private: |
|
22 int32_t ngram; |
|
23 const int32_t *ngramList; |
|
24 |
|
25 int32_t ngramCount; |
|
26 int32_t hitCount; |
|
27 |
|
28 protected: |
|
29 int32_t byteIndex; |
|
30 const uint8_t *charMap; |
|
31 |
|
32 void addByte(int32_t b); |
|
33 |
|
34 public: |
|
35 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); |
|
36 |
|
37 private: |
|
38 /* |
|
39 * Binary search for value in table, which must have exactly 64 entries. |
|
40 */ |
|
41 int32_t search(const int32_t *table, int32_t value); |
|
42 |
|
43 void lookup(int32_t thisNgram); |
|
44 |
|
45 virtual int32_t nextByte(InputText *det); |
|
46 virtual void parseCharacters(InputText *det); |
|
47 |
|
48 public: |
|
49 int32_t parse(InputText *det); |
|
50 |
|
51 }; |
|
52 |
|
53 class NGramParser_IBM420 : public NGramParser |
|
54 { |
|
55 private: |
|
56 int32_t alef; |
|
57 int32_t isLamAlef(int32_t b); |
|
58 int32_t nextByte(InputText *det); |
|
59 void parseCharacters(InputText *det); |
|
60 |
|
61 public: |
|
62 NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); |
|
63 }; |
|
64 |
|
65 |
|
66 class CharsetRecog_sbcs : public CharsetRecognizer |
|
67 { |
|
68 public: |
|
69 CharsetRecog_sbcs(); |
|
70 virtual ~CharsetRecog_sbcs(); |
|
71 virtual const char *getName() const = 0; |
|
72 virtual UBool match(InputText *det, CharsetMatch *results) const = 0; |
|
73 virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; |
|
74 }; |
|
75 |
|
76 class CharsetRecog_8859_1 : public CharsetRecog_sbcs |
|
77 { |
|
78 public: |
|
79 virtual ~CharsetRecog_8859_1(); |
|
80 const char *getName() const; |
|
81 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
82 }; |
|
83 |
|
84 class CharsetRecog_8859_2 : public CharsetRecog_sbcs |
|
85 { |
|
86 public: |
|
87 virtual ~CharsetRecog_8859_2(); |
|
88 const char *getName() const; |
|
89 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
90 }; |
|
91 |
|
92 class CharsetRecog_8859_5 : public CharsetRecog_sbcs |
|
93 { |
|
94 public: |
|
95 virtual ~CharsetRecog_8859_5(); |
|
96 const char *getName() const; |
|
97 }; |
|
98 |
|
99 class CharsetRecog_8859_6 : public CharsetRecog_sbcs |
|
100 { |
|
101 public: |
|
102 virtual ~CharsetRecog_8859_6(); |
|
103 |
|
104 const char *getName() const; |
|
105 }; |
|
106 |
|
107 class CharsetRecog_8859_7 : public CharsetRecog_sbcs |
|
108 { |
|
109 public: |
|
110 virtual ~CharsetRecog_8859_7(); |
|
111 |
|
112 const char *getName() const; |
|
113 }; |
|
114 |
|
115 class CharsetRecog_8859_8 : public CharsetRecog_sbcs |
|
116 { |
|
117 public: |
|
118 virtual ~CharsetRecog_8859_8(); |
|
119 |
|
120 virtual const char *getName() const; |
|
121 }; |
|
122 |
|
123 class CharsetRecog_8859_9 : public CharsetRecog_sbcs |
|
124 { |
|
125 public: |
|
126 virtual ~CharsetRecog_8859_9(); |
|
127 |
|
128 const char *getName() const; |
|
129 }; |
|
130 |
|
131 |
|
132 |
|
133 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 |
|
134 { |
|
135 public: |
|
136 virtual ~CharsetRecog_8859_5_ru(); |
|
137 |
|
138 const char *getLanguage() const; |
|
139 |
|
140 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
141 }; |
|
142 |
|
143 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 |
|
144 { |
|
145 public: |
|
146 virtual ~CharsetRecog_8859_6_ar(); |
|
147 |
|
148 const char *getLanguage() const; |
|
149 |
|
150 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
151 }; |
|
152 |
|
153 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 |
|
154 { |
|
155 public: |
|
156 virtual ~CharsetRecog_8859_7_el(); |
|
157 |
|
158 const char *getLanguage() const; |
|
159 |
|
160 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
161 }; |
|
162 |
|
163 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 |
|
164 { |
|
165 public: |
|
166 virtual ~CharsetRecog_8859_8_I_he(); |
|
167 |
|
168 const char *getName() const; |
|
169 |
|
170 const char *getLanguage() const; |
|
171 |
|
172 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
173 }; |
|
174 |
|
175 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 |
|
176 { |
|
177 public: |
|
178 virtual ~CharsetRecog_8859_8_he (); |
|
179 |
|
180 const char *getLanguage() const; |
|
181 |
|
182 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
183 }; |
|
184 |
|
185 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 |
|
186 { |
|
187 public: |
|
188 virtual ~CharsetRecog_8859_9_tr (); |
|
189 |
|
190 const char *getLanguage() const; |
|
191 |
|
192 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
193 }; |
|
194 |
|
195 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs |
|
196 { |
|
197 public: |
|
198 virtual ~CharsetRecog_windows_1256(); |
|
199 |
|
200 const char *getName() const; |
|
201 |
|
202 const char *getLanguage() const; |
|
203 |
|
204 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
205 }; |
|
206 |
|
207 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs |
|
208 { |
|
209 public: |
|
210 virtual ~CharsetRecog_windows_1251(); |
|
211 |
|
212 const char *getName() const; |
|
213 |
|
214 const char *getLanguage() const; |
|
215 |
|
216 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
217 }; |
|
218 |
|
219 |
|
220 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs |
|
221 { |
|
222 public: |
|
223 virtual ~CharsetRecog_KOI8_R(); |
|
224 |
|
225 const char *getName() const; |
|
226 |
|
227 const char *getLanguage() const; |
|
228 |
|
229 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
230 }; |
|
231 |
|
232 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs |
|
233 { |
|
234 public: |
|
235 virtual ~CharsetRecog_IBM424_he(); |
|
236 |
|
237 const char *getLanguage() const; |
|
238 }; |
|
239 |
|
240 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { |
|
241 public: |
|
242 virtual ~CharsetRecog_IBM424_he_rtl(); |
|
243 |
|
244 const char *getName() const; |
|
245 |
|
246 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
247 }; |
|
248 |
|
249 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { |
|
250 virtual ~CharsetRecog_IBM424_he_ltr(); |
|
251 |
|
252 const char *getName() const; |
|
253 |
|
254 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
255 }; |
|
256 |
|
257 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs |
|
258 { |
|
259 public: |
|
260 virtual ~CharsetRecog_IBM420_ar(); |
|
261 |
|
262 const char *getLanguage() const; |
|
263 int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; |
|
264 |
|
265 }; |
|
266 |
|
267 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { |
|
268 public: |
|
269 virtual ~CharsetRecog_IBM420_ar_rtl(); |
|
270 |
|
271 const char *getName() const; |
|
272 |
|
273 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
274 }; |
|
275 |
|
276 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { |
|
277 virtual ~CharsetRecog_IBM420_ar_ltr(); |
|
278 |
|
279 const char *getName() const; |
|
280 |
|
281 virtual UBool match(InputText *det, CharsetMatch *results) const; |
|
282 }; |
|
283 |
|
284 U_NAMESPACE_END |
|
285 |
|
286 #endif /* !UCONFIG_NO_CONVERSION */ |
|
287 #endif /* __CSRSBCS_H */ |