|
1 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
2 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
4 #define TH_UNICODE |
|
5 |
|
6 #include <stdlib.h> |
|
7 #include <assert.h> |
|
8 #include "th_char.h" |
|
/*
 * Character-class helpers used by the scanner in this file.
 * Both are plain macros, so the argument is evaluated more than once:
 * do not pass an expression with side effects.
 */

/* Non-zero when c is an ASCII letter ('a'..'z' or 'A'..'Z'). */
#define th_isalpha(c) \
  ((((c) >= 'a') && ((c) <= 'z')) || (((c) >= 'A') && ((c) <= 'Z')))

/* Non-zero when c is a blank: space or horizontal tab only. */
#define th_isspace(c) \
  (((c) == ' ') || ((c) == '\t'))
|
11 |
|
12 |
|
/*
 * Thai character type array.
 *
 * Each character handled by the breaker is mapped to a twb_t bit mask
 * stored in _TwbType (defined at the end of this file).  The table has
 * 0x100-0xa0 entries and is indexed through twbtype() below.
 */
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100-0xa0];

/*
 * Bit definitions: one bit per class, sixteen bits in total.
 * NOTE(review): the names presumably abbreviate vowel (V..),
 * consonant (C..) and mark (M..) classes -- confirm against the
 * original design notes; only the numeric values are relied on here.
 */
#define VRS (1 << 0)    /* 0x0001 */
#define VRE (1 << 1)    /* 0x0002 */
#define VRX (1 << 2)    /* 0x0004 */

#define VRA (1 << 3)    /* 0x0008 */

#define VLA (1 << 4)    /* 0x0010 */
#define VLO (1 << 5)    /* 0x0020 */
#define VLI (1 << 6)    /* 0x0040 */

#define VC  (1 << 7)    /* 0x0080 */

#define CC  (1 << 8)    /* 0x0100 */
#define CS  (1 << 9)    /* 0x0200 */

#define C2  (1 << 10)   /* 0x0400 */
#define CHB (1 << 11)   /* 0x0800 */
#define CHE (1 << 12)   /* 0x1000 */

#define MT  (1 << 13)   /* 0x2000 */
#define M   (1 << 14)   /* 0x4000 */

#define T   (1 << 15)   /* 0x8000 */

/*
 * Composite masks built from the single bits above.
 * NE and NB are the masks tested by the "prohibit break" rules of
 * TrbWordBreakPos: no break after a character in NE, none before a
 * character in NB.  A is the set of classes the rules operate on.
 */
#define VL (VLA|VLO|VLI)
#define VR (VRS|VRE|VRX)
#define NE (VL|VRS)
#define NB (VR|M)
#define V  (VL|VR)
#define CX (CC|CS)
#define C  (CX|VC)
#define A  (C|V|M)

/* Class mask of character c; th_zcode() supplies the table index. */
#define twbtype(c) (_TwbType[th_zcode(c)])
|
62 |
|
/*
 * Boolean constants, supplied only when TRUE is not already defined.
 * Note that FALSE is defined under the same guard, so an environment
 * that provides TRUE is expected to provide FALSE as well.
 */
#if !defined(TRUE)
#define TRUE  1
#define FALSE 0
#endif

/* Thin wrapper around the return statement. */
#define RETURN(b) return (b)
|
68 |
|
69 |
|
70 /* |
|
71 ///////////////////////////////////////////////// |
|
72 */ |
|
73 |
|
74 int TrbWordBreakPos(const th_char *pstr, int left, |
|
75 const th_char *rstr, int right) |
|
76 /* const ThBreakIterator *it, const th_char **p)*/ |
|
77 { |
|
78 /* |
|
79 //int left, right; |
|
80 //const th_char *s = *p; |
|
81 */ |
|
82 const th_char *lstr = pstr + left; |
|
83 th_char _c[6]; |
|
84 twb_t _t[6]; |
|
85 #define c(i) (_c[(i)+3]) |
|
86 #define t(i) (_t[(i)+3]) |
|
87 int i, j; |
|
88 |
|
89 /* |
|
90 //left = s - it->begin; |
|
91 */ |
|
92 if(left < 0) return -1; |
|
93 /* |
|
94 //right = (it->end == NULL) ? 4 : it->begin - s; |
|
95 */ |
|
96 if(right < 1) return -1; |
|
97 |
|
98 /* |
|
99 // get c(0), t(0) |
|
100 */ |
|
101 c(0) = rstr[0]; /* may be '\0' */ |
|
102 if(!th_isthai(c(0))) return -1; |
|
103 t(0) = twbtype(c(0)); |
|
104 if(!(t(0) & A)) return -1; |
|
105 |
|
106 /* |
|
107 // get c(-1), t(-1) |
|
108 */ |
|
109 if(left >= 1) { |
|
110 c(-1) = lstr[-1]; |
|
111 if(!th_isthai(c(-1))) return 0; |
|
112 t(-1) = twbtype(c(-1)); |
|
113 if(!(t(-1) & A)) return 0; /* handle punctuation marks here */ |
|
114 } else { c(-1) = 0; t(-1) = 0; } |
|
115 |
|
116 /* |
|
117 // get c(1..2), t(1..2) |
|
118 */ |
|
119 for(i = 1; i <= 2; i++) { |
|
120 if(i >= right) { c(i) = 0; t(i) = 0; } |
|
121 else { |
|
122 c(i) = rstr[i]; /* may be '\0'; */ |
|
123 if(!th_isthai(c(i))) right = i--; |
|
124 else { |
|
125 t(i) = twbtype(c(i)); |
|
126 if(!(t(i) & A)) right = i--; |
|
127 } |
|
128 } |
|
129 } |
|
130 /* |
|
131 // get c(-2..-3), t(-2..-3) |
|
132 */ |
|
133 for(i = -2, j = -2; i >= -3 ; j--) { |
|
134 if(j < -left) { c(i) = 0; t(i) = 0; i--; } |
|
135 else { |
|
136 c(i) = lstr[j]; |
|
137 if(!th_isthai(c(i))) left = 0; |
|
138 else { |
|
139 t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); |
|
140 if(!(t(i) & A)) left = 0; |
|
141 else { |
|
142 if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) { |
|
143 c(i+1) = c(i); t(i+1) = t(i); |
|
144 } else i--; |
|
145 } |
|
146 } |
|
147 } |
|
148 } |
|
149 |
|
150 /* |
|
151 // prohibit the unlikely |
|
152 */ |
|
153 if((t(-1) & C) && (t(0) & C)) { |
|
154 if((t(-1) & CHE) || (t(0) & CHB)) return -1; |
|
155 } |
|
156 /* |
|
157 // special case : vlao, C/ sara_a|aa, !sara_a |
|
158 */ |
|
159 if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && |
|
160 (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0; |
|
161 |
|
162 /* |
|
163 // prohibit break |
|
164 */ |
|
165 if(t(0) & NB) return -1; |
|
166 if(t(-1) & NE) return -1; |
|
167 |
|
168 |
|
169 /* |
|
170 // apply 100% rules |
|
171 */ |
|
172 if(t(-1) & VRE) { |
|
173 if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; |
|
174 return -1; /* usually too short syllable, part of word */ |
|
175 } |
|
176 |
|
177 if(t(-2) & VRE) return -1; |
|
178 |
|
179 if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ |
|
180 if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ |
|
181 if(t(-1) & (V|M)) return 0; /* !C/ C, NB */ |
|
182 if(t(-2) & VRS) return 0; /* VRS, C / C, NB */ |
|
183 if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ |
|
184 if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ |
|
185 if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ |
|
186 } |
|
187 } |
|
188 if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ |
|
189 if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */ |
|
190 |
|
191 |
|
192 if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { |
|
193 if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ |
|
194 if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ |
|
195 } |
|
196 /* |
|
197 // apply 90% rules |
|
198 */ |
|
199 if(t(0) & VL) return 0; |
|
200 if(t(1) & VL) return -1; |
|
201 if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0; |
|
202 |
|
203 /* |
|
204 //return -1; |
|
205 // apply 80% rules |
|
206 */ |
|
207 if(t(0) & CHE) { |
|
208 if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ |
|
209 /*if(t(-1) & VRX) return 0; // VRX/ CHE */ |
|
210 if(t(-1) & VC) return 0; /* VC/ CHE */ |
|
211 } |
|
212 if(t(-1) & CHB) { |
|
213 if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ |
|
214 if(t(0) & VC) return 0; /* CHB/ VC */ |
|
215 } |
|
216 |
|
217 if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ |
|
218 if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/ |
|
219 else { /* vlao, C ? C , VR */ |
|
220 if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ |
|
221 if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ |
|
222 if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ |
|
223 } |
|
224 } |
|
225 /* C,MT,C */ |
|
226 if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; |
|
227 |
|
228 return -1; |
|
229 } |
|
230 |
|
231 |
|
232 int TrbFollowing(const th_char *begin, int length, int offset) |
|
233 /* |
|
234 //(ThBreakIterator *this, int offset) |
|
235 */ |
|
236 { |
|
237 const th_char *w = begin + offset; |
|
238 const th_char *end = begin + length; |
|
239 while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; |
|
240 |
|
241 if(w < end && *w && !th_isthai(*w)) { |
|
242 int english = FALSE; |
|
243 while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { |
|
244 if(th_isalpha(*w)) english = TRUE; |
|
245 w++; |
|
246 } |
|
247 if(english || w == end || |
|
248 (!th_isthai(*w) && th_isspace(*w))) return w - begin; |
|
249 } |
|
250 if(w == end || *w == 0 || !th_isthai(*w)) return w - begin; |
|
251 w++; |
|
252 if(w < end && *w && th_isthai(*w)) { |
|
253 int brk = TrbWordBreakPos(begin, w-begin, w, end-w); |
|
254 while (brk < 0) { |
|
255 w++; |
|
256 if(w == end || *w == 0 || !th_isthai(*w)) break; |
|
257 brk = TrbWordBreakPos(begin, w-begin, w, end-w); |
|
258 } |
|
259 if (brk > 0) w += brk; |
|
260 } |
|
261 if(w < end && *w && !th_isthai(*w)) { |
|
262 while(w < end && *w && !th_isthai(*w) && |
|
263 !th_isalpha(*w) && !th_isspace(*w)) w++; |
|
264 } |
|
265 return w - begin; |
|
266 } |
|
267 |
|
268 |
|
269 /* |
|
270 ///////////////////////////////////////////////// |
|
271 */ |
|
272 const twb_t _TwbType[0x100-0xa0] = { |
|
273 #if 0 |
|
274 /* 80 */ T, |
|
275 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
276 /* 90 */ T, |
|
277 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
278 #endif |
|
279 /* a0 */ 0, |
|
280 /* a1 */ CS, |
|
281 /* a2 */ CS | CHE, |
|
282 /* a3 */ CC | CHE, |
|
283 /* a4 */ CS | CHE, |
|
284 /* a5 */ CC | CHE, |
|
285 /* a6 */ CS, |
|
286 /* a7 */ CS | CHB, |
|
287 /* a8 */ CS, |
|
288 /* a9 */ CC | CHE, |
|
289 /* aa */ CS, |
|
290 /* ab */ CC | CHE, |
|
291 /* ac */ CC | CHB | CHE, |
|
292 /* ad */ CS | CHB, |
|
293 /* ae */ CS | CHB, |
|
294 /* af */ CS | CHB, |
|
295 /* b0 */ CS, |
|
296 /* b1 */ CS | CHB | CHE, |
|
297 /* b2 */ CS | CHB | CHE, |
|
298 /* b3 */ CS | CHB, |
|
299 /* b4 */ CS, |
|
300 /* b5 */ CS, |
|
301 /* b6 */ CS, |
|
302 /* b7 */ CS, |
|
303 /* b8 */ CS, |
|
304 /* b9 */ CS, |
|
305 /* ba */ CS, |
|
306 /* bb */ CS, |
|
307 /* bc */ CC | CHE, |
|
308 /* bd */ CC | CHE, |
|
309 /* be */ CS, |
|
310 /* bf */ CS, |
|
311 /* c0 */ CS | CHE, |
|
312 /* c1 */ CS, |
|
313 /* c2 */ CS, |
|
314 /* c3 */ CS | C2 | CHE, /* ? add CHE */ |
|
315 /* c4 */ VC | CHE, |
|
316 /* c5 */ CS | C2, |
|
317 /* c6 */ VC | CHE, |
|
318 /* c7 */ VC | C2, |
|
319 /* c8 */ CS, |
|
320 /* c9 */ CS | CHB, |
|
321 /* ca */ CS | CHE, |
|
322 /* cb */ CC | CHE, |
|
323 /* CC */ CS | CHB | CHE, |
|
324 /* cd */ VC, |
|
325 /* ce */ CC | CHE, |
|
326 /* cf */ T, |
|
327 /* d0 */ VRE | VRA, |
|
328 /* d1 */ VRS, |
|
329 /* d2 */ VRX | VRA, |
|
330 /* d3 */ VRE, |
|
331 /* d4 */ VRX | VRA, |
|
332 /* d5 */ VRX | VRA, |
|
333 /* d6 */ VRS, |
|
334 /* d7 */ VRS | VRA, |
|
335 /* d8 */ VRX, |
|
336 /* d9 */ VRX, |
|
337 /* da */ T, |
|
338 /* db */ 0, |
|
339 /* dc */ 0, |
|
340 /* dd */ 0, |
|
341 /* de */ 0, |
|
342 /* df */ T, |
|
343 /* e0 */ VLA, |
|
344 /* e1 */ VLO, |
|
345 /* e2 */ VLO, |
|
346 /* e3 */ VLI, |
|
347 /* e4 */ VLI, |
|
348 /* e5 */ VRE, |
|
349 /* e6 */ M, |
|
350 /* e7 */ M, |
|
351 /* e8 */ M | MT, |
|
352 /* e9 */ M | MT, |
|
353 /* ea */ M | MT, |
|
354 /* eb */ M | MT, |
|
355 /* ec */ M, |
|
356 /* ed */ T, |
|
357 /* ee */ T, |
|
358 /* ef */ T, |
|
359 /* f0 */ T, |
|
360 /* f1 */ T, |
|
361 /* f2 */ T, |
|
362 /* f3 */ T, |
|
363 /* f4 */ T, |
|
364 /* f5 */ T, |
|
365 /* f6 */ T, |
|
366 /* f7 */ T, |
|
367 /* f8 */ T, |
|
368 /* f9 */ T, |
|
369 /* fa */ T, |
|
370 /* fb */ T, |
|
371 /* fc */ 0, |
|
372 /* fd */ 0, |
|
373 /* fe */ 0, |
|
374 /* ff */ 0 |
|
375 }; |