Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 2 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 3 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 4 | #define TH_UNICODE |
michael@0 | 5 | |
michael@0 | 6 | #include <stdlib.h> |
michael@0 | 7 | #include <assert.h> |
michael@0 | 8 | #include "th_char.h" |
/*
 * ASCII-only character tests used by TrbFollowing.
 * th_isalpha: true for 'a'..'z' and 'A'..'Z' only.
 * th_isspace: true for blank and horizontal tab only.
 * Both are macros that evaluate their argument more than once, so the
 * argument must be free of side effects.
 */
#define th_isalpha(ch) ((('a' <= (ch)) && ((ch) <= 'z')) || \
                        (('A' <= (ch)) && ((ch) <= 'Z')))
#define th_isspace(ch) ((' ' == (ch)) || ('\t' == (ch)))
michael@0 | 11 | |
michael@0 | 12 | |
michael@0 | 13 | /* |
michael@0 | 14 | ///////////////////////////////////////////////// |
michael@0 | 15 | // Thai character type array |
michael@0 | 16 | */ |
michael@0 | 17 | |
michael@0 | 18 | typedef unsigned short twb_t; |
michael@0 | 19 | extern const twb_t _TwbType[0x100-0xa0]; |
michael@0 | 20 | |
michael@0 | 21 | /* |
michael@0 | 22 | // bit definition |
michael@0 | 23 | */ |
michael@0 | 24 | |
michael@0 | 25 | #define VRS 0x0001 |
michael@0 | 26 | #define VRE 0x0002 |
michael@0 | 27 | #define VRX 0x0004 |
michael@0 | 28 | |
michael@0 | 29 | #define VRA 0x0008 |
michael@0 | 30 | |
michael@0 | 31 | #define VLA 0x0010 |
michael@0 | 32 | #define VLO 0x0020 |
michael@0 | 33 | #define VLI 0x0040 |
michael@0 | 34 | |
michael@0 | 35 | #define VC 0x0080 |
michael@0 | 36 | |
michael@0 | 37 | #define CC 0x0100 |
michael@0 | 38 | #define CS 0x0200 |
michael@0 | 39 | |
michael@0 | 40 | #define C2 0x0400 |
michael@0 | 41 | #define CHB 0x0800 |
michael@0 | 42 | #define CHE 0x1000 |
michael@0 | 43 | |
michael@0 | 44 | #define MT 0x2000 |
michael@0 | 45 | /* |
michael@0 | 46 | //_#define me 0x2000 |
michael@0 | 47 | */ |
michael@0 | 48 | #define M 0x4000 |
michael@0 | 49 | |
michael@0 | 50 | #define T 0x8000 |
michael@0 | 51 | |
michael@0 | 52 | #define VL (VLA|VLO|VLI) |
michael@0 | 53 | #define VR (VRS|VRE|VRX) |
michael@0 | 54 | #define NE (VL|VRS) |
michael@0 | 55 | #define NB (VR|M) |
michael@0 | 56 | #define V (VL|VR) |
michael@0 | 57 | #define CX (CC|CS) |
michael@0 | 58 | #define C (CX|VC) |
michael@0 | 59 | #define A (C|V|M) |
michael@0 | 60 | |
michael@0 | 61 | #define twbtype(c) (_TwbType[th_zcode(c)]) |
michael@0 | 62 | |
michael@0 | 63 | #ifndef TRUE |
michael@0 | 64 | #define TRUE 1 |
michael@0 | 65 | #define FALSE 0 |
michael@0 | 66 | #endif |
michael@0 | 67 | #define RETURN(b) return (b) |
michael@0 | 68 | |
michael@0 | 69 | |
michael@0 | 70 | /* |
michael@0 | 71 | ///////////////////////////////////////////////// |
michael@0 | 72 | */ |
michael@0 | 73 | |
michael@0 | 74 | int TrbWordBreakPos(const th_char *pstr, int left, |
michael@0 | 75 | const th_char *rstr, int right) |
michael@0 | 76 | /* const ThBreakIterator *it, const th_char **p)*/ |
michael@0 | 77 | { |
michael@0 | 78 | /* |
michael@0 | 79 | //int left, right; |
michael@0 | 80 | //const th_char *s = *p; |
michael@0 | 81 | */ |
michael@0 | 82 | const th_char *lstr = pstr + left; |
michael@0 | 83 | th_char _c[6]; |
michael@0 | 84 | twb_t _t[6]; |
michael@0 | 85 | #define c(i) (_c[(i)+3]) |
michael@0 | 86 | #define t(i) (_t[(i)+3]) |
michael@0 | 87 | int i, j; |
michael@0 | 88 | |
michael@0 | 89 | /* |
michael@0 | 90 | //left = s - it->begin; |
michael@0 | 91 | */ |
michael@0 | 92 | if(left < 0) return -1; |
michael@0 | 93 | /* |
michael@0 | 94 | //right = (it->end == NULL) ? 4 : it->begin - s; |
michael@0 | 95 | */ |
michael@0 | 96 | if(right < 1) return -1; |
michael@0 | 97 | |
michael@0 | 98 | /* |
michael@0 | 99 | // get c(0), t(0) |
michael@0 | 100 | */ |
michael@0 | 101 | c(0) = rstr[0]; /* may be '\0' */ |
michael@0 | 102 | if(!th_isthai(c(0))) return -1; |
michael@0 | 103 | t(0) = twbtype(c(0)); |
michael@0 | 104 | if(!(t(0) & A)) return -1; |
michael@0 | 105 | |
michael@0 | 106 | /* |
michael@0 | 107 | // get c(-1), t(-1) |
michael@0 | 108 | */ |
michael@0 | 109 | if(left >= 1) { |
michael@0 | 110 | c(-1) = lstr[-1]; |
michael@0 | 111 | if(!th_isthai(c(-1))) return 0; |
michael@0 | 112 | t(-1) = twbtype(c(-1)); |
michael@0 | 113 | if(!(t(-1) & A)) return 0; /* handle punctuation marks here */ |
michael@0 | 114 | } else { c(-1) = 0; t(-1) = 0; } |
michael@0 | 115 | |
michael@0 | 116 | /* |
michael@0 | 117 | // get c(1..2), t(1..2) |
michael@0 | 118 | */ |
michael@0 | 119 | for(i = 1; i <= 2; i++) { |
michael@0 | 120 | if(i >= right) { c(i) = 0; t(i) = 0; } |
michael@0 | 121 | else { |
michael@0 | 122 | c(i) = rstr[i]; /* may be '\0'; */ |
michael@0 | 123 | if(!th_isthai(c(i))) right = i--; |
michael@0 | 124 | else { |
michael@0 | 125 | t(i) = twbtype(c(i)); |
michael@0 | 126 | if(!(t(i) & A)) right = i--; |
michael@0 | 127 | } |
michael@0 | 128 | } |
michael@0 | 129 | } |
michael@0 | 130 | /* |
michael@0 | 131 | // get c(-2..-3), t(-2..-3) |
michael@0 | 132 | */ |
michael@0 | 133 | for(i = -2, j = -2; i >= -3 ; j--) { |
michael@0 | 134 | if(j < -left) { c(i) = 0; t(i) = 0; i--; } |
michael@0 | 135 | else { |
michael@0 | 136 | c(i) = lstr[j]; |
michael@0 | 137 | if(!th_isthai(c(i))) left = 0; |
michael@0 | 138 | else { |
michael@0 | 139 | t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); |
michael@0 | 140 | if(!(t(i) & A)) left = 0; |
michael@0 | 141 | else { |
michael@0 | 142 | if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) { |
michael@0 | 143 | c(i+1) = c(i); t(i+1) = t(i); |
michael@0 | 144 | } else i--; |
michael@0 | 145 | } |
michael@0 | 146 | } |
michael@0 | 147 | } |
michael@0 | 148 | } |
michael@0 | 149 | |
michael@0 | 150 | /* |
michael@0 | 151 | // prohibit the unlikely |
michael@0 | 152 | */ |
michael@0 | 153 | if((t(-1) & C) && (t(0) & C)) { |
michael@0 | 154 | if((t(-1) & CHE) || (t(0) & CHB)) return -1; |
michael@0 | 155 | } |
michael@0 | 156 | /* |
michael@0 | 157 | // special case : vlao, C/ sara_a|aa, !sara_a |
michael@0 | 158 | */ |
michael@0 | 159 | if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && |
michael@0 | 160 | (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0; |
michael@0 | 161 | |
michael@0 | 162 | /* |
michael@0 | 163 | // prohibit break |
michael@0 | 164 | */ |
michael@0 | 165 | if(t(0) & NB) return -1; |
michael@0 | 166 | if(t(-1) & NE) return -1; |
michael@0 | 167 | |
michael@0 | 168 | |
michael@0 | 169 | /* |
michael@0 | 170 | // apply 100% rules |
michael@0 | 171 | */ |
michael@0 | 172 | if(t(-1) & VRE) { |
michael@0 | 173 | if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; |
michael@0 | 174 | return -1; /* usually too short syllable, part of word */ |
michael@0 | 175 | } |
michael@0 | 176 | |
michael@0 | 177 | if(t(-2) & VRE) return -1; |
michael@0 | 178 | |
michael@0 | 179 | if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ |
michael@0 | 180 | if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ |
michael@0 | 181 | if(t(-1) & (V|M)) return 0; /* !C/ C, NB */ |
michael@0 | 182 | if(t(-2) & VRS) return 0; /* VRS, C / C, NB */ |
michael@0 | 183 | if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ |
michael@0 | 184 | if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ |
michael@0 | 185 | if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ |
michael@0 | 186 | } |
michael@0 | 187 | } |
michael@0 | 188 | if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ |
michael@0 | 189 | if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */ |
michael@0 | 190 | |
michael@0 | 191 | |
michael@0 | 192 | if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { |
michael@0 | 193 | if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ |
michael@0 | 194 | if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ |
michael@0 | 195 | } |
michael@0 | 196 | /* |
michael@0 | 197 | // apply 90% rules |
michael@0 | 198 | */ |
michael@0 | 199 | if(t(0) & VL) return 0; |
michael@0 | 200 | if(t(1) & VL) return -1; |
michael@0 | 201 | if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0; |
michael@0 | 202 | |
michael@0 | 203 | /* |
michael@0 | 204 | //return -1; |
michael@0 | 205 | // apply 80% rules |
michael@0 | 206 | */ |
michael@0 | 207 | if(t(0) & CHE) { |
michael@0 | 208 | if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ |
michael@0 | 209 | /*if(t(-1) & VRX) return 0; // VRX/ CHE */ |
michael@0 | 210 | if(t(-1) & VC) return 0; /* VC/ CHE */ |
michael@0 | 211 | } |
michael@0 | 212 | if(t(-1) & CHB) { |
michael@0 | 213 | if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ |
michael@0 | 214 | if(t(0) & VC) return 0; /* CHB/ VC */ |
michael@0 | 215 | } |
michael@0 | 216 | |
michael@0 | 217 | if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ |
michael@0 | 218 | if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/ |
michael@0 | 219 | else { /* vlao, C ? C , VR */ |
michael@0 | 220 | if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ |
michael@0 | 221 | if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ |
michael@0 | 222 | if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ |
michael@0 | 223 | } |
michael@0 | 224 | } |
michael@0 | 225 | /* C,MT,C */ |
michael@0 | 226 | if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; |
michael@0 | 227 | |
michael@0 | 228 | return -1; |
michael@0 | 229 | } |
michael@0 | 230 | |
michael@0 | 231 | |
michael@0 | 232 | int TrbFollowing(const th_char *begin, int length, int offset) |
michael@0 | 233 | /* |
michael@0 | 234 | //(ThBreakIterator *this, int offset) |
michael@0 | 235 | */ |
michael@0 | 236 | { |
michael@0 | 237 | const th_char *w = begin + offset; |
michael@0 | 238 | const th_char *end = begin + length; |
michael@0 | 239 | while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; |
michael@0 | 240 | |
michael@0 | 241 | if(w < end && *w && !th_isthai(*w)) { |
michael@0 | 242 | int english = FALSE; |
michael@0 | 243 | while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { |
michael@0 | 244 | if(th_isalpha(*w)) english = TRUE; |
michael@0 | 245 | w++; |
michael@0 | 246 | } |
michael@0 | 247 | if(english || w == end || |
michael@0 | 248 | (!th_isthai(*w) && th_isspace(*w))) return w - begin; |
michael@0 | 249 | } |
michael@0 | 250 | if(w == end || *w == 0 || !th_isthai(*w)) return w - begin; |
michael@0 | 251 | w++; |
michael@0 | 252 | if(w < end && *w && th_isthai(*w)) { |
michael@0 | 253 | int brk = TrbWordBreakPos(begin, w-begin, w, end-w); |
michael@0 | 254 | while (brk < 0) { |
michael@0 | 255 | w++; |
michael@0 | 256 | if(w == end || *w == 0 || !th_isthai(*w)) break; |
michael@0 | 257 | brk = TrbWordBreakPos(begin, w-begin, w, end-w); |
michael@0 | 258 | } |
michael@0 | 259 | if (brk > 0) w += brk; |
michael@0 | 260 | } |
michael@0 | 261 | if(w < end && *w && !th_isthai(*w)) { |
michael@0 | 262 | while(w < end && *w && !th_isthai(*w) && |
michael@0 | 263 | !th_isalpha(*w) && !th_isspace(*w)) w++; |
michael@0 | 264 | } |
michael@0 | 265 | return w - begin; |
michael@0 | 266 | } |
michael@0 | 267 | |
michael@0 | 268 | |
michael@0 | 269 | /* |
michael@0 | 270 | ///////////////////////////////////////////////// |
michael@0 | 271 | */ |
michael@0 | 272 | const twb_t _TwbType[0x100-0xa0] = { |
michael@0 | 273 | #if 0 |
michael@0 | 274 | /* 80 */ T, |
michael@0 | 275 | /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
michael@0 | 276 | /* 90 */ T, |
michael@0 | 277 | /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
michael@0 | 278 | #endif |
michael@0 | 279 | /* a0 */ 0, |
michael@0 | 280 | /* a1 */ CS, |
michael@0 | 281 | /* a2 */ CS | CHE, |
michael@0 | 282 | /* a3 */ CC | CHE, |
michael@0 | 283 | /* a4 */ CS | CHE, |
michael@0 | 284 | /* a5 */ CC | CHE, |
michael@0 | 285 | /* a6 */ CS, |
michael@0 | 286 | /* a7 */ CS | CHB, |
michael@0 | 287 | /* a8 */ CS, |
michael@0 | 288 | /* a9 */ CC | CHE, |
michael@0 | 289 | /* aa */ CS, |
michael@0 | 290 | /* ab */ CC | CHE, |
michael@0 | 291 | /* ac */ CC | CHB | CHE, |
michael@0 | 292 | /* ad */ CS | CHB, |
michael@0 | 293 | /* ae */ CS | CHB, |
michael@0 | 294 | /* af */ CS | CHB, |
michael@0 | 295 | /* b0 */ CS, |
michael@0 | 296 | /* b1 */ CS | CHB | CHE, |
michael@0 | 297 | /* b2 */ CS | CHB | CHE, |
michael@0 | 298 | /* b3 */ CS | CHB, |
michael@0 | 299 | /* b4 */ CS, |
michael@0 | 300 | /* b5 */ CS, |
michael@0 | 301 | /* b6 */ CS, |
michael@0 | 302 | /* b7 */ CS, |
michael@0 | 303 | /* b8 */ CS, |
michael@0 | 304 | /* b9 */ CS, |
michael@0 | 305 | /* ba */ CS, |
michael@0 | 306 | /* bb */ CS, |
michael@0 | 307 | /* bc */ CC | CHE, |
michael@0 | 308 | /* bd */ CC | CHE, |
michael@0 | 309 | /* be */ CS, |
michael@0 | 310 | /* bf */ CS, |
michael@0 | 311 | /* c0 */ CS | CHE, |
michael@0 | 312 | /* c1 */ CS, |
michael@0 | 313 | /* c2 */ CS, |
michael@0 | 314 | /* c3 */ CS | C2 | CHE, /* ? add CHE */ |
michael@0 | 315 | /* c4 */ VC | CHE, |
michael@0 | 316 | /* c5 */ CS | C2, |
michael@0 | 317 | /* c6 */ VC | CHE, |
michael@0 | 318 | /* c7 */ VC | C2, |
michael@0 | 319 | /* c8 */ CS, |
michael@0 | 320 | /* c9 */ CS | CHB, |
michael@0 | 321 | /* ca */ CS | CHE, |
michael@0 | 322 | /* cb */ CC | CHE, |
michael@0 | 323 | /* CC */ CS | CHB | CHE, |
michael@0 | 324 | /* cd */ VC, |
michael@0 | 325 | /* ce */ CC | CHE, |
michael@0 | 326 | /* cf */ T, |
michael@0 | 327 | /* d0 */ VRE | VRA, |
michael@0 | 328 | /* d1 */ VRS, |
michael@0 | 329 | /* d2 */ VRX | VRA, |
michael@0 | 330 | /* d3 */ VRE, |
michael@0 | 331 | /* d4 */ VRX | VRA, |
michael@0 | 332 | /* d5 */ VRX | VRA, |
michael@0 | 333 | /* d6 */ VRS, |
michael@0 | 334 | /* d7 */ VRS | VRA, |
michael@0 | 335 | /* d8 */ VRX, |
michael@0 | 336 | /* d9 */ VRX, |
michael@0 | 337 | /* da */ T, |
michael@0 | 338 | /* db */ 0, |
michael@0 | 339 | /* dc */ 0, |
michael@0 | 340 | /* dd */ 0, |
michael@0 | 341 | /* de */ 0, |
michael@0 | 342 | /* df */ T, |
michael@0 | 343 | /* e0 */ VLA, |
michael@0 | 344 | /* e1 */ VLO, |
michael@0 | 345 | /* e2 */ VLO, |
michael@0 | 346 | /* e3 */ VLI, |
michael@0 | 347 | /* e4 */ VLI, |
michael@0 | 348 | /* e5 */ VRE, |
michael@0 | 349 | /* e6 */ M, |
michael@0 | 350 | /* e7 */ M, |
michael@0 | 351 | /* e8 */ M | MT, |
michael@0 | 352 | /* e9 */ M | MT, |
michael@0 | 353 | /* ea */ M | MT, |
michael@0 | 354 | /* eb */ M | MT, |
michael@0 | 355 | /* ec */ M, |
michael@0 | 356 | /* ed */ T, |
michael@0 | 357 | /* ee */ T, |
michael@0 | 358 | /* ef */ T, |
michael@0 | 359 | /* f0 */ T, |
michael@0 | 360 | /* f1 */ T, |
michael@0 | 361 | /* f2 */ T, |
michael@0 | 362 | /* f3 */ T, |
michael@0 | 363 | /* f4 */ T, |
michael@0 | 364 | /* f5 */ T, |
michael@0 | 365 | /* f6 */ T, |
michael@0 | 366 | /* f7 */ T, |
michael@0 | 367 | /* f8 */ T, |
michael@0 | 368 | /* f9 */ T, |
michael@0 | 369 | /* fa */ T, |
michael@0 | 370 | /* fb */ T, |
michael@0 | 371 | /* fc */ 0, |
michael@0 | 372 | /* fd */ 0, |
michael@0 | 373 | /* fe */ 0, |
michael@0 | 374 | /* ff */ 0 |
michael@0 | 375 | }; |