intl/lwbrk/src/rulebrk.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 4 #define TH_UNICODE
michael@0 5
michael@0 6 #include <stdlib.h>
michael@0 7 #include <assert.h>
michael@0 8 #include "th_char.h"
/*
 * ASCII-only character tests used by the segment scanner.
 * Both macros evaluate their argument more than once, so pass only
 * side-effect-free expressions.
 */

/* Non-zero when c is an ASCII letter: 'a'..'z' or 'A'..'Z'. */
#define th_isalpha(c) \
  (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/* Non-zero when c is a blank (' ') or a horizontal tab ('\t'). */
#define th_isspace(c) \
  ((c) == ' ' || (c) == '\t')
michael@0 11
michael@0 12
/*
 * Thai character type array.
 *
 * twb_t is a bit mask built from the class flags below.  _TwbType is defined
 * at the end of this file and is looked up through twbtype(), which indexes
 * it with th_zcode(c).
 */
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100-0xa0];

/*
 * Single-bit class flags.
 * NOTE(review): the V* / C* / M* prefixes presumably stand for vowel,
 * consonant and mark classes -- confirm against the table before relying
 * on that reading.
 */
#define VRS        0x0001
#define VRE        0x0002
#define VRX        0x0004

#define VRA        0x0008

#define VLA        0x0010
#define VLO        0x0020
#define VLI        0x0040

#define VC         0x0080

#define CC         0x0100
#define CS         0x0200

#define C2         0x0400
#define CHB        0x0800
#define CHE        0x1000

#define MT         0x2000
#define M          0x4000

#define T          0x8000

/*
 * Composite masks.
 * TrbWordBreakPos reports no break when the character at the candidate
 * position matches NB, or when the character before it matches NE.
 * A is the set of classes the rules operate on; characters whose type has
 * no bit of A are handled as context boundaries.
 */
#define VL         (VLA|VLO|VLI)
#define VR         (VRS|VRE|VRX)
#define NE         (VL|VRS)
#define NB         (VR|M)
#define V          (VL|VR)
#define CX         (CC|CS)
#define C          (CX|VC)
#define A          (C|V|M)

/* Class mask of character c (c is evaluated once, via th_zcode). */
#define twbtype(c) (_TwbType[th_zcode(c)])

#ifndef TRUE
#define TRUE       1
#define FALSE      0
#endif
#define RETURN(b)  return (b)
michael@0 68
michael@0 69
michael@0 70 /*
michael@0 71 /////////////////////////////////////////////////
michael@0 72 */
michael@0 73
michael@0 74 int TrbWordBreakPos(const th_char *pstr, int left,
michael@0 75 const th_char *rstr, int right)
michael@0 76 /* const ThBreakIterator *it, const th_char **p)*/
michael@0 77 {
michael@0 78 /*
michael@0 79 //int left, right;
michael@0 80 //const th_char *s = *p;
michael@0 81 */
michael@0 82 const th_char *lstr = pstr + left;
michael@0 83 th_char _c[6];
michael@0 84 twb_t _t[6];
michael@0 85 #define c(i) (_c[(i)+3])
michael@0 86 #define t(i) (_t[(i)+3])
michael@0 87 int i, j;
michael@0 88
michael@0 89 /*
michael@0 90 //left = s - it->begin;
michael@0 91 */
michael@0 92 if(left < 0) return -1;
michael@0 93 /*
michael@0 94 //right = (it->end == NULL) ? 4 : it->begin - s;
michael@0 95 */
michael@0 96 if(right < 1) return -1;
michael@0 97
michael@0 98 /*
michael@0 99 // get c(0), t(0)
michael@0 100 */
michael@0 101 c(0) = rstr[0]; /* may be '\0' */
michael@0 102 if(!th_isthai(c(0))) return -1;
michael@0 103 t(0) = twbtype(c(0));
michael@0 104 if(!(t(0) & A)) return -1;
michael@0 105
michael@0 106 /*
michael@0 107 // get c(-1), t(-1)
michael@0 108 */
michael@0 109 if(left >= 1) {
michael@0 110 c(-1) = lstr[-1];
michael@0 111 if(!th_isthai(c(-1))) return 0;
michael@0 112 t(-1) = twbtype(c(-1));
michael@0 113 if(!(t(-1) & A)) return 0; /* handle punctuation marks here */
michael@0 114 } else { c(-1) = 0; t(-1) = 0; }
michael@0 115
michael@0 116 /*
michael@0 117 // get c(1..2), t(1..2)
michael@0 118 */
michael@0 119 for(i = 1; i <= 2; i++) {
michael@0 120 if(i >= right) { c(i) = 0; t(i) = 0; }
michael@0 121 else {
michael@0 122 c(i) = rstr[i]; /* may be '\0'; */
michael@0 123 if(!th_isthai(c(i))) right = i--;
michael@0 124 else {
michael@0 125 t(i) = twbtype(c(i));
michael@0 126 if(!(t(i) & A)) right = i--;
michael@0 127 }
michael@0 128 }
michael@0 129 }
michael@0 130 /*
michael@0 131 // get c(-2..-3), t(-2..-3)
michael@0 132 */
michael@0 133 for(i = -2, j = -2; i >= -3 ; j--) {
michael@0 134 if(j < -left) { c(i) = 0; t(i) = 0; i--; }
michael@0 135 else {
michael@0 136 c(i) = lstr[j];
michael@0 137 if(!th_isthai(c(i))) left = 0;
michael@0 138 else {
michael@0 139 t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
michael@0 140 if(!(t(i) & A)) left = 0;
michael@0 141 else {
michael@0 142 if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
michael@0 143 c(i+1) = c(i); t(i+1) = t(i);
michael@0 144 } else i--;
michael@0 145 }
michael@0 146 }
michael@0 147 }
michael@0 148 }
michael@0 149
michael@0 150 /*
michael@0 151 // prohibit the unlikely
michael@0 152 */
michael@0 153 if((t(-1) & C) && (t(0) & C)) {
michael@0 154 if((t(-1) & CHE) || (t(0) & CHB)) return -1;
michael@0 155 }
michael@0 156 /*
michael@0 157 // special case : vlao, C/ sara_a|aa, !sara_a
michael@0 158 */
michael@0 159 if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
michael@0 160 (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;
michael@0 161
michael@0 162 /*
michael@0 163 // prohibit break
michael@0 164 */
michael@0 165 if(t(0) & NB) return -1;
michael@0 166 if(t(-1) & NE) return -1;
michael@0 167
michael@0 168
michael@0 169 /*
michael@0 170 // apply 100% rules
michael@0 171 */
michael@0 172 if(t(-1) & VRE) {
michael@0 173 if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
michael@0 174 return -1; /* usually too short syllable, part of word */
michael@0 175 }
michael@0 176
michael@0 177 if(t(-2) & VRE) return -1;
michael@0 178
michael@0 179 if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
michael@0 180 if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
michael@0 181 if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
michael@0 182 if(t(-2) & VRS) return 0; /* VRS, C / C, NB */
michael@0 183 if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
michael@0 184 if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
michael@0 185 if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
michael@0 186 }
michael@0 187 }
michael@0 188 if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
michael@0 189 if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */
michael@0 190
michael@0 191
michael@0 192 if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
michael@0 193 if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
michael@0 194 if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
michael@0 195 }
michael@0 196 /*
michael@0 197 // apply 90% rules
michael@0 198 */
michael@0 199 if(t(0) & VL) return 0;
michael@0 200 if(t(1) & VL) return -1;
michael@0 201 if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;
michael@0 202
michael@0 203 /*
michael@0 204 //return -1;
michael@0 205 // apply 80% rules
michael@0 206 */
michael@0 207 if(t(0) & CHE) {
michael@0 208 if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
michael@0 209 /*if(t(-1) & VRX) return 0; // VRX/ CHE */
michael@0 210 if(t(-1) & VC) return 0; /* VC/ CHE */
michael@0 211 }
michael@0 212 if(t(-1) & CHB) {
michael@0 213 if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
michael@0 214 if(t(0) & VC) return 0; /* CHB/ VC */
michael@0 215 }
michael@0 216
michael@0 217 if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
michael@0 218 if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/
michael@0 219 else { /* vlao, C ? C , VR */
michael@0 220 if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
michael@0 221 if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
michael@0 222 if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
michael@0 223 }
michael@0 224 }
michael@0 225 /* C,MT,C */
michael@0 226 if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
michael@0 227
michael@0 228 return -1;
michael@0 229 }
michael@0 230
michael@0 231
michael@0 232 int TrbFollowing(const th_char *begin, int length, int offset)
michael@0 233 /*
michael@0 234 //(ThBreakIterator *this, int offset)
michael@0 235 */
michael@0 236 {
michael@0 237 const th_char *w = begin + offset;
michael@0 238 const th_char *end = begin + length;
michael@0 239 while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
michael@0 240
michael@0 241 if(w < end && *w && !th_isthai(*w)) {
michael@0 242 int english = FALSE;
michael@0 243 while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
michael@0 244 if(th_isalpha(*w)) english = TRUE;
michael@0 245 w++;
michael@0 246 }
michael@0 247 if(english || w == end ||
michael@0 248 (!th_isthai(*w) && th_isspace(*w))) return w - begin;
michael@0 249 }
michael@0 250 if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
michael@0 251 w++;
michael@0 252 if(w < end && *w && th_isthai(*w)) {
michael@0 253 int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
michael@0 254 while (brk < 0) {
michael@0 255 w++;
michael@0 256 if(w == end || *w == 0 || !th_isthai(*w)) break;
michael@0 257 brk = TrbWordBreakPos(begin, w-begin, w, end-w);
michael@0 258 }
michael@0 259 if (brk > 0) w += brk;
michael@0 260 }
michael@0 261 if(w < end && *w && !th_isthai(*w)) {
michael@0 262 while(w < end && *w && !th_isthai(*w) &&
michael@0 263 !th_isalpha(*w) && !th_isspace(*w)) w++;
michael@0 264 }
michael@0 265 return w - begin;
michael@0 266 }
michael@0 267
michael@0 268
michael@0 269 /*
michael@0 270 /////////////////////////////////////////////////
michael@0 271 */
michael@0 272 const twb_t _TwbType[0x100-0xa0] = {
michael@0 273 #if 0
michael@0 274 /* 80 */ T,
michael@0 275 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
michael@0 276 /* 90 */ T,
michael@0 277 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
michael@0 278 #endif
michael@0 279 /* a0 */ 0,
michael@0 280 /* a1 */ CS,
michael@0 281 /* a2 */ CS | CHE,
michael@0 282 /* a3 */ CC | CHE,
michael@0 283 /* a4 */ CS | CHE,
michael@0 284 /* a5 */ CC | CHE,
michael@0 285 /* a6 */ CS,
michael@0 286 /* a7 */ CS | CHB,
michael@0 287 /* a8 */ CS,
michael@0 288 /* a9 */ CC | CHE,
michael@0 289 /* aa */ CS,
michael@0 290 /* ab */ CC | CHE,
michael@0 291 /* ac */ CC | CHB | CHE,
michael@0 292 /* ad */ CS | CHB,
michael@0 293 /* ae */ CS | CHB,
michael@0 294 /* af */ CS | CHB,
michael@0 295 /* b0 */ CS,
michael@0 296 /* b1 */ CS | CHB | CHE,
michael@0 297 /* b2 */ CS | CHB | CHE,
michael@0 298 /* b3 */ CS | CHB,
michael@0 299 /* b4 */ CS,
michael@0 300 /* b5 */ CS,
michael@0 301 /* b6 */ CS,
michael@0 302 /* b7 */ CS,
michael@0 303 /* b8 */ CS,
michael@0 304 /* b9 */ CS,
michael@0 305 /* ba */ CS,
michael@0 306 /* bb */ CS,
michael@0 307 /* bc */ CC | CHE,
michael@0 308 /* bd */ CC | CHE,
michael@0 309 /* be */ CS,
michael@0 310 /* bf */ CS,
michael@0 311 /* c0 */ CS | CHE,
michael@0 312 /* c1 */ CS,
michael@0 313 /* c2 */ CS,
michael@0 314 /* c3 */ CS | C2 | CHE, /* ? add CHE */
michael@0 315 /* c4 */ VC | CHE,
michael@0 316 /* c5 */ CS | C2,
michael@0 317 /* c6 */ VC | CHE,
michael@0 318 /* c7 */ VC | C2,
michael@0 319 /* c8 */ CS,
michael@0 320 /* c9 */ CS | CHB,
michael@0 321 /* ca */ CS | CHE,
michael@0 322 /* cb */ CC | CHE,
michael@0 323 /* CC */ CS | CHB | CHE,
michael@0 324 /* cd */ VC,
michael@0 325 /* ce */ CC | CHE,
michael@0 326 /* cf */ T,
michael@0 327 /* d0 */ VRE | VRA,
michael@0 328 /* d1 */ VRS,
michael@0 329 /* d2 */ VRX | VRA,
michael@0 330 /* d3 */ VRE,
michael@0 331 /* d4 */ VRX | VRA,
michael@0 332 /* d5 */ VRX | VRA,
michael@0 333 /* d6 */ VRS,
michael@0 334 /* d7 */ VRS | VRA,
michael@0 335 /* d8 */ VRX,
michael@0 336 /* d9 */ VRX,
michael@0 337 /* da */ T,
michael@0 338 /* db */ 0,
michael@0 339 /* dc */ 0,
michael@0 340 /* dd */ 0,
michael@0 341 /* de */ 0,
michael@0 342 /* df */ T,
michael@0 343 /* e0 */ VLA,
michael@0 344 /* e1 */ VLO,
michael@0 345 /* e2 */ VLO,
michael@0 346 /* e3 */ VLI,
michael@0 347 /* e4 */ VLI,
michael@0 348 /* e5 */ VRE,
michael@0 349 /* e6 */ M,
michael@0 350 /* e7 */ M,
michael@0 351 /* e8 */ M | MT,
michael@0 352 /* e9 */ M | MT,
michael@0 353 /* ea */ M | MT,
michael@0 354 /* eb */ M | MT,
michael@0 355 /* ec */ M,
michael@0 356 /* ed */ T,
michael@0 357 /* ee */ T,
michael@0 358 /* ef */ T,
michael@0 359 /* f0 */ T,
michael@0 360 /* f1 */ T,
michael@0 361 /* f2 */ T,
michael@0 362 /* f3 */ T,
michael@0 363 /* f4 */ T,
michael@0 364 /* f5 */ T,
michael@0 365 /* f6 */ T,
michael@0 366 /* f7 */ T,
michael@0 367 /* f8 */ T,
michael@0 368 /* f9 */ T,
michael@0 369 /* fa */ T,
michael@0 370 /* fb */ T,
michael@0 371 /* fc */ 0,
michael@0 372 /* fd */ 0,
michael@0 373 /* fe */ 0,
michael@0 374 /* ff */ 0
michael@0 375 };

mercurial