1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/lwbrk/src/rulebrk.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,375 @@ 1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.7 +#define TH_UNICODE 1.8 + 1.9 +#include <stdlib.h> 1.10 +#include <assert.h> 1.11 +#include "th_char.h" 1.12 +#define th_isalpha(c) (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z')) 1.13 +#define th_isspace(c) ((c)==' '||(c)=='\t') 1.14 + 1.15 + 1.16 +/* 1.17 +///////////////////////////////////////////////// 1.18 +// Thai character type array 1.19 +*/ 1.20 + 1.21 +typedef unsigned short twb_t; 1.22 +extern const twb_t _TwbType[0x100-0xa0]; 1.23 + 1.24 +/* 1.25 +// bit definition 1.26 +*/ 1.27 + 1.28 +#define VRS 0x0001 1.29 +#define VRE 0x0002 1.30 +#define VRX 0x0004 1.31 + 1.32 +#define VRA 0x0008 1.33 + 1.34 +#define VLA 0x0010 1.35 +#define VLO 0x0020 1.36 +#define VLI 0x0040 1.37 + 1.38 +#define VC 0x0080 1.39 + 1.40 +#define CC 0x0100 1.41 +#define CS 0x0200 1.42 + 1.43 +#define C2 0x0400 1.44 +#define CHB 0x0800 1.45 +#define CHE 0x1000 1.46 + 1.47 +#define MT 0x2000 1.48 +/* 1.49 +//_#define me 0x2000 1.50 +*/ 1.51 +#define M 0x4000 1.52 + 1.53 +#define T 0x8000 1.54 + 1.55 +#define VL (VLA|VLO|VLI) 1.56 +#define VR (VRS|VRE|VRX) 1.57 +#define NE (VL|VRS) 1.58 +#define NB (VR|M) 1.59 +#define V (VL|VR) 1.60 +#define CX (CC|CS) 1.61 +#define C (CX|VC) 1.62 +#define A (C|V|M) 1.63 + 1.64 +#define twbtype(c) (_TwbType[th_zcode(c)]) 1.65 + 1.66 +#ifndef TRUE 1.67 +#define TRUE 1 1.68 +#define FALSE 0 1.69 +#endif 1.70 +#define RETURN(b) return (b) 1.71 + 1.72 + 1.73 +/* 1.74 +///////////////////////////////////////////////// 1.75 +*/ 1.76 + 1.77 +int TrbWordBreakPos(const th_char *pstr, int left, 1.78 + const th_char *rstr, int right) 1.79 +/* const ThBreakIterator *it, const th_char **p)*/ 1.80 +{ 1.81 + /* 1.82 + //int left, right; 1.83 + //const th_char *s = *p; 1.84 + */ 1.85 + const th_char *lstr = pstr + left; 1.86 + th_char _c[6]; 1.87 + twb_t _t[6]; 1.88 + #define c(i) (_c[(i)+3]) 1.89 + #define t(i) (_t[(i)+3]) 1.90 + int i, j; 1.91 + 1.92 + /* 1.93 + //left = s - it->begin; 1.94 + */ 1.95 + if(left < 0) return -1; 1.96 + /* 1.97 + //right = (it->end == NULL) ? 4 : it->begin - s; 1.98 + */ 1.99 + if(right < 1) return -1; 1.100 + 1.101 + /* 1.102 + // get c(0), t(0) 1.103 + */ 1.104 + c(0) = rstr[0]; /* may be '\0' */ 1.105 + if(!th_isthai(c(0))) return -1; 1.106 + t(0) = twbtype(c(0)); 1.107 + if(!(t(0) & A)) return -1; 1.108 + 1.109 + /* 1.110 + // get c(-1), t(-1) 1.111 + */ 1.112 + if(left >= 1) { 1.113 + c(-1) = lstr[-1]; 1.114 + if(!th_isthai(c(-1))) return 0; 1.115 + t(-1) = twbtype(c(-1)); 1.116 + if(!(t(-1) & A)) return 0; /* handle punctuation marks here */ 1.117 + } else { c(-1) = 0; t(-1) = 0; } 1.118 + 1.119 + /* 1.120 + // get c(1..2), t(1..2) 1.121 + */ 1.122 + for(i = 1; i <= 2; i++) { 1.123 + if(i >= right) { c(i) = 0; t(i) = 0; } 1.124 + else { 1.125 + c(i) = rstr[i]; /* may be '\0'; */ 1.126 + if(!th_isthai(c(i))) right = i--; 1.127 + else { 1.128 + t(i) = twbtype(c(i)); 1.129 + if(!(t(i) & A)) right = i--; 1.130 + } 1.131 + } 1.132 + } 1.133 + /* 1.134 + // get c(-2..-3), t(-2..-3) 1.135 + */ 1.136 + for(i = -2, j = -2; i >= -3 ; j--) { 1.137 + if(j < -left) { c(i) = 0; t(i) = 0; i--; } 1.138 + else { 1.139 + c(i) = lstr[j]; 1.140 + if(!th_isthai(c(i))) left = 0; 1.141 + else { 1.142 + t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); 1.143 + if(!(t(i) & A)) left = 0; 1.144 + else { 1.145 + if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) { 1.146 + c(i+1) = c(i); t(i+1) = t(i); 1.147 + } else i--; 1.148 + } 1.149 + } 1.150 + } 1.151 + } 1.152 + 1.153 + /* 1.154 + // prohibit the unlikely 1.155 + */ 1.156 + if((t(-1) & C) && (t(0) & C)) { 1.157 + if((t(-1) & CHE) || (t(0) & CHB)) return -1; 1.158 + } 1.159 + /* 1.160 + // special case : vlao, C/ sara_a|aa, !sara_a 1.161 + */ 1.162 + if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && 1.163 + (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0; 1.164 + 1.165 + /* 1.166 + // prohibit break 1.167 + */ 1.168 + if(t(0) & NB) return -1; 1.169 + if(t(-1) & NE) return -1; 1.170 + 1.171 + 1.172 + /* 1.173 + // apply 100% rules 1.174 + */ 1.175 + if(t(-1) & VRE) { 1.176 + if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; 1.177 + return -1; /* usually too short syllable, part of word */ 1.178 + } 1.179 + 1.180 + if(t(-2) & VRE) return -1; 1.181 + 1.182 + if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ 1.183 + if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ 1.184 + if(t(-1) & (V|M)) return 0; /* !C/ C, NB */ 1.185 + if(t(-2) & VRS) return 0; /* VRS, C / C, NB */ 1.186 + if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ 1.187 + if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ 1.188 + if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ 1.189 + } 1.190 + } 1.191 + if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ 1.192 + if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */ 1.193 + 1.194 + 1.195 + if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { 1.196 + if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ 1.197 + if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ 1.198 + } 1.199 + /* 1.200 + // apply 90% rules 1.201 + */ 1.202 + if(t(0) & VL) return 0; 1.203 + if(t(1) & VL) return -1; 1.204 + if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0; 1.205 + 1.206 + /* 1.207 + //return -1; 1.208 + // apply 80% rules 1.209 + */ 1.210 + if(t(0) & CHE) { 1.211 + if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ 1.212 + /*if(t(-1) & VRX) return 0; // VRX/ CHE */ 1.213 + if(t(-1) & VC) return 0; /* VC/ CHE */ 1.214 + } 1.215 + if(t(-1) & CHB) { 1.216 + if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ 1.217 + if(t(0) & VC) return 0; /* CHB/ VC */ 1.218 + } 1.219 + 1.220 + if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ 1.221 + if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/ 1.222 + else { /* vlao, C ? C , VR */ 1.223 + if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ 1.224 + if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ 1.225 + if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ 1.226 + } 1.227 + } 1.228 + /* C,MT,C */ 1.229 + if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; 1.230 + 1.231 + return -1; 1.232 +} 1.233 + 1.234 + 1.235 +int TrbFollowing(const th_char *begin, int length, int offset) 1.236 +/* 1.237 +//(ThBreakIterator *this, int offset) 1.238 +*/ 1.239 +{ 1.240 + const th_char *w = begin + offset; 1.241 + const th_char *end = begin + length; 1.242 + while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; 1.243 + 1.244 + if(w < end && *w && !th_isthai(*w)) { 1.245 + int english = FALSE; 1.246 + while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { 1.247 + if(th_isalpha(*w)) english = TRUE; 1.248 + w++; 1.249 + } 1.250 + if(english || w == end || 1.251 + (!th_isthai(*w) && th_isspace(*w))) return w - begin; 1.252 + } 1.253 + if(w == end || *w == 0 || !th_isthai(*w)) return w - begin; 1.254 + w++; 1.255 + if(w < end && *w && th_isthai(*w)) { 1.256 + int brk = TrbWordBreakPos(begin, w-begin, w, end-w); 1.257 + while (brk < 0) { 1.258 + w++; 1.259 + if(w == end || *w == 0 || !th_isthai(*w)) break; 1.260 + brk = TrbWordBreakPos(begin, w-begin, w, end-w); 1.261 + } 1.262 + if (brk > 0) w += brk; 1.263 + } 1.264 + if(w < end && *w && !th_isthai(*w)) { 1.265 + while(w < end && *w && !th_isthai(*w) && 1.266 + !th_isalpha(*w) && !th_isspace(*w)) w++; 1.267 + } 1.268 + return w - begin; 1.269 +} 1.270 + 1.271 + 1.272 +/* 1.273 +///////////////////////////////////////////////// 1.274 +*/ 1.275 +const twb_t _TwbType[0x100-0xa0] = { 1.276 +#if 0 1.277 +/* 80 */ T, 1.278 +/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.279 +/* 90 */ T, 1.280 +/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.281 +#endif 1.282 +/* a0 */ 0, 1.283 +/* a1 */ CS, 1.284 +/* a2 */ CS | CHE, 1.285 +/* a3 */ CC | CHE, 1.286 +/* a4 */ CS | CHE, 1.287 +/* a5 */ CC | CHE, 1.288 +/* a6 */ CS, 1.289 +/* a7 */ CS | CHB, 1.290 +/* a8 */ CS, 1.291 +/* a9 */ CC | CHE, 1.292 +/* aa */ CS, 1.293 +/* ab */ CC | CHE, 1.294 +/* ac */ CC | CHB | CHE, 1.295 +/* ad */ CS | CHB, 1.296 +/* ae */ CS | CHB, 1.297 +/* af */ CS | CHB, 1.298 +/* b0 */ CS, 1.299 +/* b1 */ CS | CHB | CHE, 1.300 +/* b2 */ CS | CHB | CHE, 1.301 +/* b3 */ CS | CHB, 1.302 +/* b4 */ CS, 1.303 +/* b5 */ CS, 1.304 +/* b6 */ CS, 1.305 +/* b7 */ CS, 1.306 +/* b8 */ CS, 1.307 +/* b9 */ CS, 1.308 +/* ba */ CS, 1.309 +/* bb */ CS, 1.310 +/* bc */ CC | CHE, 1.311 +/* bd */ CC | CHE, 1.312 +/* be */ CS, 1.313 +/* bf */ CS, 1.314 +/* c0 */ CS | CHE, 1.315 +/* c1 */ CS, 1.316 +/* c2 */ CS, 1.317 +/* c3 */ CS | C2 | CHE, /* ? add CHE */ 1.318 +/* c4 */ VC | CHE, 1.319 +/* c5 */ CS | C2, 1.320 +/* c6 */ VC | CHE, 1.321 +/* c7 */ VC | C2, 1.322 +/* c8 */ CS, 1.323 +/* c9 */ CS | CHB, 1.324 +/* ca */ CS | CHE, 1.325 +/* cb */ CC | CHE, 1.326 +/* CC */ CS | CHB | CHE, 1.327 +/* cd */ VC, 1.328 +/* ce */ CC | CHE, 1.329 +/* cf */ T, 1.330 +/* d0 */ VRE | VRA, 1.331 +/* d1 */ VRS, 1.332 +/* d2 */ VRX | VRA, 1.333 +/* d3 */ VRE, 1.334 +/* d4 */ VRX | VRA, 1.335 +/* d5 */ VRX | VRA, 1.336 +/* d6 */ VRS, 1.337 +/* d7 */ VRS | VRA, 1.338 +/* d8 */ VRX, 1.339 +/* d9 */ VRX, 1.340 +/* da */ T, 1.341 +/* db */ 0, 1.342 +/* dc */ 0, 1.343 +/* dd */ 0, 1.344 +/* de */ 0, 1.345 +/* df */ T, 1.346 +/* e0 */ VLA, 1.347 +/* e1 */ VLO, 1.348 +/* e2 */ VLO, 1.349 +/* e3 */ VLI, 1.350 +/* e4 */ VLI, 1.351 +/* e5 */ VRE, 1.352 +/* e6 */ M, 1.353 +/* e7 */ M, 1.354 +/* e8 */ M | MT, 1.355 +/* e9 */ M | MT, 1.356 +/* ea */ M | MT, 1.357 +/* eb */ M | MT, 1.358 +/* ec */ M, 1.359 +/* ed */ T, 1.360 +/* ee */ T, 1.361 +/* ef */ T, 1.362 +/* f0 */ T, 1.363 +/* f1 */ T, 1.364 +/* f2 */ T, 1.365 +/* f3 */ T, 1.366 +/* f4 */ T, 1.367 +/* f5 */ T, 1.368 +/* f6 */ T, 1.369 +/* f7 */ T, 1.370 +/* f8 */ T, 1.371 +/* f9 */ T, 1.372 +/* fa */ T, 1.373 +/* fb */ T, 1.374 +/* fc */ 0, 1.375 +/* fd */ 0, 1.376 +/* fe */ 0, 1.377 +/* ff */ 0 1.378 +};