intl/lwbrk/src/rulebrk.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/lwbrk/src/rulebrk.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,375 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +#define TH_UNICODE
     1.8 +
     1.9 +#include <stdlib.h>
    1.10 +#include <assert.h>
    1.11 +#include "th_char.h"
    1.12 +#define th_isalpha(c)   (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
    1.13 +#define th_isspace(c)   ((c)==' '||(c)=='\t')
    1.14 +
    1.15 +
    1.16 +/*
    1.17 +/////////////////////////////////////////////////
    1.18 +// Thai character type array
    1.19 +*/
    1.20 +
    1.21 +typedef unsigned short twb_t; 
    1.22 +extern const twb_t _TwbType[0x100-0xa0];
    1.23 +
    1.24 +/*
    1.25 +// bit definition
    1.26 +*/
    1.27 +
    1.28 +#define VRS 0x0001
    1.29 +#define VRE 0x0002
    1.30 +#define VRX 0x0004
    1.31 +
    1.32 +#define VRA 0x0008
    1.33 +
    1.34 +#define VLA 0x0010
    1.35 +#define VLO 0x0020
    1.36 +#define VLI 0x0040
    1.37 +
    1.38 +#define VC 0x0080
    1.39 +
    1.40 +#define CC 0x0100
    1.41 +#define CS 0x0200
    1.42 +
    1.43 +#define C2 0x0400
    1.44 +#define CHB 0x0800
    1.45 +#define CHE 0x1000
    1.46 +
    1.47 +#define MT 0x2000
    1.48 +/*
    1.49 +//_#define me 0x2000
    1.50 +*/
    1.51 +#define M 0x4000
    1.52 +
    1.53 +#define T 0x8000
    1.54 +
    1.55 +#define VL	(VLA|VLO|VLI)
    1.56 +#define VR	(VRS|VRE|VRX)
    1.57 +#define NE	(VL|VRS)
    1.58 +#define NB	(VR|M)
    1.59 +#define V	(VL|VR)
    1.60 +#define CX	(CC|CS)
    1.61 +#define C	(CX|VC)
    1.62 +#define A (C|V|M)
    1.63 +
    1.64 +#define twbtype(c)	(_TwbType[th_zcode(c)])
    1.65 +
    1.66 +#ifndef TRUE
    1.67 +#define TRUE 1
    1.68 +#define FALSE 0
    1.69 +#endif
    1.70 +#define RETURN(b) return (b)
    1.71 +
    1.72 +
    1.73 +/*
    1.74 +/////////////////////////////////////////////////
    1.75 +*/
    1.76 +
    1.77 +int TrbWordBreakPos(const th_char *pstr, int left, 
    1.78 +                    const th_char *rstr, int right)
    1.79 +/*                 const ThBreakIterator *it, const th_char **p)*/
    1.80 +{
    1.81 +	/*
    1.82 +	//int left, right;
    1.83 +	//const th_char *s = *p;
    1.84 +	*/
    1.85 +    const th_char *lstr = pstr + left;
    1.86 +	th_char _c[6];
    1.87 +	twb_t _t[6];
    1.88 +	#define c(i) (_c[(i)+3])
    1.89 +	#define t(i) (_t[(i)+3])
    1.90 +	int i, j;
    1.91 +
    1.92 +	/*
    1.93 +	//left = s - it->begin; 
    1.94 +	*/
    1.95 +	if(left < 0) return -1;
    1.96 +	/*
    1.97 +        //right = (it->end == NULL) ? 4 : it->begin - s;
    1.98 +	*/
    1.99 +	if(right < 1) return -1;
   1.100 +
   1.101 +        /*
   1.102 +	// get c(0), t(0)
   1.103 +        */
   1.104 +	c(0) = rstr[0]; /* may be '\0' */
   1.105 +    if(!th_isthai(c(0))) return -1;
   1.106 +	t(0) = twbtype(c(0));
   1.107 +	if(!(t(0) & A)) return -1;
   1.108 +
   1.109 +        /*
   1.110 +	// get c(-1), t(-1)
   1.111 +        */
   1.112 +	if(left >= 1) { 
   1.113 +		c(-1) = lstr[-1]; 
   1.114 +		if(!th_isthai(c(-1))) return 0;
   1.115 +		t(-1) = twbtype(c(-1)); 
   1.116 +		if(!(t(-1) & A)) return 0;	/* handle punctuation marks here */
   1.117 +	} else { c(-1) = 0; t(-1) = 0; }
   1.118 +
   1.119 +	/*
   1.120 +	// get c(1..2), t(1..2)
   1.121 +	*/
   1.122 +	for(i = 1; i <= 2; i++) {
   1.123 +		if(i >= right) { c(i) = 0; t(i) = 0; }
   1.124 +		else {
   1.125 +			c(i) = rstr[i]; /* may be '\0'; */
   1.126 +			if(!th_isthai(c(i))) right = i--;
   1.127 +			else {
   1.128 +				t(i) = twbtype(c(i));
   1.129 +				if(!(t(i) & A)) right = i--;
   1.130 +			}
   1.131 +		}
   1.132 +	}
   1.133 +	/*
   1.134 +	// get c(-2..-3), t(-2..-3)
   1.135 +	*/
   1.136 +	for(i = -2, j = -2; i >= -3 ; j--) {
   1.137 +		if(j < -left) { c(i) = 0; t(i) = 0; i--; }
   1.138 +		else {
   1.139 +			c(i) = lstr[j]; 
   1.140 +			if(!th_isthai(c(i))) left = 0;
   1.141 +			else {
   1.142 +				t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
   1.143 +				if(!(t(i) & A)) left = 0;
   1.144 +				else {
   1.145 +					if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
   1.146 +						c(i+1) = c(i); t(i+1) = t(i);
   1.147 +					} else i--;
   1.148 +				}
   1.149 +			}
   1.150 +		}
   1.151 +	}
   1.152 +
   1.153 +	/*
   1.154 +	// prohibit the unlikely
   1.155 +	*/
   1.156 +	if((t(-1) & C) && (t(0) & C)) {
   1.157 +	  if((t(-1) & CHE) || (t(0) & CHB)) return -1;
   1.158 +	}
   1.159 +	/*
   1.160 +	// special case : vlao, C/ sara_a|aa, !sara_a
   1.161 +	*/
   1.162 +	if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
   1.163 +		(c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;
   1.164 +
   1.165 +	/*
   1.166 +	// prohibit break
   1.167 +	*/
   1.168 +	if(t(0) & NB) return -1; 
   1.169 +	if(t(-1) & NE) return -1;
   1.170 +
   1.171 +
   1.172 +  /*
   1.173 +	// apply 100% rules
   1.174 +  */
   1.175 +	if(t(-1) & VRE) {
   1.176 +		if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
   1.177 +		return -1; /* usually too short syllable, part of word */
   1.178 +	}
   1.179 +
   1.180 +	if(t(-2) & VRE) return -1;
   1.181 +
   1.182 +	if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
   1.183 +		if((t(-1) & (VRS|VRX))  && c(1) == TH_SARA_I) return -1; /* exception */
   1.184 +		if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
   1.185 +		if(t(-2) & VRS) return 0;	/* VRS, C / C, NB */
   1.186 +		if(!(t(0) & C2) && c(1) == TH_SARA_I) {	/*	/ !C2 or /c, sara_i */
   1.187 +			if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
   1.188 +			if(t(-2) & VC) return 0;	/* VC, C / C, NB ? 100% */
   1.189 +		}
   1.190 +	}
   1.191 +	if((t(-1) & VRX) && (t(0) & CC)) return 0;				/* VRX/ CC */
   1.192 +	if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */
   1.193 +
   1.194 +	
   1.195 +	if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
   1.196 +		if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
   1.197 +		if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
   1.198 +	}
   1.199 +	/*
   1.200 +	// apply 90% rules
   1.201 +	*/
   1.202 +	if(t(0) & VL) return 0;
   1.203 +	if(t(1) & VL) return -1;
   1.204 +	if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;
   1.205 +
   1.206 +	/*
   1.207 +	//return -1;
   1.208 +	// apply 80% rules
   1.209 +	*/
   1.210 +	if(t(0) & CHE) {
   1.211 +		if((t(-2) & VRS) && (t(-1) & C)) return 0;	/* VRS, C/ CHE */
   1.212 +		/*if(t(-1) & VRX) return 0;					// VRX/ CHE */
   1.213 +		if(t(-1) & VC) return 0;					/* VC/ CHE */
   1.214 +	}
   1.215 +	if(t(-1) & CHB) {
   1.216 +		if((t(0) & C) && (t(1) & VR)) return 0;	/* CHB/ CC, VR */
   1.217 +		if(t(0) & VC) return 0;					/* CHB/ VC */
   1.218 +	}
   1.219 +	
   1.220 +	if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
   1.221 +		if(t(-2) & VLI) return 0;  /* VLI,C/C,VR .*/
   1.222 +		else { /* vlao, C ? C , VR */
   1.223 +			if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
   1.224 +			if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
   1.225 +			if(!(t(1) & VRA)) return 0;	/* VLA, C/ C, !vca */
   1.226 +		}
   1.227 +	}
   1.228 +	/* C,MT,C */ 
   1.229 +	if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
   1.230 +
   1.231 +	return -1;
   1.232 +}
   1.233 +
   1.234 +
   1.235 +int TrbFollowing(const th_char *begin, int length, int offset)
   1.236 +/*
   1.237 +//(ThBreakIterator *this, int offset)
   1.238 +*/
   1.239 +{
   1.240 +	const th_char *w = begin + offset;
   1.241 +    const th_char *end = begin + length;
   1.242 +	while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
   1.243 +
   1.244 +	if(w < end && *w && !th_isthai(*w)) {
   1.245 +		int english = FALSE;
   1.246 +		while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
   1.247 +			if(th_isalpha(*w)) english = TRUE;
   1.248 +			w++; 
   1.249 +		}
   1.250 +		if(english || w == end || 
   1.251 +            (!th_isthai(*w) && th_isspace(*w))) return w - begin;
   1.252 +	} 
   1.253 +	if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
   1.254 +	w++;
   1.255 +	if(w < end && *w && th_isthai(*w)) {
   1.256 +		int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
   1.257 +		while (brk < 0) {
   1.258 +			w++;
   1.259 +			if(w == end || *w == 0 || !th_isthai(*w)) break;
   1.260 +			brk = TrbWordBreakPos(begin, w-begin, w, end-w);
   1.261 +		}
   1.262 +        if (brk > 0) w += brk;
   1.263 +	}
   1.264 +	if(w < end && *w && !th_isthai(*w)) {
   1.265 +		while(w < end && *w && !th_isthai(*w) && 
   1.266 +            !th_isalpha(*w) && !th_isspace(*w)) w++;
   1.267 +	}
   1.268 +	return w - begin;
   1.269 +}
   1.270 +
   1.271 +
   1.272 +/*
   1.273 +/////////////////////////////////////////////////
   1.274 +*/
   1.275 +const twb_t  _TwbType[0x100-0xa0] = {
   1.276 +#if 0
   1.277 +/* 80  */	T,
   1.278 +/* 81-8f */	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1.279 +/* 90  */	T,
   1.280 +/* 91-9f */	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1.281 +#endif
   1.282 +/* a0  */	0,
   1.283 +/* a1  */	CS,
   1.284 +/* a2  */	CS | CHE,
   1.285 +/* a3  */	CC | CHE,
   1.286 +/* a4  */	CS | CHE,
   1.287 +/* a5  */	CC | CHE,
   1.288 +/* a6  */	CS,
   1.289 +/* a7  */	CS | CHB,
   1.290 +/* a8  */	CS,
   1.291 +/* a9  */	CC | CHE,
   1.292 +/* aa  */	CS,
   1.293 +/* ab  */	CC | CHE,
   1.294 +/* ac  */	CC | CHB | CHE,
   1.295 +/* ad  */	CS | CHB,
   1.296 +/* ae  */	CS | CHB,
   1.297 +/* af  */	CS | CHB,
   1.298 +/* b0  */	CS,
   1.299 +/* b1  */	CS | CHB | CHE,
   1.300 +/* b2  */	CS | CHB | CHE,
   1.301 +/* b3  */	CS | CHB,
   1.302 +/* b4  */	CS,
   1.303 +/* b5  */	CS,
   1.304 +/* b6  */	CS,
   1.305 +/* b7  */	CS,
   1.306 +/* b8  */	CS,
   1.307 +/* b9  */	CS,
   1.308 +/* ba  */	CS,
   1.309 +/* bb  */	CS,
   1.310 +/* bc  */	CC | CHE,
   1.311 +/* bd  */	CC | CHE,
   1.312 +/* be  */	CS,
   1.313 +/* bf  */	CS,
   1.314 +/* c0  */	CS | CHE,
   1.315 +/* c1  */	CS,
   1.316 +/* c2  */	CS,
   1.317 +/* c3  */	CS | C2 | CHE, /* ? add CHE  */
   1.318 +/* c4  */	VC | CHE,
   1.319 +/* c5  */	CS | C2,
   1.320 +/* c6  */	VC | CHE,
   1.321 +/* c7  */	VC | C2,
   1.322 +/* c8  */	CS,
   1.323 +/* c9  */	CS | CHB,
   1.324 +/* ca  */	CS | CHE,
   1.325 +/* cb  */	CC | CHE,
   1.326 +/* CC  */	CS | CHB | CHE,
   1.327 +/* cd  */	VC,
   1.328 +/* ce  */	CC | CHE,
   1.329 +/* cf  */	T,
   1.330 +/* d0  */	VRE | VRA,
   1.331 +/* d1   */	VRS,
   1.332 +/* d2  */	VRX | VRA,
   1.333 +/* d3   */	VRE,
   1.334 +/* d4   */	VRX | VRA,
   1.335 +/* d5   */	VRX | VRA,
   1.336 +/* d6   */	VRS,
   1.337 +/* d7   */	VRS | VRA,
   1.338 +/* d8   */	VRX,
   1.339 +/* d9   */	VRX,
   1.340 +/* da   */	T,
   1.341 +/* db  */ 0,
   1.342 +/* dc  */ 0,
   1.343 +/* dd  */ 0,
   1.344 +/* de  */ 0,
   1.345 +/* df  */	T,
   1.346 +/* e0  */	VLA,
   1.347 +/* e1  */	VLO,
   1.348 +/* e2  */	VLO,
   1.349 +/* e3  */	VLI,
   1.350 +/* e4  */	VLI,
   1.351 +/* e5  */	VRE,
   1.352 +/* e6  */	M,
   1.353 +/* e7   */	M,
   1.354 +/* e8   */	M | MT,
   1.355 +/* e9   */	M | MT,
   1.356 +/* ea   */	M | MT,
   1.357 +/* eb   */	M | MT,
   1.358 +/* ec   */	M,
   1.359 +/* ed   */	T,
   1.360 +/* ee   */	T,
   1.361 +/* ef  */	T,
   1.362 +/* f0  */	T,
   1.363 +/* f1  */	T,
   1.364 +/* f2  */	T,
   1.365 +/* f3  */	T,
   1.366 +/* f4  */	T,
   1.367 +/* f5  */	T,
   1.368 +/* f6  */	T,
   1.369 +/* f7  */	T,
   1.370 +/* f8  */	T,
   1.371 +/* f9  */	T,
   1.372 +/* fa  */	T,
   1.373 +/* fb  */	T,
   1.374 +/* fc  */ 0,
   1.375 +/* fd  */ 0,
   1.376 +/* fe  */ 0,
   1.377 +/* ff  */ 0
   1.378 +};

mercurial