intl/lwbrk/src/rulebrk.c

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /* This Source Code Form is subject to the terms of the Mozilla Public
     2  * License, v. 2.0. If a copy of the MPL was not distributed with this
     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     4 #define TH_UNICODE
     6 #include <stdlib.h>
     7 #include <assert.h>
     8 #include "th_char.h"
/* ASCII-only classifiers: th_isalpha accepts a-z / A-Z, th_isspace accepts
 * only blank and horizontal tab. Both evaluate their argument more than
 * once, so pass side-effect-free expressions. */
#define th_isalpha(c)   (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
#define th_isspace(c)   ((c)==' '||(c)=='\t')
/*
/////////////////////////////////////////////////
// Thai character type array
//
// twb_t is a bit set of the class flags defined below (the flags use
// bits 0x0001 through 0x8000). _TwbType, defined at the end of this
// file, holds one twb_t per character and is indexed through
// twbtype(), i.e. by th_zcode(c).
*/
typedef unsigned short twb_t; 
extern const twb_t _TwbType[0x100-0xa0];
/*
// bit definition
//
// One bit per character class. The comments below describe how each
// flag is used by the rules in TrbWordBreakPos and which table rows
// carry it; the linguistic names behind the abbreviations are not
// spelled out in this file -- NOTE(review): presumably V* = vowel,
// C* = consonant, M = mark, T = terminator/other; confirm with the
// original author's notes.
*/
/* VRS / VRE / VRX: the three sub-classes combined into VR below
 * (table rows d0-d9 and e5). */
#define VRS 0x0001
#define VRE 0x0002
#define VRX 0x0004
/* VRA: extra attribute set on some VR rows (d0, d2, d4, d5, d7). */
#define VRA 0x0008
/* VLA / VLO / VLI: the three sub-classes combined into VL below
 * (table rows e0-e4). */
#define VLA 0x0010
#define VLO 0x0020
#define VLI 0x0040
/* VC: set on rows c4, c6, c7, cd; counted as part of C below. */
#define VC 0x0080
/* CC / CS: the two sub-classes combined into CX below (rows a1-ce). */
#define CC 0x0100
#define CS 0x0200
/* C2: extra attribute on rows c3, c5, c7; tested on the character that
 * follows the candidate break (t(1)) and on t(0). */
#define C2 0x0400
/* CHB / CHE: when both neighbours of the candidate break are in C, a
 * break is rejected if the left one has CHE or the right one has CHB. */
#define CHB 0x0800
#define CHE 0x1000
/* MT: subset of M (rows e8-eb). */
#define MT 0x2000
/*
//_#define me 0x2000
*/
/* M: rows e6-ec. */
#define M 0x4000
/* T: rows that are in none of the classes above; not part of mask A,
 * so such characters never take part in the break rules. */
#define T 0x8000
/* Composite masks. */
#define VL	(VLA|VLO|VLI)
#define VR	(VRS|VRE|VRX)
/* NE: a break is never placed directly after a character in this set. */
#define NE	(VL|VRS)
/* NB: a break is never placed directly before a character in this set. */
#define NB	(VR|M)
#define V	(VL|VR)
#define CX	(CC|CS)
#define C	(CX|VC)
/* A: every class the rules operate on; characters outside A end the
 * context window. */
#define A (C|V|M)
/* Class flags of character c. th_zcode() comes from th_char.h and must
 * yield a valid index into _TwbType; callers in this file check
 * th_isthai(c) first. */
#define twbtype(c)	(_TwbType[th_zcode(c)])
#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif
/* RETURN is not referenced anywhere in the visible part of this file. */
#define RETURN(b) return (b)
    70 /*
    71 /////////////////////////////////////////////////
    72 */
    74 int TrbWordBreakPos(const th_char *pstr, int left, 
    75                     const th_char *rstr, int right)
    76 /*                 const ThBreakIterator *it, const th_char **p)*/
    77 {
    78 	/*
    79 	//int left, right;
    80 	//const th_char *s = *p;
    81 	*/
    82     const th_char *lstr = pstr + left;
    83 	th_char _c[6];
    84 	twb_t _t[6];
    85 	#define c(i) (_c[(i)+3])
    86 	#define t(i) (_t[(i)+3])
    87 	int i, j;
    89 	/*
    90 	//left = s - it->begin; 
    91 	*/
    92 	if(left < 0) return -1;
    93 	/*
    94         //right = (it->end == NULL) ? 4 : it->begin - s;
    95 	*/
    96 	if(right < 1) return -1;
    98         /*
    99 	// get c(0), t(0)
   100         */
   101 	c(0) = rstr[0]; /* may be '\0' */
   102     if(!th_isthai(c(0))) return -1;
   103 	t(0) = twbtype(c(0));
   104 	if(!(t(0) & A)) return -1;
   106         /*
   107 	// get c(-1), t(-1)
   108         */
   109 	if(left >= 1) { 
   110 		c(-1) = lstr[-1]; 
   111 		if(!th_isthai(c(-1))) return 0;
   112 		t(-1) = twbtype(c(-1)); 
   113 		if(!(t(-1) & A)) return 0;	/* handle punctuation marks here */
   114 	} else { c(-1) = 0; t(-1) = 0; }
   116 	/*
   117 	// get c(1..2), t(1..2)
   118 	*/
   119 	for(i = 1; i <= 2; i++) {
   120 		if(i >= right) { c(i) = 0; t(i) = 0; }
   121 		else {
   122 			c(i) = rstr[i]; /* may be '\0'; */
   123 			if(!th_isthai(c(i))) right = i--;
   124 			else {
   125 				t(i) = twbtype(c(i));
   126 				if(!(t(i) & A)) right = i--;
   127 			}
   128 		}
   129 	}
   130 	/*
   131 	// get c(-2..-3), t(-2..-3)
   132 	*/
   133 	for(i = -2, j = -2; i >= -3 ; j--) {
   134 		if(j < -left) { c(i) = 0; t(i) = 0; i--; }
   135 		else {
   136 			c(i) = lstr[j]; 
   137 			if(!th_isthai(c(i))) left = 0;
   138 			else {
   139 				t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
   140 				if(!(t(i) & A)) left = 0;
   141 				else {
   142 					if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
   143 						c(i+1) = c(i); t(i+1) = t(i);
   144 					} else i--;
   145 				}
   146 			}
   147 		}
   148 	}
   150 	/*
   151 	// prohibit the unlikely
   152 	*/
   153 	if((t(-1) & C) && (t(0) & C)) {
   154 	  if((t(-1) & CHE) || (t(0) & CHB)) return -1;
   155 	}
   156 	/*
   157 	// special case : vlao, C/ sara_a|aa, !sara_a
   158 	*/
   159 	if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
   160 		(c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;
   162 	/*
   163 	// prohibit break
   164 	*/
   165 	if(t(0) & NB) return -1; 
   166 	if(t(-1) & NE) return -1;
   169   /*
   170 	// apply 100% rules
   171   */
   172 	if(t(-1) & VRE) {
   173 		if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
   174 		return -1; /* usually too short syllable, part of word */
   175 	}
   177 	if(t(-2) & VRE) return -1;
   179 	if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
   180 		if((t(-1) & (VRS|VRX))  && c(1) == TH_SARA_I) return -1; /* exception */
   181 		if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
   182 		if(t(-2) & VRS) return 0;	/* VRS, C / C, NB */
   183 		if(!(t(0) & C2) && c(1) == TH_SARA_I) {	/*	/ !C2 or /c, sara_i */
   184 			if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
   185 			if(t(-2) & VC) return 0;	/* VC, C / C, NB ? 100% */
   186 		}
   187 	}
   188 	if((t(-1) & VRX) && (t(0) & CC)) return 0;				/* VRX/ CC */
   189 	if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */
   192 	if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
   193 		if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
   194 		if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
   195 	}
   196 	/*
   197 	// apply 90% rules
   198 	*/
   199 	if(t(0) & VL) return 0;
   200 	if(t(1) & VL) return -1;
   201 	if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;
   203 	/*
   204 	//return -1;
   205 	// apply 80% rules
   206 	*/
   207 	if(t(0) & CHE) {
   208 		if((t(-2) & VRS) && (t(-1) & C)) return 0;	/* VRS, C/ CHE */
   209 		/*if(t(-1) & VRX) return 0;					// VRX/ CHE */
   210 		if(t(-1) & VC) return 0;					/* VC/ CHE */
   211 	}
   212 	if(t(-1) & CHB) {
   213 		if((t(0) & C) && (t(1) & VR)) return 0;	/* CHB/ CC, VR */
   214 		if(t(0) & VC) return 0;					/* CHB/ VC */
   215 	}
   217 	if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
   218 		if(t(-2) & VLI) return 0;  /* VLI,C/C,VR .*/
   219 		else { /* vlao, C ? C , VR */
   220 			if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
   221 			if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
   222 			if(!(t(1) & VRA)) return 0;	/* VLA, C/ C, !vca */
   223 		}
   224 	}
   225 	/* C,MT,C */ 
   226 	if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
   228 	return -1;
   229 }
   232 int TrbFollowing(const th_char *begin, int length, int offset)
   233 /*
   234 //(ThBreakIterator *this, int offset)
   235 */
   236 {
   237 	const th_char *w = begin + offset;
   238     const th_char *end = begin + length;
   239 	while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
   241 	if(w < end && *w && !th_isthai(*w)) {
   242 		int english = FALSE;
   243 		while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
   244 			if(th_isalpha(*w)) english = TRUE;
   245 			w++; 
   246 		}
   247 		if(english || w == end || 
   248             (!th_isthai(*w) && th_isspace(*w))) return w - begin;
   249 	} 
   250 	if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
   251 	w++;
   252 	if(w < end && *w && th_isthai(*w)) {
   253 		int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
   254 		while (brk < 0) {
   255 			w++;
   256 			if(w == end || *w == 0 || !th_isthai(*w)) break;
   257 			brk = TrbWordBreakPos(begin, w-begin, w, end-w);
   258 		}
   259         if (brk > 0) w += brk;
   260 	}
   261 	if(w < end && *w && !th_isthai(*w)) {
   262 		while(w < end && *w && !th_isthai(*w) && 
   263             !th_isalpha(*w) && !th_isspace(*w)) w++;
   264 	}
   265 	return w - begin;
   266 }
/*
/////////////////////////////////////////////////
// Character class table used by twbtype().
//
// One entry of class flags per character; entry 0 is the one labelled
// "a0" and the last of the 0x100-0xa0 entries is "ff". The table is
// indexed by th_zcode(c). The two rows under "#if 0" (labels 80-9f) are
// compiled out and are not part of the array.
// NOTE(review): the hex labels look like TIS-620 byte values -- confirm
// against th_zcode() in th_char.h.
*/
const twb_t  _TwbType[0x100-0xa0] = {
#if 0
/* 80  */	T,
/* 81-8f */	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/* 90  */	T,
/* 91-9f */	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#endif
/* a0  */	0,
/* a1  */	CS,
/* a2  */	CS | CHE,
/* a3  */	CC | CHE,
/* a4  */	CS | CHE,
/* a5  */	CC | CHE,
/* a6  */	CS,
/* a7  */	CS | CHB,
/* a8  */	CS,
/* a9  */	CC | CHE,
/* aa  */	CS,
/* ab  */	CC | CHE,
/* ac  */	CC | CHB | CHE,
/* ad  */	CS | CHB,
/* ae  */	CS | CHB,
/* af  */	CS | CHB,
/* b0  */	CS,
/* b1  */	CS | CHB | CHE,
/* b2  */	CS | CHB | CHE,
/* b3  */	CS | CHB,
/* b4  */	CS,
/* b5  */	CS,
/* b6  */	CS,
/* b7  */	CS,
/* b8  */	CS,
/* b9  */	CS,
/* ba  */	CS,
/* bb  */	CS,
/* bc  */	CC | CHE,
/* bd  */	CC | CHE,
/* be  */	CS,
/* bf  */	CS,
/* c0  */	CS | CHE,
/* c1  */	CS,
/* c2  */	CS,
/* c3  */	CS | C2 | CHE, /* ? add CHE  */
/* c4  */	VC | CHE,
/* c5  */	CS | C2,
/* c6  */	VC | CHE,
/* c7  */	VC | C2,
/* c8  */	CS,
/* c9  */	CS | CHB,
/* ca  */	CS | CHE,
/* cb  */	CC | CHE,
/* cc  */	CS | CHB | CHE,
/* cd  */	VC,
/* ce  */	CC | CHE,
/* cf  */	T,
/* d0  */	VRE | VRA,
/* d1   */	VRS,
/* d2  */	VRX | VRA,
/* d3   */	VRE,
/* d4   */	VRX | VRA,
/* d5   */	VRX | VRA,
/* d6   */	VRS,
/* d7   */	VRS | VRA,
/* d8   */	VRX,
/* d9   */	VRX,
/* da   */	T,
/* db  */ 0,
/* dc  */ 0,
/* dd  */ 0,
/* de  */ 0,
/* df  */	T,
/* e0  */	VLA,
/* e1  */	VLO,
/* e2  */	VLO,
/* e3  */	VLI,
/* e4  */	VLI,
/* e5  */	VRE,
/* e6  */	M,
/* e7   */	M,
/* e8   */	M | MT,
/* e9   */	M | MT,
/* ea   */	M | MT,
/* eb   */	M | MT,
/* ec   */	M,
/* ed   */	T,
/* ee   */	T,
/* ef  */	T,
/* f0  */	T,
/* f1  */	T,
/* f2  */	T,
/* f3  */	T,
/* f4  */	T,
/* f5  */	T,
/* f6  */	T,
/* f7  */	T,
/* f8  */	T,
/* f9  */	T,
/* fa  */	T,
/* fb  */	T,
/* fc  */ 0,
/* fd  */ 0,
/* fe  */ 0,
/* ff  */ 0
};

mercurial