michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: #define TH_UNICODE michael@0: michael@0: #include michael@0: #include michael@0: #include "th_char.h" michael@0: #define th_isalpha(c) (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z')) michael@0: #define th_isspace(c) ((c)==' '||(c)=='\t') michael@0: michael@0: michael@0: /* michael@0: ///////////////////////////////////////////////// michael@0: // Thai character type array michael@0: */ michael@0: michael@0: typedef unsigned short twb_t; michael@0: extern const twb_t _TwbType[0x100-0xa0]; michael@0: michael@0: /* michael@0: // bit definition michael@0: */ michael@0: michael@0: #define VRS 0x0001 michael@0: #define VRE 0x0002 michael@0: #define VRX 0x0004 michael@0: michael@0: #define VRA 0x0008 michael@0: michael@0: #define VLA 0x0010 michael@0: #define VLO 0x0020 michael@0: #define VLI 0x0040 michael@0: michael@0: #define VC 0x0080 michael@0: michael@0: #define CC 0x0100 michael@0: #define CS 0x0200 michael@0: michael@0: #define C2 0x0400 michael@0: #define CHB 0x0800 michael@0: #define CHE 0x1000 michael@0: michael@0: #define MT 0x2000 michael@0: /* michael@0: //_#define me 0x2000 michael@0: */ michael@0: #define M 0x4000 michael@0: michael@0: #define T 0x8000 michael@0: michael@0: #define VL (VLA|VLO|VLI) michael@0: #define VR (VRS|VRE|VRX) michael@0: #define NE (VL|VRS) michael@0: #define NB (VR|M) michael@0: #define V (VL|VR) michael@0: #define CX (CC|CS) michael@0: #define C (CX|VC) michael@0: #define A (C|V|M) michael@0: michael@0: #define twbtype(c) (_TwbType[th_zcode(c)]) michael@0: michael@0: #ifndef TRUE michael@0: #define TRUE 1 michael@0: #define FALSE 0 michael@0: #endif michael@0: #define RETURN(b) return (b) michael@0: michael@0: michael@0: /* michael@0: ///////////////////////////////////////////////// michael@0: */ michael@0: michael@0: int TrbWordBreakPos(const th_char *pstr, int left, michael@0: const th_char *rstr, int right) michael@0: /* const ThBreakIterator *it, const th_char **p)*/ michael@0: { michael@0: /* michael@0: //int left, right; michael@0: //const th_char *s = *p; michael@0: */ michael@0: const th_char *lstr = pstr + left; michael@0: th_char _c[6]; michael@0: twb_t _t[6]; michael@0: #define c(i) (_c[(i)+3]) michael@0: #define t(i) (_t[(i)+3]) michael@0: int i, j; michael@0: michael@0: /* michael@0: //left = s - it->begin; michael@0: */ michael@0: if(left < 0) return -1; michael@0: /* michael@0: //right = (it->end == NULL) ? 4 : it->begin - s; michael@0: */ michael@0: if(right < 1) return -1; michael@0: michael@0: /* michael@0: // get c(0), t(0) michael@0: */ michael@0: c(0) = rstr[0]; /* may be '\0' */ michael@0: if(!th_isthai(c(0))) return -1; michael@0: t(0) = twbtype(c(0)); michael@0: if(!(t(0) & A)) return -1; michael@0: michael@0: /* michael@0: // get c(-1), t(-1) michael@0: */ michael@0: if(left >= 1) { michael@0: c(-1) = lstr[-1]; michael@0: if(!th_isthai(c(-1))) return 0; michael@0: t(-1) = twbtype(c(-1)); michael@0: if(!(t(-1) & A)) return 0; /* handle punctuation marks here */ michael@0: } else { c(-1) = 0; t(-1) = 0; } michael@0: michael@0: /* michael@0: // get c(1..2), t(1..2) michael@0: */ michael@0: for(i = 1; i <= 2; i++) { michael@0: if(i >= right) { c(i) = 0; t(i) = 0; } michael@0: else { michael@0: c(i) = rstr[i]; /* may be '\0'; */ michael@0: if(!th_isthai(c(i))) right = i--; michael@0: else { michael@0: t(i) = twbtype(c(i)); michael@0: if(!(t(i) & A)) right = i--; michael@0: } michael@0: } michael@0: } michael@0: /* michael@0: // get c(-2..-3), t(-2..-3) michael@0: */ michael@0: for(i = -2, j = -2; i >= -3 ; j--) { michael@0: if(j < -left) { c(i) = 0; t(i) = 0; i--; } michael@0: else { michael@0: c(i) = lstr[j]; michael@0: if(!th_isthai(c(i))) left = 0; michael@0: else { michael@0: t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); michael@0: if(!(t(i) & A)) left = 0; michael@0: else { michael@0: if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) { michael@0: c(i+1) = c(i); t(i+1) = t(i); michael@0: } else i--; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* michael@0: // prohibit the unlikely michael@0: */ michael@0: if((t(-1) & C) && (t(0) & C)) { michael@0: if((t(-1) & CHE) || (t(0) & CHB)) return -1; michael@0: } michael@0: /* michael@0: // special case : vlao, C/ sara_a|aa, !sara_a michael@0: */ michael@0: if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && michael@0: (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0; michael@0: michael@0: /* michael@0: // prohibit break michael@0: */ michael@0: if(t(0) & NB) return -1; michael@0: if(t(-1) & NE) return -1; michael@0: michael@0: michael@0: /* michael@0: // apply 100% rules michael@0: */ michael@0: if(t(-1) & VRE) { michael@0: if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; michael@0: return -1; /* usually too short syllable, part of word */ michael@0: } michael@0: michael@0: if(t(-2) & VRE) return -1; michael@0: michael@0: if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ michael@0: if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ michael@0: if(t(-1) & (V|M)) return 0; /* !C/ C, NB */ michael@0: if(t(-2) & VRS) return 0; /* VRS, C / C, NB */ michael@0: if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ michael@0: if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ michael@0: if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ michael@0: } michael@0: } michael@0: if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ michael@0: if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */ michael@0: michael@0: michael@0: if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { michael@0: if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ michael@0: if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ michael@0: } michael@0: /* michael@0: // apply 90% rules michael@0: */ michael@0: if(t(0) & VL) return 0; michael@0: if(t(1) & VL) return -1; michael@0: if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0; michael@0: michael@0: /* michael@0: //return -1; michael@0: // apply 80% rules michael@0: */ michael@0: if(t(0) & CHE) { michael@0: if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ michael@0: /*if(t(-1) & VRX) return 0; // VRX/ CHE */ michael@0: if(t(-1) & VC) return 0; /* VC/ CHE */ michael@0: } michael@0: if(t(-1) & CHB) { michael@0: if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ michael@0: if(t(0) & VC) return 0; /* CHB/ VC */ michael@0: } michael@0: michael@0: if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ michael@0: if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/ michael@0: else { /* vlao, C ? C , VR */ michael@0: if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ michael@0: if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ michael@0: if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ michael@0: } michael@0: } michael@0: /* C,MT,C */ michael@0: if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; michael@0: michael@0: return -1; michael@0: } michael@0: michael@0: michael@0: int TrbFollowing(const th_char *begin, int length, int offset) michael@0: /* michael@0: //(ThBreakIterator *this, int offset) michael@0: */ michael@0: { michael@0: const th_char *w = begin + offset; michael@0: const th_char *end = begin + length; michael@0: while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; michael@0: michael@0: if(w < end && *w && !th_isthai(*w)) { michael@0: int english = FALSE; michael@0: while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { michael@0: if(th_isalpha(*w)) english = TRUE; michael@0: w++; michael@0: } michael@0: if(english || w == end || michael@0: (!th_isthai(*w) && th_isspace(*w))) return w - begin; michael@0: } michael@0: if(w == end || *w == 0 || !th_isthai(*w)) return w - begin; michael@0: w++; michael@0: if(w < end && *w && th_isthai(*w)) { michael@0: int brk = TrbWordBreakPos(begin, w-begin, w, end-w); michael@0: while (brk < 0) { michael@0: w++; michael@0: if(w == end || *w == 0 || !th_isthai(*w)) break; michael@0: brk = TrbWordBreakPos(begin, w-begin, w, end-w); michael@0: } michael@0: if (brk > 0) w += brk; michael@0: } michael@0: if(w < end && *w && !th_isthai(*w)) { michael@0: while(w < end && *w && !th_isthai(*w) && michael@0: !th_isalpha(*w) && !th_isspace(*w)) w++; michael@0: } michael@0: return w - begin; michael@0: } michael@0: michael@0: michael@0: /* michael@0: ///////////////////////////////////////////////// michael@0: */ michael@0: const twb_t _TwbType[0x100-0xa0] = { michael@0: #if 0 michael@0: /* 80 € */ T, michael@0: /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, michael@0: /* 90 */ T, michael@0: /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, michael@0: #endif michael@0: /* a0   */ 0, michael@0: /* a1 ¡ */ CS, michael@0: /* a2 ¢ */ CS | CHE, michael@0: /* a3 £ */ CC | CHE, michael@0: /* a4 ¤ */ CS | CHE, michael@0: /* a5 ¥ */ CC | CHE, michael@0: /* a6 ¦ */ CS, michael@0: /* a7 § */ CS | CHB, michael@0: /* a8 ¨ */ CS, michael@0: /* a9 © */ CC | CHE, michael@0: /* aa ª */ CS, michael@0: /* ab « */ CC | CHE, michael@0: /* ac ¬ */ CC | CHB | CHE, michael@0: /* ad ­ */ CS | CHB, michael@0: /* ae ® */ CS | CHB, michael@0: /* af ¯ */ CS | CHB, michael@0: /* b0 ° */ CS, michael@0: /* b1 ± */ CS | CHB | CHE, michael@0: /* b2 ² */ CS | CHB | CHE, michael@0: /* b3 ³ */ CS | CHB, michael@0: /* b4 ´ */ CS, michael@0: /* b5 µ */ CS, michael@0: /* b6 ¶ */ CS, michael@0: /* b7 · */ CS, michael@0: /* b8 ¸ */ CS, michael@0: /* b9 ¹ */ CS, michael@0: /* ba º */ CS, michael@0: /* bb » */ CS, michael@0: /* bc ¼ */ CC | CHE, michael@0: /* bd ½ */ CC | CHE, michael@0: /* be ¾ */ CS, michael@0: /* bf ¿ */ CS, michael@0: /* c0 À */ CS | CHE, michael@0: /* c1 Á */ CS, michael@0: /* c2 Â */ CS, michael@0: /* c3 Ã */ CS | C2 | CHE, /* ? add CHE */ michael@0: /* c4 Ä */ VC | CHE, michael@0: /* c5 Å */ CS | C2, michael@0: /* c6 Æ */ VC | CHE, michael@0: /* c7 Ç */ VC | C2, michael@0: /* c8 È */ CS, michael@0: /* c9 É */ CS | CHB, michael@0: /* ca Ê */ CS | CHE, michael@0: /* cb Ë */ CC | CHE, michael@0: /* CC Ì */ CS | CHB | CHE, michael@0: /* cd Í */ VC, michael@0: /* ce Î */ CC | CHE, michael@0: /* cf Ï */ T, michael@0: /* d0 Ð */ VRE | VRA, michael@0: /* d1 Ñ */ VRS, michael@0: /* d2 Ò */ VRX | VRA, michael@0: /* d3 Ó */ VRE, michael@0: /* d4 Ô */ VRX | VRA, michael@0: /* d5 Õ */ VRX | VRA, michael@0: /* d6 Ö */ VRS, michael@0: /* d7 × */ VRS | VRA, michael@0: /* d8 Ø */ VRX, michael@0: /* d9 Ù */ VRX, michael@0: /* da Ú */ T, michael@0: /* db Û */ 0, michael@0: /* dc Ü */ 0, michael@0: /* dd Ý */ 0, michael@0: /* de Þ */ 0, michael@0: /* df ß */ T, michael@0: /* e0 à */ VLA, michael@0: /* e1 á */ VLO, michael@0: /* e2 â */ VLO, michael@0: /* e3 ã */ VLI, michael@0: /* e4 ä */ VLI, michael@0: /* e5 å */ VRE, michael@0: /* e6 æ */ M, michael@0: /* e7 ç */ M, michael@0: /* e8 è */ M | MT, michael@0: /* e9 é */ M | MT, michael@0: /* ea ê */ M | MT, michael@0: /* eb ë */ M | MT, michael@0: /* ec ì */ M, michael@0: /* ed í */ T, michael@0: /* ee î */ T, michael@0: /* ef ï */ T, michael@0: /* f0 ð */ T, michael@0: /* f1 ñ */ T, michael@0: /* f2 ò */ T, michael@0: /* f3 ó */ T, michael@0: /* f4 ô */ T, michael@0: /* f5 õ */ T, michael@0: /* f6 ö */ T, michael@0: /* f7 ÷ */ T, michael@0: /* f8 ø */ T, michael@0: /* f9 ù */ T, michael@0: /* fa ú */ T, michael@0: /* fb û */ T, michael@0: /* fc ü */ 0, michael@0: /* fd ý */ 0, michael@0: /* fe þ */ 0, michael@0: /* ff ’ */ 0 michael@0: };