1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/lwbrk/src/nsSampleWordBreaker.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,150 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 + 1.10 +#include "nsSampleWordBreaker.h" 1.11 + 1.12 +nsSampleWordBreaker::nsSampleWordBreaker() 1.13 +{ 1.14 +} 1.15 +nsSampleWordBreaker::~nsSampleWordBreaker() 1.16 +{ 1.17 +} 1.18 + 1.19 +NS_IMPL_ISUPPORTS(nsSampleWordBreaker, nsIWordBreaker) 1.20 + 1.21 +bool nsSampleWordBreaker::BreakInBetween( 1.22 + const char16_t* aText1 , uint32_t aTextLen1, 1.23 + const char16_t* aText2 , uint32_t aTextLen2) 1.24 +{ 1.25 + NS_PRECONDITION( nullptr != aText1, "null ptr"); 1.26 + NS_PRECONDITION( nullptr != aText2, "null ptr"); 1.27 + 1.28 + if(!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2)) 1.29 + return false; 1.30 + 1.31 + return (this->GetClass(aText1[aTextLen1-1]) != this->GetClass(aText2[0])); 1.32 +} 1.33 + 1.34 + 1.35 +#define IS_ASCII(c) (0 == ( 0xFF80 & (c))) 1.36 +#define ASCII_IS_ALPHA(c) ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z'))) 1.37 +#define ASCII_IS_DIGIT(c) (( '0' <= (c)) && ((c) <= '9')) 1.38 +#define ASCII_IS_SPACE(c) (( ' ' == (c)) || ( '\t' == (c)) || ( '\r' == (c)) || ( '\n' == (c))) 1.39 +#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80) 1.40 + 1.41 +// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to relfect Unicode 3.0 1.42 +#define IS_HAN(c) (( 0x3400 <= (c)) && ((c) <= 0x9fff))||(( 0xf900 <= (c)) && ((c) <= 0xfaff)) 1.43 +#define IS_KATAKANA(c) (( 0x30A0 <= (c)) && ((c) <= 0x30FF)) 1.44 +#define IS_HIRAGANA(c) (( 0x3040 <= (c)) && ((c) <= 0x309F)) 1.45 +#define IS_HALFWIDTHKATAKANA(c) (( 0xFF60 <= (c)) && ((c) <= 0xFF9F)) 1.46 +#define IS_THAI(c) (0x0E00 == (0xFF80 & (c) )) // Look at the higest 9 bits 1.47 + 1.48 +uint8_t nsSampleWordBreaker::GetClass(char16_t c) 1.49 +{ 1.50 + // begin of the hack 1.51 + 1.52 + if (IS_ALPHABETICAL_SCRIPT(c)) { 1.53 + if(IS_ASCII(c)) { 1.54 + if(ASCII_IS_SPACE(c)) { 1.55 + return kWbClassSpace; 1.56 + } else if(ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c)) { 1.57 + return kWbClassAlphaLetter; 1.58 + } else { 1.59 + return kWbClassPunct; 1.60 + } 1.61 + } else if(IS_THAI(c)) { 1.62 + return kWbClassThaiLetter; 1.63 + } else if (c == 0x00A0/*NBSP*/) { 1.64 + return kWbClassSpace; 1.65 + } else { 1.66 + return kWbClassAlphaLetter; 1.67 + } 1.68 + } else { 1.69 + if(IS_HAN(c)) { 1.70 + return kWbClassHanLetter; 1.71 + } else if(IS_KATAKANA(c)) { 1.72 + return kWbClassKatakanaLetter; 1.73 + } else if(IS_HIRAGANA(c)) { 1.74 + return kWbClassHiraganaLetter; 1.75 + } else if(IS_HALFWIDTHKATAKANA(c)) { 1.76 + return kWbClassHWKatakanaLetter; 1.77 + } else { 1.78 + return kWbClassAlphaLetter; 1.79 + } 1.80 + } 1.81 + return 0; 1.82 +} 1.83 + 1.84 +nsWordRange nsSampleWordBreaker::FindWord( 1.85 + const char16_t* aText , uint32_t aTextLen, 1.86 + uint32_t aOffset) 1.87 +{ 1.88 + nsWordRange range; 1.89 + NS_PRECONDITION( nullptr != aText, "null ptr"); 1.90 + NS_PRECONDITION( 0 != aTextLen, "len = 0"); 1.91 + NS_PRECONDITION( aOffset <= aTextLen, "aOffset > aTextLen"); 1.92 + 1.93 + range.mBegin = aTextLen + 1; 1.94 + range.mEnd = aTextLen + 1; 1.95 + 1.96 + if(!aText || aOffset > aTextLen) 1.97 + return range; 1.98 + 1.99 + uint8_t c = this->GetClass(aText[aOffset]); 1.100 + uint32_t i; 1.101 + // Scan forward 1.102 + range.mEnd--; 1.103 + for(i = aOffset +1;i <= aTextLen; i++) 1.104 + { 1.105 + if( c != this->GetClass(aText[i])) 1.106 + { 1.107 + range.mEnd = i; 1.108 + break; 1.109 + } 1.110 + } 1.111 + 1.112 + // Scan backward 1.113 + range.mBegin = 0; 1.114 + for(i = aOffset ;i > 0; i--) 1.115 + { 1.116 + if( c != this->GetClass(aText[i-1])) 1.117 + { 1.118 + range.mBegin = i; 1.119 + break; 1.120 + } 1.121 + } 1.122 + if(kWbClassThaiLetter == c) 1.123 + { 1.124 + // need to call Thai word breaker from here 1.125 + // we should pass the whole Thai segment to the thai word breaker to find a shorter answer 1.126 + } 1.127 + return range; 1.128 +} 1.129 + 1.130 +int32_t nsSampleWordBreaker::NextWord( 1.131 + const char16_t* aText, uint32_t aLen, uint32_t aPos) 1.132 +{ 1.133 + int8_t c1, c2; 1.134 + uint32_t cur = aPos; 1.135 + if (cur == aLen) 1.136 + return NS_WORDBREAKER_NEED_MORE_TEXT; 1.137 + c1 = this->GetClass(aText[cur]); 1.138 + 1.139 + for(cur++; cur <aLen; cur++) 1.140 + { 1.141 + c2 = this->GetClass(aText[cur]); 1.142 + if(c2 != c1) 1.143 + break; 1.144 + } 1.145 + if(kWbClassThaiLetter == c1) 1.146 + { 1.147 + // need to call Thai word breaker from here 1.148 + // we should pass the whole Thai segment to the thai word breaker to find a shorter answer 1.149 + } 1.150 + if (cur == aLen) 1.151 + return NS_WORDBREAKER_NEED_MORE_TEXT; 1.152 + return cur; 1.153 +}