The Tor Browser: diff intl/lwbrk/src/nsSampleWordBreaker.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/lwbrk/src/nsSampleWordBreaker.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,150 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +
    1.10 +#include "nsSampleWordBreaker.h"
    1.11 +
    1.12 +// Constructor: its body is empty, so it does no work of its own.
    1.13 +nsSampleWordBreaker::nsSampleWordBreaker() {}
    1.14 +
    1.15 +// Destructor: its body is empty as well.
    1.16 +nsSampleWordBreaker::~nsSampleWordBreaker() {}
    1.17 +
    1.18 +// NS_IMPL_ISUPPORTS is defined outside this file; presumably it supplies the nsISupports boilerplate for nsIWordBreaker (TODO confirm).
    1.19 +NS_IMPL_ISUPPORTS(nsSampleWordBreaker, nsIWordBreaker)
    1.20 +
    1.21 +// Reports whether a word boundary lies between two adjacent runs of text.
    1.22 +// Yields true only when both runs are non-null and non-empty and the last
    1.23 +// character of aText1 has a different GetClass() value than the first of aText2.
    1.24 +bool nsSampleWordBreaker::BreakInBetween(
    1.25 +  const char16_t* aText1, uint32_t aTextLen1,
    1.26 +  const char16_t* aText2, uint32_t aTextLen2)
    1.27 +{
    1.28 +  NS_PRECONDITION( nullptr != aText1, "null ptr");
    1.29 +  NS_PRECONDITION( nullptr != aText2, "null ptr");
    1.30 +  const bool haveBoth = aText1 && aText2 && aTextLen1 > 0 && aTextLen2 > 0;
    1.31 +  return haveBoth && GetClass(aText1[aTextLen1 - 1]) != GetClass(aText2[0]);
    1.32 +}
    1.33 +
    1.34 +
    1.35 +#define IS_ASCII(c)               (0 == ( 0xFF80 & (c)))  // true for 0x0000-0x007F
    1.36 +#define ASCII_IS_ALPHA(c)         ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z')))
    1.37 +#define ASCII_IS_DIGIT(c)         (( '0' <= (c)) && ((c) <= '9'))
    1.38 +#define ASCII_IS_SPACE(c)         (( ' ' == (c)) || ( '\t' == (c)) || ( '\r' == (c)) || ( '\n' == (c)))
    1.39 +#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)  // everything below 0x2E80
    1.40 +
    1.41 +// The start of IS_HAN was moved from 0x4e00 to 0x3400 to reflect Unicode 3.0. The whole expansion is parenthesized so that uses such as !IS_HAN(c) apply to both ranges.
    1.42 +#define IS_HAN(c)                 ((( 0x3400 <= (c)) && ((c) <= 0x9fff)) || (( 0xf900 <= (c)) && ((c) <= 0xfaff)))
    1.43 +#define IS_KATAKANA(c)            (( 0x30A0 <= (c)) && ((c) <= 0x30FF))
    1.44 +#define IS_HIRAGANA(c)            (( 0x3040 <= (c)) && ((c) <= 0x309F))
    1.45 +#define IS_HALFWIDTHKATAKANA(c)   (( 0xFF60 <= (c)) && ((c) <= 0xFF9F))
    1.46 +#define IS_THAI(c)                (0x0E00 == (0xFF80 & (c) )) // Look at the highest 9 bits
    1.47 +
    1.48 +// Maps a UTF-16 code unit to one of the kWbClass* word-break classes.
    1.49 +// Code units below 0x2E80 are classified as ASCII space / alphanumeric /
    1.50 +// punctuation, Thai, NBSP (treated as space) or, failing all of those, alpha
    1.51 +// letter.  Code units from 0x2E80 upward are checked against the Han, Katakana,
    1.52 +// Hiragana and half-width Katakana ranges and otherwise fall back to alpha
    1.53 +// letter.
    1.54 +uint8_t nsSampleWordBreaker::GetClass(char16_t c)
    1.55 +{
    1.56 +  // Code units at or above 0x2E80: only the CJK range checks apply.
    1.57 +  if (!IS_ALPHABETICAL_SCRIPT(c)) {
    1.58 +    if (IS_HAN(c))
    1.59 +      return kWbClassHanLetter;
    1.60 +    if (IS_KATAKANA(c))
    1.61 +      return kWbClassKatakanaLetter;
    1.62 +    if (IS_HIRAGANA(c))
    1.63 +      return kWbClassHiraganaLetter;
    1.64 +    if (IS_HALFWIDTHKATAKANA(c))
    1.65 +      return kWbClassHWKatakanaLetter;
    1.66 +    return kWbClassAlphaLetter;
    1.67 +  }
    1.68 +
    1.69 +  if (IS_ASCII(c)) {
    1.70 +    if (ASCII_IS_SPACE(c))
    1.71 +      return kWbClassSpace;
    1.72 +    if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c))
    1.73 +      return kWbClassAlphaLetter;
    1.74 +    return kWbClassPunct;
    1.75 +  }
    1.76 +
    1.77 +  if (IS_THAI(c))
    1.78 +    return kWbClassThaiLetter;
    1.79 +  if (c == 0x00A0 /* NBSP */)
    1.80 +    return kWbClassSpace;
    1.81 +  return kWbClassAlphaLetter;
    1.82 +}
    1.83 +
    1.84 +// Finds the run of characters sharing one GetClass() value around aOffset.
    1.85 +// The result is the half-open offset range [mBegin, mEnd) within aText.
    1.86 +// If aText is null or aOffset > aTextLen, both fields are aTextLen + 1.
    1.87 +// If aOffset == aTextLen there is no character at aOffset, so the empty
    1.88 +// range [aTextLen, aTextLen] is returned without reading past the buffer.
    1.89 +nsWordRange nsSampleWordBreaker::FindWord(
    1.90 +  const char16_t* aText , uint32_t aTextLen,
    1.91 +  uint32_t aOffset)
    1.92 +{
    1.93 +  nsWordRange range;
    1.94 +  NS_PRECONDITION( nullptr != aText, "null ptr");
    1.95 +  NS_PRECONDITION( 0 != aTextLen, "len = 0");
    1.96 +  NS_PRECONDITION( aOffset <= aTextLen, "aOffset > aTextLen");
    1.97 +
    1.98 +  range.mBegin = aTextLen + 1;
    1.99 +  range.mEnd = aTextLen + 1;
   1.100 +  if(!aText || aOffset > aTextLen)
   1.101 +    return range;
   1.102 +
   1.103 +  // aText[aTextLen] lies outside the buffer and must not be classified.
   1.104 +  if(aOffset == aTextLen)
   1.105 +  {
   1.106 +    range.mBegin = aTextLen;
   1.107 +    range.mEnd = aTextLen;
   1.108 +    return range;
   1.109 +  }
   1.110 +
   1.111 +  const uint8_t c = this->GetClass(aText[aOffset]);
   1.112 +  uint32_t i;
   1.113 +  // Scan forward; the bound is i < aTextLen so aText[aTextLen] is never read.
   1.114 +  range.mEnd = aTextLen;
   1.115 +  for(i = aOffset + 1; i < aTextLen; i++)
   1.116 +  {
   1.117 +    if( c != this->GetClass(aText[i])) { range.mEnd = i; break; }
   1.118 +  }
   1.119 +  // Scan backward; the run starts at 0 unless the class changes earlier.
   1.120 +  range.mBegin = 0;
   1.121 +  for(i = aOffset; i > 0; i--)
   1.122 +  {
   1.123 +    if( c != this->GetClass(aText[i-1])) { range.mBegin = i; break; }
   1.124 +  }
   1.125 +  // TODO: when c is kWbClassThaiLetter, hand the whole Thai segment to a Thai
   1.126 +  // word breaker so it can return a shorter range.
   1.127 +  return range;
   1.128 +}
   1.129 +
   1.130 +// Returns the offset of the first character after aPos whose GetClass() value
   1.131 +// differs from that of aText[aPos].  Returns NS_WORDBREAKER_NEED_MORE_TEXT when
   1.132 +// that run extends to aLen, or when aText is null or aPos is not a valid index
   1.133 +// into aText (aPos >= aLen).
   1.134 +int32_t nsSampleWordBreaker::NextWord(
   1.135 +  const char16_t* aText, uint32_t aLen, uint32_t aPos)
   1.136 +{
   1.137 +  // Test aPos >= aLen (not just ==) so an out-of-range aPos never indexes aText.
   1.138 +  if (!aText || aPos >= aLen)
   1.139 +    return NS_WORDBREAKER_NEED_MORE_TEXT;
   1.140 +
   1.141 +  // GetClass() returns uint8_t, so the class is kept in that type.
   1.142 +  const uint8_t c1 = this->GetClass(aText[aPos]);
   1.143 +  uint32_t cur = aPos + 1;
   1.144 +  while (cur < aLen && c1 == this->GetClass(aText[cur]))
   1.145 +    cur++;
   1.146 +
   1.147 +  // TODO: when c1 is kWbClassThaiLetter, hand the whole Thai segment to a Thai
   1.148 +  // word breaker so it can return a shorter answer.
   1.149 +
   1.150 +  if (cur == aLen)
   1.151 +    return NS_WORDBREAKER_NEED_MORE_TEXT;
   1.152 +  return static_cast<int32_t>(cur);
   1.153 +}
The Tor Browser / file diff

diff: intl/lwbrk/src/nsSampleWordBreaker.cpp

intl/lwbrk/src/nsSampleWordBreaker.cpp