extensions/spellcheck/src/mozEnglishWordUtils.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "mozEnglishWordUtils.h"
     7 #include "nsReadableUtils.h"
     8 #include "nsIServiceManager.h"
     9 #include "nsUnicharUtils.h"
    10 #include "nsUnicharUtilCIID.h"
    11 #include "nsUnicodeProperties.h"
    12 #include "nsCRT.h"
    13 #include "mozilla/Likely.h"
    15 NS_IMPL_CYCLE_COLLECTING_ADDREF(mozEnglishWordUtils)
    16 NS_IMPL_CYCLE_COLLECTING_RELEASE(mozEnglishWordUtils)
    18 NS_INTERFACE_MAP_BEGIN(mozEnglishWordUtils)
    19   NS_INTERFACE_MAP_ENTRY(mozISpellI18NUtil)
    20   NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, mozISpellI18NUtil)
    21   NS_INTERFACE_MAP_ENTRIES_CYCLE_COLLECTION(mozEnglishWordUtils)
    22 NS_INTERFACE_MAP_END
    24 NS_IMPL_CYCLE_COLLECTION(mozEnglishWordUtils,
    25                          mURLDetector)
    27 mozEnglishWordUtils::mozEnglishWordUtils()
    28 {
    29   mLanguage.AssignLiteral("en");
    31   nsresult rv;
    32   mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID, &rv);
    33 }
    35 mozEnglishWordUtils::~mozEnglishWordUtils()
    36 {
    37 }
    39 /* attribute wstring language; */
    40 NS_IMETHODIMP mozEnglishWordUtils::GetLanguage(char16_t * *aLanguage)
    41 {
    42   nsresult rv = NS_OK;
    43   NS_ENSURE_ARG_POINTER(aLanguage);
    45   *aLanguage = ToNewUnicode(mLanguage);
    46   if(!aLanguage) rv = NS_ERROR_OUT_OF_MEMORY;
    47   return rv;
    48  }
    50 /* void GetRootForm (in wstring aWord, in uint32_t type, [array, size_is (count)] out wstring words, out uint32_t count); */
    51 // return the possible root forms of aWord.
    52 NS_IMETHODIMP mozEnglishWordUtils::GetRootForm(const char16_t *aWord, uint32_t type, char16_t ***words, uint32_t *count)
    53 {
    54   nsAutoString word(aWord);
    55   char16_t **tmpPtr;
    56   int32_t length = word.Length();
    58   *count = 0;
    60   mozEnglishWordUtils::myspCapitalization ct = captype(word);
    61   switch (ct)
    62     {
    63     case HuhCap:
    64     case NoCap: 
    65       tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *));
    66       if (!tmpPtr)
    67         return NS_ERROR_OUT_OF_MEMORY;
    68       tmpPtr[0] = ToNewUnicode(word);
    69       if (!tmpPtr[0]) {
    70         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
    71         return NS_ERROR_OUT_OF_MEMORY;
    72       }
    73       *words = tmpPtr;
    74       *count = 1;
    75       break;
    78     case AllCap:
    79       tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *) * 3);
    80       if (!tmpPtr)
    81         return NS_ERROR_OUT_OF_MEMORY;
    82       tmpPtr[0] = ToNewUnicode(word);
    83       if (!tmpPtr[0]) {
    84         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
    85         return NS_ERROR_OUT_OF_MEMORY;
    86       }
    87       ToLowerCase(tmpPtr[0], tmpPtr[0], length);
    89       tmpPtr[1] = ToNewUnicode(word);
    90       if (!tmpPtr[1]) {
    91         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
    92         return NS_ERROR_OUT_OF_MEMORY;
    93       }
    94       ToLowerCase(tmpPtr[1], tmpPtr[1], length);
    95       ToUpperCase(tmpPtr[1], tmpPtr[1], 1);
    97       tmpPtr[2] = ToNewUnicode(word);
    98       if (!tmpPtr[2]) {
    99         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(2, tmpPtr);
   100         return NS_ERROR_OUT_OF_MEMORY;
   101       }
   103       *words = tmpPtr;
   104       *count = 3;
   105       break;
   107     case InitCap:  
   108       tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *) * 2);
   109       if (!tmpPtr)
   110         return NS_ERROR_OUT_OF_MEMORY;
   112       tmpPtr[0] = ToNewUnicode(word);
   113       if (!tmpPtr[0]) {
   114         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
   115         return NS_ERROR_OUT_OF_MEMORY;
   116       }
   117       ToLowerCase(tmpPtr[0], tmpPtr[0], length);
   119       tmpPtr[1] = ToNewUnicode(word);
   120       if (!tmpPtr[1]) {
   121         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
   122         return NS_ERROR_OUT_OF_MEMORY;
   123       }
   125       *words = tmpPtr;
   126       *count = 2;
   127       break;
   128     default:
   129       return NS_ERROR_FAILURE; // should never get here;
   130     }
   131   return NS_OK;
   132 }
   134 // This needs vast improvement
   135 bool mozEnglishWordUtils::ucIsAlpha(char16_t aChar)
   136 {
   137   // XXX we have to fix callers to handle the full Unicode range
   138   return nsIUGenCategory::kLetter == mozilla::unicode::GetGenCategory(aChar);
   139 }
   141 /* void FindNextWord (in wstring word, in uint32_t length, in uint32_t offset, out uint32_t begin, out uint32_t end); */
   142 NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const char16_t *word, uint32_t length, uint32_t offset, int32_t *begin, int32_t *end)
   143 {
   144   const char16_t *p = word + offset;
   145   const char16_t *endbuf = word + length;
   146   const char16_t *startWord=p;
   147   if(p<endbuf){
   148     // XXX These loops should be modified to handle non-BMP characters.
   149     // if previous character is a word character, need to advance out of the word
   150     if (offset > 0 && ucIsAlpha(*(p-1))) {
   151       while (p < endbuf && ucIsAlpha(*p))
   152         p++;
   153     }
   154     while((p < endbuf) && (!ucIsAlpha(*p)))
   155       {
   156         p++;
   157       }
   158     startWord=p;
   159     while((p < endbuf) && ((ucIsAlpha(*p))||(*p=='\'')))
   160       { 
   161         p++;
   162       }
   164     // we could be trying to break down a url, we don't want to break a url into parts,
   165     // instead we want to find out if it really is a url and if so, skip it, advancing startWord 
   166     // to a point after the url.
   168     // before we spend more time looking to see if the word is a url, look for a url identifer
   169     // and make sure that identifer isn't the last character in the word fragment.
   170     if ( (*p == ':' || *p == '@' || *p == '.') &&  p < endbuf - 1) {
   172         // ok, we have a possible url...do more research to find out if we really have one
   173         // and determine the length of the url so we can skip over it.
   175         if (mURLDetector)
   176         {
   177           int32_t startPos = -1;
   178           int32_t endPos = -1;        
   180           mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, p - startWord, &startPos, &endPos);
   182           // ok, if we got a url, adjust the array bounds, skip the current url text and find the next word again
   183           if (startPos != -1 && endPos != -1) { 
   184             startWord = p + endPos + 1; // skip over the url
   185             p = startWord; // reset p
   187             // now recursively call FindNextWord to search for the next word now that we have skipped the url
   188             return FindNextWord(word, length, startWord - word, begin, end);
   189           }
   190         }
   191     }
   193     while((p > startWord)&&(*(p-1) == '\'')){  // trim trailing apostrophes
   194       p--;
   195     }
   196   }
   197   else{
   198     startWord = endbuf;
   199   }
   200   if(startWord == endbuf){
   201     *begin = -1;
   202     *end = -1;
   203   }
   204   else{
   205     *begin = startWord-word;
   206     *end = p-word;
   207   }
   208   return NS_OK;
   209 }
   211 mozEnglishWordUtils::myspCapitalization 
   212 mozEnglishWordUtils::captype(const nsString &word)
   213 {
   214   char16_t* lword=ToNewUnicode(word);  
   215   ToUpperCase(lword,lword,word.Length());
   216   if(word.Equals(lword)){
   217     nsMemory::Free(lword);
   218     return AllCap;
   219   }
   221   ToLowerCase(lword,lword,word.Length());
   222   if(word.Equals(lword)){
   223     nsMemory::Free(lword);
   224     return NoCap;
   225   }
   226   int32_t length=word.Length();
   227   if(Substring(word,1,length-1).Equals(lword+1)){
   228     nsMemory::Free(lword);
   229     return InitCap;
   230   }
   231   nsMemory::Free(lword);
   232   return HuhCap;
   233 }
   235 // Convert the list of words in iwords to the same capitalization aWord and 
   236 // return them in owords.
   237 NS_IMETHODIMP mozEnglishWordUtils::FromRootForm(const char16_t *aWord, const char16_t **iwords, uint32_t icount, char16_t ***owords, uint32_t *ocount)
   238 {
   239   nsAutoString word(aWord);
   240   nsresult rv = NS_OK;
   242   int32_t length;
   243   char16_t **tmpPtr  = (char16_t **)nsMemory::Alloc(sizeof(char16_t *)*icount);
   244   if (!tmpPtr)
   245     return NS_ERROR_OUT_OF_MEMORY;
   247   mozEnglishWordUtils::myspCapitalization ct = captype(word);
   248   for(uint32_t i = 0; i < icount; ++i) {
   249     length = NS_strlen(iwords[i]);
   250     tmpPtr[i] = (char16_t *) nsMemory::Alloc(sizeof(char16_t) * (length + 1));
   251     if (MOZ_UNLIKELY(!tmpPtr[i])) {
   252       NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(i, tmpPtr);
   253       return NS_ERROR_OUT_OF_MEMORY;
   254     }
   255     memcpy(tmpPtr[i], iwords[i], (length + 1) * sizeof(char16_t));
   257     nsAutoString capTest(tmpPtr[i]);
   258     mozEnglishWordUtils::myspCapitalization newCt=captype(capTest);
   259     if(newCt == NoCap){
   260       switch(ct) 
   261         {
   262         case HuhCap:
   263         case NoCap:
   264           break;
   265         case AllCap:
   266           ToUpperCase(tmpPtr[i],tmpPtr[i],length);
   267           rv = NS_OK;
   268           break;
   269         case InitCap:  
   270           ToUpperCase(tmpPtr[i],tmpPtr[i],1);
   271           rv = NS_OK;
   272           break;
   273         default:
   274           rv = NS_ERROR_FAILURE; // should never get here;
   275           break;
   277         }
   278     }
   279   }
   280   if (NS_SUCCEEDED(rv)){
   281     *owords = tmpPtr;
   282     *ocount = icount;
   283   }
   284   return rv;
   285 }

mercurial