1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/spellcheck/src/mozEnglishWordUtils.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,286 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "mozEnglishWordUtils.h" 1.10 +#include "nsReadableUtils.h" 1.11 +#include "nsIServiceManager.h" 1.12 +#include "nsUnicharUtils.h" 1.13 +#include "nsUnicharUtilCIID.h" 1.14 +#include "nsUnicodeProperties.h" 1.15 +#include "nsCRT.h" 1.16 +#include "mozilla/Likely.h" 1.17 + 1.18 +NS_IMPL_CYCLE_COLLECTING_ADDREF(mozEnglishWordUtils) 1.19 +NS_IMPL_CYCLE_COLLECTING_RELEASE(mozEnglishWordUtils) 1.20 + 1.21 +NS_INTERFACE_MAP_BEGIN(mozEnglishWordUtils) 1.22 + NS_INTERFACE_MAP_ENTRY(mozISpellI18NUtil) 1.23 + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, mozISpellI18NUtil) 1.24 + NS_INTERFACE_MAP_ENTRIES_CYCLE_COLLECTION(mozEnglishWordUtils) 1.25 +NS_INTERFACE_MAP_END 1.26 + 1.27 +NS_IMPL_CYCLE_COLLECTION(mozEnglishWordUtils, 1.28 + mURLDetector) 1.29 + 1.30 +mozEnglishWordUtils::mozEnglishWordUtils() 1.31 +{ 1.32 + mLanguage.AssignLiteral("en"); 1.33 + 1.34 + nsresult rv; 1.35 + mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID, &rv); 1.36 +} 1.37 + 1.38 +mozEnglishWordUtils::~mozEnglishWordUtils() 1.39 +{ 1.40 +} 1.41 + 1.42 +/* attribute wstring language; */ 1.43 +NS_IMETHODIMP mozEnglishWordUtils::GetLanguage(char16_t * *aLanguage) 1.44 +{ 1.45 + nsresult rv = NS_OK; 1.46 + NS_ENSURE_ARG_POINTER(aLanguage); 1.47 + 1.48 + *aLanguage = ToNewUnicode(mLanguage); 1.49 + if(!aLanguage) rv = NS_ERROR_OUT_OF_MEMORY; 1.50 + return rv; 1.51 + } 1.52 + 1.53 +/* void GetRootForm (in wstring aWord, in uint32_t type, [array, size_is (count)] out wstring words, out uint32_t count); */ 1.54 +// return the possible root forms of aWord. 1.55 +NS_IMETHODIMP mozEnglishWordUtils::GetRootForm(const char16_t *aWord, uint32_t type, char16_t ***words, uint32_t *count) 1.56 +{ 1.57 + nsAutoString word(aWord); 1.58 + char16_t **tmpPtr; 1.59 + int32_t length = word.Length(); 1.60 + 1.61 + *count = 0; 1.62 + 1.63 + mozEnglishWordUtils::myspCapitalization ct = captype(word); 1.64 + switch (ct) 1.65 + { 1.66 + case HuhCap: 1.67 + case NoCap: 1.68 + tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *)); 1.69 + if (!tmpPtr) 1.70 + return NS_ERROR_OUT_OF_MEMORY; 1.71 + tmpPtr[0] = ToNewUnicode(word); 1.72 + if (!tmpPtr[0]) { 1.73 + NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); 1.74 + return NS_ERROR_OUT_OF_MEMORY; 1.75 + } 1.76 + *words = tmpPtr; 1.77 + *count = 1; 1.78 + break; 1.79 + 1.80 + 1.81 + case AllCap: 1.82 + tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *) * 3); 1.83 + if (!tmpPtr) 1.84 + return NS_ERROR_OUT_OF_MEMORY; 1.85 + tmpPtr[0] = ToNewUnicode(word); 1.86 + if (!tmpPtr[0]) { 1.87 + NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); 1.88 + return NS_ERROR_OUT_OF_MEMORY; 1.89 + } 1.90 + ToLowerCase(tmpPtr[0], tmpPtr[0], length); 1.91 + 1.92 + tmpPtr[1] = ToNewUnicode(word); 1.93 + if (!tmpPtr[1]) { 1.94 + NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr); 1.95 + return NS_ERROR_OUT_OF_MEMORY; 1.96 + } 1.97 + ToLowerCase(tmpPtr[1], tmpPtr[1], length); 1.98 + ToUpperCase(tmpPtr[1], tmpPtr[1], 1); 1.99 + 1.100 + tmpPtr[2] = ToNewUnicode(word); 1.101 + if (!tmpPtr[2]) { 1.102 + NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(2, tmpPtr); 1.103 + return NS_ERROR_OUT_OF_MEMORY; 1.104 + } 1.105 + 1.106 + *words = tmpPtr; 1.107 + *count = 3; 1.108 + break; 1.109 + 1.110 + case InitCap: 1.111 + tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *) * 2); 1.112 + if (!tmpPtr) 1.113 + return NS_ERROR_OUT_OF_MEMORY; 1.114 + 1.115 + tmpPtr[0] = ToNewUnicode(word); 1.116 + if (!tmpPtr[0]) { 1.117 + NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); 1.118 + return NS_ERROR_OUT_OF_MEMORY; 1.119 + } 1.120 + ToLowerCase(tmpPtr[0], tmpPtr[0], length); 1.121 + 1.122 + tmpPtr[1] = ToNewUnicode(word); 1.123 + if (!tmpPtr[1]) { 1.124 + NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr); 1.125 + return NS_ERROR_OUT_OF_MEMORY; 1.126 + } 1.127 + 1.128 + *words = tmpPtr; 1.129 + *count = 2; 1.130 + break; 1.131 + default: 1.132 + return NS_ERROR_FAILURE; // should never get here; 1.133 + } 1.134 + return NS_OK; 1.135 +} 1.136 + 1.137 +// This needs vast improvement 1.138 +bool mozEnglishWordUtils::ucIsAlpha(char16_t aChar) 1.139 +{ 1.140 + // XXX we have to fix callers to handle the full Unicode range 1.141 + return nsIUGenCategory::kLetter == mozilla::unicode::GetGenCategory(aChar); 1.142 +} 1.143 + 1.144 +/* void FindNextWord (in wstring word, in uint32_t length, in uint32_t offset, out uint32_t begin, out uint32_t end); */ 1.145 +NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const char16_t *word, uint32_t length, uint32_t offset, int32_t *begin, int32_t *end) 1.146 +{ 1.147 + const char16_t *p = word + offset; 1.148 + const char16_t *endbuf = word + length; 1.149 + const char16_t *startWord=p; 1.150 + if(p<endbuf){ 1.151 + // XXX These loops should be modified to handle non-BMP characters. 1.152 + // if previous character is a word character, need to advance out of the word 1.153 + if (offset > 0 && ucIsAlpha(*(p-1))) { 1.154 + while (p < endbuf && ucIsAlpha(*p)) 1.155 + p++; 1.156 + } 1.157 + while((p < endbuf) && (!ucIsAlpha(*p))) 1.158 + { 1.159 + p++; 1.160 + } 1.161 + startWord=p; 1.162 + while((p < endbuf) && ((ucIsAlpha(*p))||(*p=='\''))) 1.163 + { 1.164 + p++; 1.165 + } 1.166 + 1.167 + // we could be trying to break down a url, we don't want to break a url into parts, 1.168 + // instead we want to find out if it really is a url and if so, skip it, advancing startWord 1.169 + // to a point after the url. 1.170 + 1.171 + // before we spend more time looking to see if the word is a url, look for a url identifer 1.172 + // and make sure that identifer isn't the last character in the word fragment. 1.173 + if ( (*p == ':' || *p == '@' || *p == '.') && p < endbuf - 1) { 1.174 + 1.175 + // ok, we have a possible url...do more research to find out if we really have one 1.176 + // and determine the length of the url so we can skip over it. 1.177 + 1.178 + if (mURLDetector) 1.179 + { 1.180 + int32_t startPos = -1; 1.181 + int32_t endPos = -1; 1.182 + 1.183 + mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, p - startWord, &startPos, &endPos); 1.184 + 1.185 + // ok, if we got a url, adjust the array bounds, skip the current url text and find the next word again 1.186 + if (startPos != -1 && endPos != -1) { 1.187 + startWord = p + endPos + 1; // skip over the url 1.188 + p = startWord; // reset p 1.189 + 1.190 + // now recursively call FindNextWord to search for the next word now that we have skipped the url 1.191 + return FindNextWord(word, length, startWord - word, begin, end); 1.192 + } 1.193 + } 1.194 + } 1.195 + 1.196 + while((p > startWord)&&(*(p-1) == '\'')){ // trim trailing apostrophes 1.197 + p--; 1.198 + } 1.199 + } 1.200 + else{ 1.201 + startWord = endbuf; 1.202 + } 1.203 + if(startWord == endbuf){ 1.204 + *begin = -1; 1.205 + *end = -1; 1.206 + } 1.207 + else{ 1.208 + *begin = startWord-word; 1.209 + *end = p-word; 1.210 + } 1.211 + return NS_OK; 1.212 +} 1.213 + 1.214 +mozEnglishWordUtils::myspCapitalization 1.215 +mozEnglishWordUtils::captype(const nsString &word) 1.216 +{ 1.217 + char16_t* lword=ToNewUnicode(word); 1.218 + ToUpperCase(lword,lword,word.Length()); 1.219 + if(word.Equals(lword)){ 1.220 + nsMemory::Free(lword); 1.221 + return AllCap; 1.222 + } 1.223 + 1.224 + ToLowerCase(lword,lword,word.Length()); 1.225 + if(word.Equals(lword)){ 1.226 + nsMemory::Free(lword); 1.227 + return NoCap; 1.228 + } 1.229 + int32_t length=word.Length(); 1.230 + if(Substring(word,1,length-1).Equals(lword+1)){ 1.231 + nsMemory::Free(lword); 1.232 + return InitCap; 1.233 + } 1.234 + nsMemory::Free(lword); 1.235 + return HuhCap; 1.236 +} 1.237 + 1.238 +// Convert the list of words in iwords to the same capitalization aWord and 1.239 +// return them in owords. 1.240 +NS_IMETHODIMP mozEnglishWordUtils::FromRootForm(const char16_t *aWord, const char16_t **iwords, uint32_t icount, char16_t ***owords, uint32_t *ocount) 1.241 +{ 1.242 + nsAutoString word(aWord); 1.243 + nsresult rv = NS_OK; 1.244 + 1.245 + int32_t length; 1.246 + char16_t **tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *)*icount); 1.247 + if (!tmpPtr) 1.248 + return NS_ERROR_OUT_OF_MEMORY; 1.249 + 1.250 + mozEnglishWordUtils::myspCapitalization ct = captype(word); 1.251 + for(uint32_t i = 0; i < icount; ++i) { 1.252 + length = NS_strlen(iwords[i]); 1.253 + tmpPtr[i] = (char16_t *) nsMemory::Alloc(sizeof(char16_t) * (length + 1)); 1.254 + if (MOZ_UNLIKELY(!tmpPtr[i])) { 1.255 + NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(i, tmpPtr); 1.256 + return NS_ERROR_OUT_OF_MEMORY; 1.257 + } 1.258 + memcpy(tmpPtr[i], iwords[i], (length + 1) * sizeof(char16_t)); 1.259 + 1.260 + nsAutoString capTest(tmpPtr[i]); 1.261 + mozEnglishWordUtils::myspCapitalization newCt=captype(capTest); 1.262 + if(newCt == NoCap){ 1.263 + switch(ct) 1.264 + { 1.265 + case HuhCap: 1.266 + case NoCap: 1.267 + break; 1.268 + case AllCap: 1.269 + ToUpperCase(tmpPtr[i],tmpPtr[i],length); 1.270 + rv = NS_OK; 1.271 + break; 1.272 + case InitCap: 1.273 + ToUpperCase(tmpPtr[i],tmpPtr[i],1); 1.274 + rv = NS_OK; 1.275 + break; 1.276 + default: 1.277 + rv = NS_ERROR_FAILURE; // should never get here; 1.278 + break; 1.279 + 1.280 + } 1.281 + } 1.282 + } 1.283 + if (NS_SUCCEEDED(rv)){ 1.284 + *owords = tmpPtr; 1.285 + *ocount = icount; 1.286 + } 1.287 + return rv; 1.288 +} 1.289 +