Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | #include "mozEnglishWordUtils.h" |
michael@0 | 7 | #include "nsReadableUtils.h" |
michael@0 | 8 | #include "nsIServiceManager.h" |
michael@0 | 9 | #include "nsUnicharUtils.h" |
michael@0 | 10 | #include "nsUnicharUtilCIID.h" |
michael@0 | 11 | #include "nsUnicodeProperties.h" |
michael@0 | 12 | #include "nsCRT.h" |
michael@0 | 13 | #include "mozilla/Likely.h" |
michael@0 | 14 | |
// Reference counting for mozEnglishWordUtils, supplied by the cycle-collecting
// AddRef/Release implementation macros.
michael@0 | 15 | NS_IMPL_CYCLE_COLLECTING_ADDREF(mozEnglishWordUtils) |
michael@0 | 16 | NS_IMPL_CYCLE_COLLECTING_RELEASE(mozEnglishWordUtils) |
michael@0 | 17 | |
// Interface map for mozEnglishWordUtils: it lists mozISpellI18NUtil, nsISupports
// (disambiguated through mozISpellI18NUtil), and the cycle-collection entries.
michael@0 | 18 | NS_INTERFACE_MAP_BEGIN(mozEnglishWordUtils) |
michael@0 | 19 | NS_INTERFACE_MAP_ENTRY(mozISpellI18NUtil) |
michael@0 | 20 | NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, mozISpellI18NUtil) |
michael@0 | 21 | NS_INTERFACE_MAP_ENTRIES_CYCLE_COLLECTION(mozEnglishWordUtils) |
michael@0 | 22 | NS_INTERFACE_MAP_END |
michael@0 | 23 | |
// Cycle-collection implementation; mURLDetector is the only member named here.
// NOTE(review): any other owning members added to the class would presumably
// need to be listed as well - confirm against the header.
michael@0 | 24 | NS_IMPL_CYCLE_COLLECTION(mozEnglishWordUtils, |
michael@0 | 25 | mURLDetector) |
michael@0 | 26 | |
michael@0 | 27 | mozEnglishWordUtils::mozEnglishWordUtils() |
michael@0 | 28 | { |
michael@0 | 29 | mLanguage.AssignLiteral("en"); |
michael@0 | 30 | |
michael@0 | 31 | nsresult rv; |
michael@0 | 32 | mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID, &rv); |
michael@0 | 33 | } |
michael@0 | 34 | |
michael@0 | 35 | mozEnglishWordUtils::~mozEnglishWordUtils() |
michael@0 | 36 | { |
michael@0 | 37 | } |
michael@0 | 38 | |
michael@0 | 39 | /* attribute wstring language; */ |
michael@0 | 40 | NS_IMETHODIMP mozEnglishWordUtils::GetLanguage(char16_t * *aLanguage) |
michael@0 | 41 | { |
michael@0 | 42 | nsresult rv = NS_OK; |
michael@0 | 43 | NS_ENSURE_ARG_POINTER(aLanguage); |
michael@0 | 44 | |
michael@0 | 45 | *aLanguage = ToNewUnicode(mLanguage); |
michael@0 | 46 | if(!aLanguage) rv = NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 47 | return rv; |
michael@0 | 48 | } |
michael@0 | 49 | |
michael@0 | 50 | /* void GetRootForm (in wstring aWord, in uint32_t type, [array, size_is (count)] out wstring words, out uint32_t count); */ |
michael@0 | 51 | // return the possible root forms of aWord. |
michael@0 | 52 | NS_IMETHODIMP mozEnglishWordUtils::GetRootForm(const char16_t *aWord, uint32_t type, char16_t ***words, uint32_t *count) |
michael@0 | 53 | { |
michael@0 | 54 | nsAutoString word(aWord); |
michael@0 | 55 | char16_t **tmpPtr; |
michael@0 | 56 | int32_t length = word.Length(); |
michael@0 | 57 | |
michael@0 | 58 | *count = 0; |
michael@0 | 59 | |
michael@0 | 60 | mozEnglishWordUtils::myspCapitalization ct = captype(word); |
michael@0 | 61 | switch (ct) |
michael@0 | 62 | { |
michael@0 | 63 | case HuhCap: |
michael@0 | 64 | case NoCap: |
michael@0 | 65 | tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *)); |
michael@0 | 66 | if (!tmpPtr) |
michael@0 | 67 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 68 | tmpPtr[0] = ToNewUnicode(word); |
michael@0 | 69 | if (!tmpPtr[0]) { |
michael@0 | 70 | NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); |
michael@0 | 71 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 72 | } |
michael@0 | 73 | *words = tmpPtr; |
michael@0 | 74 | *count = 1; |
michael@0 | 75 | break; |
michael@0 | 76 | |
michael@0 | 77 | |
michael@0 | 78 | case AllCap: |
michael@0 | 79 | tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *) * 3); |
michael@0 | 80 | if (!tmpPtr) |
michael@0 | 81 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 82 | tmpPtr[0] = ToNewUnicode(word); |
michael@0 | 83 | if (!tmpPtr[0]) { |
michael@0 | 84 | NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); |
michael@0 | 85 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 86 | } |
michael@0 | 87 | ToLowerCase(tmpPtr[0], tmpPtr[0], length); |
michael@0 | 88 | |
michael@0 | 89 | tmpPtr[1] = ToNewUnicode(word); |
michael@0 | 90 | if (!tmpPtr[1]) { |
michael@0 | 91 | NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr); |
michael@0 | 92 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 93 | } |
michael@0 | 94 | ToLowerCase(tmpPtr[1], tmpPtr[1], length); |
michael@0 | 95 | ToUpperCase(tmpPtr[1], tmpPtr[1], 1); |
michael@0 | 96 | |
michael@0 | 97 | tmpPtr[2] = ToNewUnicode(word); |
michael@0 | 98 | if (!tmpPtr[2]) { |
michael@0 | 99 | NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(2, tmpPtr); |
michael@0 | 100 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 101 | } |
michael@0 | 102 | |
michael@0 | 103 | *words = tmpPtr; |
michael@0 | 104 | *count = 3; |
michael@0 | 105 | break; |
michael@0 | 106 | |
michael@0 | 107 | case InitCap: |
michael@0 | 108 | tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *) * 2); |
michael@0 | 109 | if (!tmpPtr) |
michael@0 | 110 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 111 | |
michael@0 | 112 | tmpPtr[0] = ToNewUnicode(word); |
michael@0 | 113 | if (!tmpPtr[0]) { |
michael@0 | 114 | NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); |
michael@0 | 115 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 116 | } |
michael@0 | 117 | ToLowerCase(tmpPtr[0], tmpPtr[0], length); |
michael@0 | 118 | |
michael@0 | 119 | tmpPtr[1] = ToNewUnicode(word); |
michael@0 | 120 | if (!tmpPtr[1]) { |
michael@0 | 121 | NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr); |
michael@0 | 122 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 123 | } |
michael@0 | 124 | |
michael@0 | 125 | *words = tmpPtr; |
michael@0 | 126 | *count = 2; |
michael@0 | 127 | break; |
michael@0 | 128 | default: |
michael@0 | 129 | return NS_ERROR_FAILURE; // should never get here; |
michael@0 | 130 | } |
michael@0 | 131 | return NS_OK; |
michael@0 | 132 | } |
michael@0 | 133 | |
michael@0 | 134 | // This needs vast improvement |
michael@0 | 135 | bool mozEnglishWordUtils::ucIsAlpha(char16_t aChar) |
michael@0 | 136 | { |
michael@0 | 137 | // XXX we have to fix callers to handle the full Unicode range |
michael@0 | 138 | return nsIUGenCategory::kLetter == mozilla::unicode::GetGenCategory(aChar); |
michael@0 | 139 | } |
michael@0 | 140 | |
michael@0 | 141 | /* void FindNextWord (in wstring word, in uint32_t length, in uint32_t offset, out uint32_t begin, out uint32_t end); */ |
michael@0 | 142 | NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const char16_t *word, uint32_t length, uint32_t offset, int32_t *begin, int32_t *end) |
michael@0 | 143 | { |
michael@0 | 144 | const char16_t *p = word + offset; |
michael@0 | 145 | const char16_t *endbuf = word + length; |
michael@0 | 146 | const char16_t *startWord=p; |
michael@0 | 147 | if(p<endbuf){ |
michael@0 | 148 | // XXX These loops should be modified to handle non-BMP characters. |
michael@0 | 149 | // if previous character is a word character, need to advance out of the word |
michael@0 | 150 | if (offset > 0 && ucIsAlpha(*(p-1))) { |
michael@0 | 151 | while (p < endbuf && ucIsAlpha(*p)) |
michael@0 | 152 | p++; |
michael@0 | 153 | } |
michael@0 | 154 | while((p < endbuf) && (!ucIsAlpha(*p))) |
michael@0 | 155 | { |
michael@0 | 156 | p++; |
michael@0 | 157 | } |
michael@0 | 158 | startWord=p; |
michael@0 | 159 | while((p < endbuf) && ((ucIsAlpha(*p))||(*p=='\''))) |
michael@0 | 160 | { |
michael@0 | 161 | p++; |
michael@0 | 162 | } |
michael@0 | 163 | |
michael@0 | 164 | // we could be trying to break down a url, we don't want to break a url into parts, |
michael@0 | 165 | // instead we want to find out if it really is a url and if so, skip it, advancing startWord |
michael@0 | 166 | // to a point after the url. |
michael@0 | 167 | |
michael@0 | 168 | // before we spend more time looking to see if the word is a url, look for a url identifer |
michael@0 | 169 | // and make sure that identifer isn't the last character in the word fragment. |
michael@0 | 170 | if ( (*p == ':' || *p == '@' || *p == '.') && p < endbuf - 1) { |
michael@0 | 171 | |
michael@0 | 172 | // ok, we have a possible url...do more research to find out if we really have one |
michael@0 | 173 | // and determine the length of the url so we can skip over it. |
michael@0 | 174 | |
michael@0 | 175 | if (mURLDetector) |
michael@0 | 176 | { |
michael@0 | 177 | int32_t startPos = -1; |
michael@0 | 178 | int32_t endPos = -1; |
michael@0 | 179 | |
michael@0 | 180 | mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, p - startWord, &startPos, &endPos); |
michael@0 | 181 | |
michael@0 | 182 | // ok, if we got a url, adjust the array bounds, skip the current url text and find the next word again |
michael@0 | 183 | if (startPos != -1 && endPos != -1) { |
michael@0 | 184 | startWord = p + endPos + 1; // skip over the url |
michael@0 | 185 | p = startWord; // reset p |
michael@0 | 186 | |
michael@0 | 187 | // now recursively call FindNextWord to search for the next word now that we have skipped the url |
michael@0 | 188 | return FindNextWord(word, length, startWord - word, begin, end); |
michael@0 | 189 | } |
michael@0 | 190 | } |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | while((p > startWord)&&(*(p-1) == '\'')){ // trim trailing apostrophes |
michael@0 | 194 | p--; |
michael@0 | 195 | } |
michael@0 | 196 | } |
michael@0 | 197 | else{ |
michael@0 | 198 | startWord = endbuf; |
michael@0 | 199 | } |
michael@0 | 200 | if(startWord == endbuf){ |
michael@0 | 201 | *begin = -1; |
michael@0 | 202 | *end = -1; |
michael@0 | 203 | } |
michael@0 | 204 | else{ |
michael@0 | 205 | *begin = startWord-word; |
michael@0 | 206 | *end = p-word; |
michael@0 | 207 | } |
michael@0 | 208 | return NS_OK; |
michael@0 | 209 | } |
michael@0 | 210 | |
michael@0 | 211 | mozEnglishWordUtils::myspCapitalization |
michael@0 | 212 | mozEnglishWordUtils::captype(const nsString &word) |
michael@0 | 213 | { |
michael@0 | 214 | char16_t* lword=ToNewUnicode(word); |
michael@0 | 215 | ToUpperCase(lword,lword,word.Length()); |
michael@0 | 216 | if(word.Equals(lword)){ |
michael@0 | 217 | nsMemory::Free(lword); |
michael@0 | 218 | return AllCap; |
michael@0 | 219 | } |
michael@0 | 220 | |
michael@0 | 221 | ToLowerCase(lword,lword,word.Length()); |
michael@0 | 222 | if(word.Equals(lword)){ |
michael@0 | 223 | nsMemory::Free(lword); |
michael@0 | 224 | return NoCap; |
michael@0 | 225 | } |
michael@0 | 226 | int32_t length=word.Length(); |
michael@0 | 227 | if(Substring(word,1,length-1).Equals(lword+1)){ |
michael@0 | 228 | nsMemory::Free(lword); |
michael@0 | 229 | return InitCap; |
michael@0 | 230 | } |
michael@0 | 231 | nsMemory::Free(lword); |
michael@0 | 232 | return HuhCap; |
michael@0 | 233 | } |
michael@0 | 234 | |
michael@0 | 235 | // Convert the list of words in iwords to the same capitalization aWord and |
michael@0 | 236 | // return them in owords. |
michael@0 | 237 | NS_IMETHODIMP mozEnglishWordUtils::FromRootForm(const char16_t *aWord, const char16_t **iwords, uint32_t icount, char16_t ***owords, uint32_t *ocount) |
michael@0 | 238 | { |
michael@0 | 239 | nsAutoString word(aWord); |
michael@0 | 240 | nsresult rv = NS_OK; |
michael@0 | 241 | |
michael@0 | 242 | int32_t length; |
michael@0 | 243 | char16_t **tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *)*icount); |
michael@0 | 244 | if (!tmpPtr) |
michael@0 | 245 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 246 | |
michael@0 | 247 | mozEnglishWordUtils::myspCapitalization ct = captype(word); |
michael@0 | 248 | for(uint32_t i = 0; i < icount; ++i) { |
michael@0 | 249 | length = NS_strlen(iwords[i]); |
michael@0 | 250 | tmpPtr[i] = (char16_t *) nsMemory::Alloc(sizeof(char16_t) * (length + 1)); |
michael@0 | 251 | if (MOZ_UNLIKELY(!tmpPtr[i])) { |
michael@0 | 252 | NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(i, tmpPtr); |
michael@0 | 253 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 254 | } |
michael@0 | 255 | memcpy(tmpPtr[i], iwords[i], (length + 1) * sizeof(char16_t)); |
michael@0 | 256 | |
michael@0 | 257 | nsAutoString capTest(tmpPtr[i]); |
michael@0 | 258 | mozEnglishWordUtils::myspCapitalization newCt=captype(capTest); |
michael@0 | 259 | if(newCt == NoCap){ |
michael@0 | 260 | switch(ct) |
michael@0 | 261 | { |
michael@0 | 262 | case HuhCap: |
michael@0 | 263 | case NoCap: |
michael@0 | 264 | break; |
michael@0 | 265 | case AllCap: |
michael@0 | 266 | ToUpperCase(tmpPtr[i],tmpPtr[i],length); |
michael@0 | 267 | rv = NS_OK; |
michael@0 | 268 | break; |
michael@0 | 269 | case InitCap: |
michael@0 | 270 | ToUpperCase(tmpPtr[i],tmpPtr[i],1); |
michael@0 | 271 | rv = NS_OK; |
michael@0 | 272 | break; |
michael@0 | 273 | default: |
michael@0 | 274 | rv = NS_ERROR_FAILURE; // should never get here; |
michael@0 | 275 | break; |
michael@0 | 276 | |
michael@0 | 277 | } |
michael@0 | 278 | } |
michael@0 | 279 | } |
michael@0 | 280 | if (NS_SUCCEEDED(rv)){ |
michael@0 | 281 | *owords = tmpPtr; |
michael@0 | 282 | *ocount = icount; |
michael@0 | 283 | } |
michael@0 | 284 | return rv; |
michael@0 | 285 | } |
michael@0 | 286 |