|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "mozEnglishWordUtils.h" |
|
7 #include "nsReadableUtils.h" |
|
8 #include "nsIServiceManager.h" |
|
9 #include "nsUnicharUtils.h" |
|
10 #include "nsUnicharUtilCIID.h" |
|
11 #include "nsUnicodeProperties.h" |
|
12 #include "nsCRT.h" |
|
13 #include "mozilla/Likely.h" |
|
14 |
|
// Reference counting for mozEnglishWordUtils, provided by the
// cycle-collecting AddRef/Release implementation macros.
15 NS_IMPL_CYCLE_COLLECTING_ADDREF(mozEnglishWordUtils) |
|
16 NS_IMPL_CYCLE_COLLECTING_RELEASE(mozEnglishWordUtils) |
|
17 |
|
// Interface map: the class answers for mozISpellI18NUtil, for nsISupports
// (disambiguated through mozISpellI18NUtil), and for the cycle-collection
// entries.
18 NS_INTERFACE_MAP_BEGIN(mozEnglishWordUtils) |
|
19 NS_INTERFACE_MAP_ENTRY(mozISpellI18NUtil) |
|
20 NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, mozISpellI18NUtil) |
|
21 NS_INTERFACE_MAP_ENTRIES_CYCLE_COLLECTION(mozEnglishWordUtils) |
|
22 NS_INTERFACE_MAP_END |
|
23 |
|
// Cycle-collection implementation; mURLDetector is the only member listed.
24 NS_IMPL_CYCLE_COLLECTION(mozEnglishWordUtils, |
|
25 mURLDetector) |
|
26 |
|
27 mozEnglishWordUtils::mozEnglishWordUtils() |
|
28 { |
|
29 mLanguage.AssignLiteral("en"); |
|
30 |
|
31 nsresult rv; |
|
32 mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID, &rv); |
|
33 } |
|
34 |
|
35 mozEnglishWordUtils::~mozEnglishWordUtils() |
|
36 { |
|
37 } |
|
38 |
|
39 /* attribute wstring language; */ |
|
40 NS_IMETHODIMP mozEnglishWordUtils::GetLanguage(char16_t * *aLanguage) |
|
41 { |
|
42 nsresult rv = NS_OK; |
|
43 NS_ENSURE_ARG_POINTER(aLanguage); |
|
44 |
|
45 *aLanguage = ToNewUnicode(mLanguage); |
|
46 if(!aLanguage) rv = NS_ERROR_OUT_OF_MEMORY; |
|
47 return rv; |
|
48 } |
|
49 |
|
50 /* void GetRootForm (in wstring aWord, in uint32_t type, [array, size_is (count)] out wstring words, out uint32_t count); */ |
|
51 // return the possible root forms of aWord. |
|
52 NS_IMETHODIMP mozEnglishWordUtils::GetRootForm(const char16_t *aWord, uint32_t type, char16_t ***words, uint32_t *count) |
|
53 { |
|
54 nsAutoString word(aWord); |
|
55 char16_t **tmpPtr; |
|
56 int32_t length = word.Length(); |
|
57 |
|
58 *count = 0; |
|
59 |
|
60 mozEnglishWordUtils::myspCapitalization ct = captype(word); |
|
61 switch (ct) |
|
62 { |
|
63 case HuhCap: |
|
64 case NoCap: |
|
65 tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *)); |
|
66 if (!tmpPtr) |
|
67 return NS_ERROR_OUT_OF_MEMORY; |
|
68 tmpPtr[0] = ToNewUnicode(word); |
|
69 if (!tmpPtr[0]) { |
|
70 NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); |
|
71 return NS_ERROR_OUT_OF_MEMORY; |
|
72 } |
|
73 *words = tmpPtr; |
|
74 *count = 1; |
|
75 break; |
|
76 |
|
77 |
|
78 case AllCap: |
|
79 tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *) * 3); |
|
80 if (!tmpPtr) |
|
81 return NS_ERROR_OUT_OF_MEMORY; |
|
82 tmpPtr[0] = ToNewUnicode(word); |
|
83 if (!tmpPtr[0]) { |
|
84 NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); |
|
85 return NS_ERROR_OUT_OF_MEMORY; |
|
86 } |
|
87 ToLowerCase(tmpPtr[0], tmpPtr[0], length); |
|
88 |
|
89 tmpPtr[1] = ToNewUnicode(word); |
|
90 if (!tmpPtr[1]) { |
|
91 NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr); |
|
92 return NS_ERROR_OUT_OF_MEMORY; |
|
93 } |
|
94 ToLowerCase(tmpPtr[1], tmpPtr[1], length); |
|
95 ToUpperCase(tmpPtr[1], tmpPtr[1], 1); |
|
96 |
|
97 tmpPtr[2] = ToNewUnicode(word); |
|
98 if (!tmpPtr[2]) { |
|
99 NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(2, tmpPtr); |
|
100 return NS_ERROR_OUT_OF_MEMORY; |
|
101 } |
|
102 |
|
103 *words = tmpPtr; |
|
104 *count = 3; |
|
105 break; |
|
106 |
|
107 case InitCap: |
|
108 tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *) * 2); |
|
109 if (!tmpPtr) |
|
110 return NS_ERROR_OUT_OF_MEMORY; |
|
111 |
|
112 tmpPtr[0] = ToNewUnicode(word); |
|
113 if (!tmpPtr[0]) { |
|
114 NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr); |
|
115 return NS_ERROR_OUT_OF_MEMORY; |
|
116 } |
|
117 ToLowerCase(tmpPtr[0], tmpPtr[0], length); |
|
118 |
|
119 tmpPtr[1] = ToNewUnicode(word); |
|
120 if (!tmpPtr[1]) { |
|
121 NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr); |
|
122 return NS_ERROR_OUT_OF_MEMORY; |
|
123 } |
|
124 |
|
125 *words = tmpPtr; |
|
126 *count = 2; |
|
127 break; |
|
128 default: |
|
129 return NS_ERROR_FAILURE; // should never get here; |
|
130 } |
|
131 return NS_OK; |
|
132 } |
|
133 |
|
134 // This needs vast improvement |
|
135 bool mozEnglishWordUtils::ucIsAlpha(char16_t aChar) |
|
136 { |
|
137 // XXX we have to fix callers to handle the full Unicode range |
|
138 return nsIUGenCategory::kLetter == mozilla::unicode::GetGenCategory(aChar); |
|
139 } |
|
140 |
|
141 /* void FindNextWord (in wstring word, in uint32_t length, in uint32_t offset, out uint32_t begin, out uint32_t end); */ |
|
142 NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const char16_t *word, uint32_t length, uint32_t offset, int32_t *begin, int32_t *end) |
|
143 { |
|
144 const char16_t *p = word + offset; |
|
145 const char16_t *endbuf = word + length; |
|
146 const char16_t *startWord=p; |
|
147 if(p<endbuf){ |
|
148 // XXX These loops should be modified to handle non-BMP characters. |
|
149 // if previous character is a word character, need to advance out of the word |
|
150 if (offset > 0 && ucIsAlpha(*(p-1))) { |
|
151 while (p < endbuf && ucIsAlpha(*p)) |
|
152 p++; |
|
153 } |
|
154 while((p < endbuf) && (!ucIsAlpha(*p))) |
|
155 { |
|
156 p++; |
|
157 } |
|
158 startWord=p; |
|
159 while((p < endbuf) && ((ucIsAlpha(*p))||(*p=='\''))) |
|
160 { |
|
161 p++; |
|
162 } |
|
163 |
|
164 // we could be trying to break down a url, we don't want to break a url into parts, |
|
165 // instead we want to find out if it really is a url and if so, skip it, advancing startWord |
|
166 // to a point after the url. |
|
167 |
|
168 // before we spend more time looking to see if the word is a url, look for a url identifer |
|
169 // and make sure that identifer isn't the last character in the word fragment. |
|
170 if ( (*p == ':' || *p == '@' || *p == '.') && p < endbuf - 1) { |
|
171 |
|
172 // ok, we have a possible url...do more research to find out if we really have one |
|
173 // and determine the length of the url so we can skip over it. |
|
174 |
|
175 if (mURLDetector) |
|
176 { |
|
177 int32_t startPos = -1; |
|
178 int32_t endPos = -1; |
|
179 |
|
180 mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, p - startWord, &startPos, &endPos); |
|
181 |
|
182 // ok, if we got a url, adjust the array bounds, skip the current url text and find the next word again |
|
183 if (startPos != -1 && endPos != -1) { |
|
184 startWord = p + endPos + 1; // skip over the url |
|
185 p = startWord; // reset p |
|
186 |
|
187 // now recursively call FindNextWord to search for the next word now that we have skipped the url |
|
188 return FindNextWord(word, length, startWord - word, begin, end); |
|
189 } |
|
190 } |
|
191 } |
|
192 |
|
193 while((p > startWord)&&(*(p-1) == '\'')){ // trim trailing apostrophes |
|
194 p--; |
|
195 } |
|
196 } |
|
197 else{ |
|
198 startWord = endbuf; |
|
199 } |
|
200 if(startWord == endbuf){ |
|
201 *begin = -1; |
|
202 *end = -1; |
|
203 } |
|
204 else{ |
|
205 *begin = startWord-word; |
|
206 *end = p-word; |
|
207 } |
|
208 return NS_OK; |
|
209 } |
|
210 |
|
211 mozEnglishWordUtils::myspCapitalization |
|
212 mozEnglishWordUtils::captype(const nsString &word) |
|
213 { |
|
214 char16_t* lword=ToNewUnicode(word); |
|
215 ToUpperCase(lword,lword,word.Length()); |
|
216 if(word.Equals(lword)){ |
|
217 nsMemory::Free(lword); |
|
218 return AllCap; |
|
219 } |
|
220 |
|
221 ToLowerCase(lword,lword,word.Length()); |
|
222 if(word.Equals(lword)){ |
|
223 nsMemory::Free(lword); |
|
224 return NoCap; |
|
225 } |
|
226 int32_t length=word.Length(); |
|
227 if(Substring(word,1,length-1).Equals(lword+1)){ |
|
228 nsMemory::Free(lword); |
|
229 return InitCap; |
|
230 } |
|
231 nsMemory::Free(lword); |
|
232 return HuhCap; |
|
233 } |
|
234 |
|
235 // Convert the list of words in iwords to the same capitalization aWord and |
|
236 // return them in owords. |
|
237 NS_IMETHODIMP mozEnglishWordUtils::FromRootForm(const char16_t *aWord, const char16_t **iwords, uint32_t icount, char16_t ***owords, uint32_t *ocount) |
|
238 { |
|
239 nsAutoString word(aWord); |
|
240 nsresult rv = NS_OK; |
|
241 |
|
242 int32_t length; |
|
243 char16_t **tmpPtr = (char16_t **)nsMemory::Alloc(sizeof(char16_t *)*icount); |
|
244 if (!tmpPtr) |
|
245 return NS_ERROR_OUT_OF_MEMORY; |
|
246 |
|
247 mozEnglishWordUtils::myspCapitalization ct = captype(word); |
|
248 for(uint32_t i = 0; i < icount; ++i) { |
|
249 length = NS_strlen(iwords[i]); |
|
250 tmpPtr[i] = (char16_t *) nsMemory::Alloc(sizeof(char16_t) * (length + 1)); |
|
251 if (MOZ_UNLIKELY(!tmpPtr[i])) { |
|
252 NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(i, tmpPtr); |
|
253 return NS_ERROR_OUT_OF_MEMORY; |
|
254 } |
|
255 memcpy(tmpPtr[i], iwords[i], (length + 1) * sizeof(char16_t)); |
|
256 |
|
257 nsAutoString capTest(tmpPtr[i]); |
|
258 mozEnglishWordUtils::myspCapitalization newCt=captype(capTest); |
|
259 if(newCt == NoCap){ |
|
260 switch(ct) |
|
261 { |
|
262 case HuhCap: |
|
263 case NoCap: |
|
264 break; |
|
265 case AllCap: |
|
266 ToUpperCase(tmpPtr[i],tmpPtr[i],length); |
|
267 rv = NS_OK; |
|
268 break; |
|
269 case InitCap: |
|
270 ToUpperCase(tmpPtr[i],tmpPtr[i],1); |
|
271 rv = NS_OK; |
|
272 break; |
|
273 default: |
|
274 rv = NS_ERROR_FAILURE; // should never get here; |
|
275 break; |
|
276 |
|
277 } |
|
278 } |
|
279 } |
|
280 if (NS_SUCCEEDED(rv)){ |
|
281 *owords = tmpPtr; |
|
282 *ocount = icount; |
|
283 } |
|
284 return rv; |
|
285 } |
|
286 |