toolkit/components/url-classifier/nsUrlClassifierUtils.cpp

Fri, 16 Jan 2015 18:13:44 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Fri, 16 Jan 2015 18:13:44 +0100
branch
TOR_BUG_9701
changeset 14
925c144e1f1f
permissions
-rw-r--r--

Integrate suggestion from review to improve consistency with existing code.

michael@0 1 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 4
michael@0 5 #include "nsEscape.h"
michael@0 6 #include "nsString.h"
michael@0 7 #include "nsIURI.h"
michael@0 8 #include "nsNetUtil.h"
michael@0 9 #include "nsUrlClassifierUtils.h"
michael@0 10 #include "nsTArray.h"
michael@0 11 #include "nsReadableUtils.h"
michael@0 12 #include "plbase64.h"
michael@0 13 #include "prprf.h"
michael@0 14
michael@0 15 static char int_to_hex_digit(int32_t i)
michael@0 16 {
michael@0 17 NS_ASSERTION((i >= 0) && (i <= 15), "int too big in int_to_hex_digit");
michael@0 18 return static_cast<char>(((i < 10) ? (i + '0') : ((i - 10) + 'A')));
michael@0 19 }
michael@0 20
michael@0 21 static bool
michael@0 22 IsDecimal(const nsACString & num)
michael@0 23 {
michael@0 24 for (uint32_t i = 0; i < num.Length(); i++) {
michael@0 25 if (!isdigit(num[i])) {
michael@0 26 return false;
michael@0 27 }
michael@0 28 }
michael@0 29
michael@0 30 return true;
michael@0 31 }
michael@0 32
michael@0 33 static bool
michael@0 34 IsHex(const nsACString & num)
michael@0 35 {
michael@0 36 if (num.Length() < 3) {
michael@0 37 return false;
michael@0 38 }
michael@0 39
michael@0 40 if (num[0] != '0' || !(num[1] == 'x' || num[1] == 'X')) {
michael@0 41 return false;
michael@0 42 }
michael@0 43
michael@0 44 for (uint32_t i = 2; i < num.Length(); i++) {
michael@0 45 if (!isxdigit(num[i])) {
michael@0 46 return false;
michael@0 47 }
michael@0 48 }
michael@0 49
michael@0 50 return true;
michael@0 51 }
michael@0 52
michael@0 53 static bool
michael@0 54 IsOctal(const nsACString & num)
michael@0 55 {
michael@0 56 if (num.Length() < 2) {
michael@0 57 return false;
michael@0 58 }
michael@0 59
michael@0 60 if (num[0] != '0') {
michael@0 61 return false;
michael@0 62 }
michael@0 63
michael@0 64 for (uint32_t i = 1; i < num.Length(); i++) {
michael@0 65 if (!isdigit(num[i]) || num[i] == '8' || num[i] == '9') {
michael@0 66 return false;
michael@0 67 }
michael@0 68 }
michael@0 69
michael@0 70 return true;
michael@0 71 }
michael@0 72
michael@0 73 nsUrlClassifierUtils::nsUrlClassifierUtils() : mEscapeCharmap(nullptr)
michael@0 74 {
michael@0 75 }
michael@0 76
michael@0 77 nsresult
michael@0 78 nsUrlClassifierUtils::Init()
michael@0 79 {
michael@0 80 // Everything but alpha numerics, - and .
michael@0 81 mEscapeCharmap = new Charmap(0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
michael@0 82 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
michael@0 83 if (!mEscapeCharmap)
michael@0 84 return NS_ERROR_OUT_OF_MEMORY;
michael@0 85 return NS_OK;
michael@0 86 }
michael@0 87
michael@0 88 NS_IMPL_ISUPPORTS(nsUrlClassifierUtils, nsIUrlClassifierUtils)
michael@0 89
michael@0 90 /////////////////////////////////////////////////////////////////////////////
michael@0 91 // nsIUrlClassifierUtils
michael@0 92
michael@0 93 NS_IMETHODIMP
michael@0 94 nsUrlClassifierUtils::GetKeyForURI(nsIURI * uri, nsACString & _retval)
michael@0 95 {
michael@0 96 nsCOMPtr<nsIURI> innerURI = NS_GetInnermostURI(uri);
michael@0 97 if (!innerURI)
michael@0 98 innerURI = uri;
michael@0 99
michael@0 100 nsAutoCString host;
michael@0 101 innerURI->GetAsciiHost(host);
michael@0 102
michael@0 103 if (host.IsEmpty()) {
michael@0 104 return NS_ERROR_MALFORMED_URI;
michael@0 105 }
michael@0 106
michael@0 107 nsresult rv = CanonicalizeHostname(host, _retval);
michael@0 108 NS_ENSURE_SUCCESS(rv, rv);
michael@0 109
michael@0 110 nsAutoCString path;
michael@0 111 rv = innerURI->GetPath(path);
michael@0 112 NS_ENSURE_SUCCESS(rv, rv);
michael@0 113
michael@0 114 // strip out anchors
michael@0 115 int32_t ref = path.FindChar('#');
michael@0 116 if (ref != kNotFound)
michael@0 117 path.SetLength(ref);
michael@0 118
michael@0 119 nsAutoCString temp;
michael@0 120 rv = CanonicalizePath(path, temp);
michael@0 121 NS_ENSURE_SUCCESS(rv, rv);
michael@0 122
michael@0 123 _retval.Append(temp);
michael@0 124
michael@0 125 return NS_OK;
michael@0 126 }
michael@0 127
michael@0 128 /////////////////////////////////////////////////////////////////////////////
michael@0 129 // non-interface methods
michael@0 130
michael@0 131 nsresult
michael@0 132 nsUrlClassifierUtils::CanonicalizeHostname(const nsACString & hostname,
michael@0 133 nsACString & _retval)
michael@0 134 {
michael@0 135 nsAutoCString unescaped;
michael@0 136 if (!NS_UnescapeURL(PromiseFlatCString(hostname).get(),
michael@0 137 PromiseFlatCString(hostname).Length(),
michael@0 138 0, unescaped)) {
michael@0 139 unescaped.Assign(hostname);
michael@0 140 }
michael@0 141
michael@0 142 nsAutoCString cleaned;
michael@0 143 CleanupHostname(unescaped, cleaned);
michael@0 144
michael@0 145 nsAutoCString temp;
michael@0 146 ParseIPAddress(cleaned, temp);
michael@0 147 if (!temp.IsEmpty()) {
michael@0 148 cleaned.Assign(temp);
michael@0 149 }
michael@0 150
michael@0 151 ToLowerCase(cleaned);
michael@0 152 SpecialEncode(cleaned, false, _retval);
michael@0 153
michael@0 154 return NS_OK;
michael@0 155 }
michael@0 156
michael@0 157
michael@0 158 nsresult
michael@0 159 nsUrlClassifierUtils::CanonicalizePath(const nsACString & path,
michael@0 160 nsACString & _retval)
michael@0 161 {
michael@0 162 _retval.Truncate();
michael@0 163
michael@0 164 nsAutoCString decodedPath(path);
michael@0 165 nsAutoCString temp;
michael@0 166 while (NS_UnescapeURL(decodedPath.get(), decodedPath.Length(), 0, temp)) {
michael@0 167 decodedPath.Assign(temp);
michael@0 168 temp.Truncate();
michael@0 169 }
michael@0 170
michael@0 171 SpecialEncode(decodedPath, true, _retval);
michael@0 172 // XXX: lowercase the path?
michael@0 173
michael@0 174 return NS_OK;
michael@0 175 }
michael@0 176
michael@0 177 void
michael@0 178 nsUrlClassifierUtils::CleanupHostname(const nsACString & hostname,
michael@0 179 nsACString & _retval)
michael@0 180 {
michael@0 181 _retval.Truncate();
michael@0 182
michael@0 183 const char* curChar = hostname.BeginReading();
michael@0 184 const char* end = hostname.EndReading();
michael@0 185 char lastChar = '\0';
michael@0 186 while (curChar != end) {
michael@0 187 unsigned char c = static_cast<unsigned char>(*curChar);
michael@0 188 if (c == '.' && (lastChar == '\0' || lastChar == '.')) {
michael@0 189 // skip
michael@0 190 } else {
michael@0 191 _retval.Append(*curChar);
michael@0 192 }
michael@0 193 lastChar = c;
michael@0 194 ++curChar;
michael@0 195 }
michael@0 196
michael@0 197 // cut off trailing dots
michael@0 198 while (_retval.Length() > 0 && _retval[_retval.Length() - 1] == '.') {
michael@0 199 _retval.SetLength(_retval.Length() - 1);
michael@0 200 }
michael@0 201 }
michael@0 202
michael@0 203 void
michael@0 204 nsUrlClassifierUtils::ParseIPAddress(const nsACString & host,
michael@0 205 nsACString & _retval)
michael@0 206 {
michael@0 207 _retval.Truncate();
michael@0 208 nsACString::const_iterator iter, end;
michael@0 209 host.BeginReading(iter);
michael@0 210 host.EndReading(end);
michael@0 211
michael@0 212 if (host.Length() <= 15) {
michael@0 213 // The Windows resolver allows a 4-part dotted decimal IP address to
michael@0 214 // have a space followed by any old rubbish, so long as the total length
michael@0 215 // of the string doesn't get above 15 characters. So, "10.192.95.89 xy"
michael@0 216 // is resolved to 10.192.95.89.
michael@0 217 // If the string length is greater than 15 characters, e.g.
michael@0 218 // "10.192.95.89 xy.wildcard.example.com", it will be resolved through
michael@0 219 // DNS.
michael@0 220
michael@0 221 if (FindCharInReadable(' ', iter, end)) {
michael@0 222 end = iter;
michael@0 223 }
michael@0 224 }
michael@0 225
michael@0 226 for (host.BeginReading(iter); iter != end; iter++) {
michael@0 227 if (!(isxdigit(*iter) || *iter == 'x' || *iter == 'X' || *iter == '.')) {
michael@0 228 // not an IP
michael@0 229 return;
michael@0 230 }
michael@0 231 }
michael@0 232
michael@0 233 host.BeginReading(iter);
michael@0 234 nsTArray<nsCString> parts;
michael@0 235 ParseString(PromiseFlatCString(Substring(iter, end)), '.', parts);
michael@0 236 if (parts.Length() > 4) {
michael@0 237 return;
michael@0 238 }
michael@0 239
michael@0 240 // If any potentially-octal numbers (start with 0 but not hex) have
michael@0 241 // non-octal digits, no part of the ip can be in octal
michael@0 242 // XXX: this came from the old javascript implementation, is it really
michael@0 243 // supposed to be like this?
michael@0 244 bool allowOctal = true;
michael@0 245 uint32_t i;
michael@0 246
michael@0 247 for (i = 0; i < parts.Length(); i++) {
michael@0 248 const nsCString& part = parts[i];
michael@0 249 if (part[0] == '0') {
michael@0 250 for (uint32_t j = 1; j < part.Length(); j++) {
michael@0 251 if (part[j] == 'x') {
michael@0 252 break;
michael@0 253 }
michael@0 254 if (part[j] == '8' || part[j] == '9') {
michael@0 255 allowOctal = false;
michael@0 256 break;
michael@0 257 }
michael@0 258 }
michael@0 259 }
michael@0 260 }
michael@0 261
michael@0 262 for (i = 0; i < parts.Length(); i++) {
michael@0 263 nsAutoCString canonical;
michael@0 264
michael@0 265 if (i == parts.Length() - 1) {
michael@0 266 CanonicalNum(parts[i], 5 - parts.Length(), allowOctal, canonical);
michael@0 267 } else {
michael@0 268 CanonicalNum(parts[i], 1, allowOctal, canonical);
michael@0 269 }
michael@0 270
michael@0 271 if (canonical.IsEmpty()) {
michael@0 272 _retval.Truncate();
michael@0 273 return;
michael@0 274 }
michael@0 275
michael@0 276 if (_retval.IsEmpty()) {
michael@0 277 _retval.Assign(canonical);
michael@0 278 } else {
michael@0 279 _retval.Append('.');
michael@0 280 _retval.Append(canonical);
michael@0 281 }
michael@0 282 }
michael@0 283 return;
michael@0 284 }
michael@0 285
michael@0 286 void
michael@0 287 nsUrlClassifierUtils::CanonicalNum(const nsACString& num,
michael@0 288 uint32_t bytes,
michael@0 289 bool allowOctal,
michael@0 290 nsACString& _retval)
michael@0 291 {
michael@0 292 _retval.Truncate();
michael@0 293
michael@0 294 if (num.Length() < 1) {
michael@0 295 return;
michael@0 296 }
michael@0 297
michael@0 298 uint32_t val;
michael@0 299 if (allowOctal && IsOctal(num)) {
michael@0 300 if (PR_sscanf(PromiseFlatCString(num).get(), "%o", &val) != 1) {
michael@0 301 return;
michael@0 302 }
michael@0 303 } else if (IsDecimal(num)) {
michael@0 304 if (PR_sscanf(PromiseFlatCString(num).get(), "%u", &val) != 1) {
michael@0 305 return;
michael@0 306 }
michael@0 307 } else if (IsHex(num)) {
michael@0 308 if (PR_sscanf(PromiseFlatCString(num).get(), num[1] == 'X' ? "0X%x" : "0x%x",
michael@0 309 &val) != 1) {
michael@0 310 return;
michael@0 311 }
michael@0 312 } else {
michael@0 313 return;
michael@0 314 }
michael@0 315
michael@0 316 while (bytes--) {
michael@0 317 char buf[20];
michael@0 318 PR_snprintf(buf, sizeof(buf), "%u", val & 0xff);
michael@0 319 if (_retval.IsEmpty()) {
michael@0 320 _retval.Assign(buf);
michael@0 321 } else {
michael@0 322 _retval = nsDependentCString(buf) + NS_LITERAL_CSTRING(".") + _retval;
michael@0 323 }
michael@0 324 val >>= 8;
michael@0 325 }
michael@0 326 }
michael@0 327
michael@0 328 // This function will encode all "special" characters in typical url
michael@0 329 // encoding, that is %hh where h is a valid hex digit. It will also fold
michael@0 330 // any duplicated slashes.
michael@0 331 bool
michael@0 332 nsUrlClassifierUtils::SpecialEncode(const nsACString & url,
michael@0 333 bool foldSlashes,
michael@0 334 nsACString & _retval)
michael@0 335 {
michael@0 336 bool changed = false;
michael@0 337 const char* curChar = url.BeginReading();
michael@0 338 const char* end = url.EndReading();
michael@0 339
michael@0 340 unsigned char lastChar = '\0';
michael@0 341 while (curChar != end) {
michael@0 342 unsigned char c = static_cast<unsigned char>(*curChar);
michael@0 343 if (ShouldURLEscape(c)) {
michael@0 344 _retval.Append('%');
michael@0 345 _retval.Append(int_to_hex_digit(c / 16));
michael@0 346 _retval.Append(int_to_hex_digit(c % 16));
michael@0 347
michael@0 348 changed = true;
michael@0 349 } else if (foldSlashes && (c == '/' && lastChar == '/')) {
michael@0 350 // skip
michael@0 351 } else {
michael@0 352 _retval.Append(*curChar);
michael@0 353 }
michael@0 354 lastChar = c;
michael@0 355 curChar++;
michael@0 356 }
michael@0 357 return changed;
michael@0 358 }
michael@0 359
michael@0 360 bool
michael@0 361 nsUrlClassifierUtils::ShouldURLEscape(const unsigned char c) const
michael@0 362 {
michael@0 363 return c <= 32 || c == '%' || c >=127;
michael@0 364 }

mercurial