toolkit/components/url-classifier/nsUrlClassifierUtils.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/toolkit/components/url-classifier/nsUrlClassifierUtils.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,364 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +
     1.8 +#include "nsEscape.h"
     1.9 +#include "nsString.h"
    1.10 +#include "nsIURI.h"
    1.11 +#include "nsNetUtil.h"
    1.12 +#include "nsUrlClassifierUtils.h"
    1.13 +#include "nsTArray.h"
    1.14 +#include "nsReadableUtils.h"
    1.15 +#include "plbase64.h"
    1.16 +#include "prprf.h"
    1.17 +
    1.18 +static char int_to_hex_digit(int32_t i)
    1.19 +{
    1.20 +  NS_ASSERTION((i >= 0) && (i <= 15), "int too big in int_to_hex_digit");
    1.21 +  return static_cast<char>(((i < 10) ? (i + '0') : ((i - 10) + 'A')));
    1.22 +}
    1.23 +
    1.24 +static bool
    1.25 +IsDecimal(const nsACString & num)
    1.26 +{
    1.27 +  for (uint32_t i = 0; i < num.Length(); i++) {
    1.28 +    if (!isdigit(num[i])) {
    1.29 +      return false;
    1.30 +    }
    1.31 +  }
    1.32 +
    1.33 +  return true;
    1.34 +}
    1.35 +
    1.36 +static bool
    1.37 +IsHex(const nsACString & num)
    1.38 +{
    1.39 +  if (num.Length() < 3) {
    1.40 +    return false;
    1.41 +  }
    1.42 +
    1.43 +  if (num[0] != '0' || !(num[1] == 'x' || num[1] == 'X')) {
    1.44 +    return false;
    1.45 +  }
    1.46 +
    1.47 +  for (uint32_t i = 2; i < num.Length(); i++) {
    1.48 +    if (!isxdigit(num[i])) {
    1.49 +      return false;
    1.50 +    }
    1.51 +  }
    1.52 +
    1.53 +  return true;
    1.54 +}
    1.55 +
    1.56 +static bool
    1.57 +IsOctal(const nsACString & num)
    1.58 +{
    1.59 +  if (num.Length() < 2) {
    1.60 +    return false;
    1.61 +  }
    1.62 +
    1.63 +  if (num[0] != '0') {
    1.64 +    return false;
    1.65 +  }
    1.66 +
    1.67 +  for (uint32_t i = 1; i < num.Length(); i++) {
    1.68 +    if (!isdigit(num[i]) || num[i] == '8' || num[i] == '9') {
    1.69 +      return false;
    1.70 +    }
    1.71 +  }
    1.72 +
    1.73 +  return true;
    1.74 +}
    1.75 +
    1.76 +nsUrlClassifierUtils::nsUrlClassifierUtils() : mEscapeCharmap(nullptr)
    1.77 +{
    1.78 +}
    1.79 +
    1.80 +nsresult
    1.81 +nsUrlClassifierUtils::Init()
    1.82 +{
    1.83 +  // Everything but alpha numerics, - and .
    1.84 +  mEscapeCharmap = new Charmap(0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
    1.85 +                               0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
    1.86 +  if (!mEscapeCharmap)
    1.87 +    return NS_ERROR_OUT_OF_MEMORY;
    1.88 +  return NS_OK;
    1.89 +}
    1.90 +
// XPCOM boilerplate: nsUrlClassifierUtils exposes the nsIUrlClassifierUtils
// interface.
NS_IMPL_ISUPPORTS(nsUrlClassifierUtils, nsIUrlClassifierUtils)
    1.92 +
    1.93 +/////////////////////////////////////////////////////////////////////////////
    1.94 +// nsIUrlClassifierUtils
    1.95 +
    1.96 +NS_IMETHODIMP
    1.97 +nsUrlClassifierUtils::GetKeyForURI(nsIURI * uri, nsACString & _retval)
    1.98 +{
    1.99 +  nsCOMPtr<nsIURI> innerURI = NS_GetInnermostURI(uri);
   1.100 +  if (!innerURI)
   1.101 +    innerURI = uri;
   1.102 +
   1.103 +  nsAutoCString host;
   1.104 +  innerURI->GetAsciiHost(host);
   1.105 +
   1.106 +  if (host.IsEmpty()) {
   1.107 +    return NS_ERROR_MALFORMED_URI;
   1.108 +  }
   1.109 +
   1.110 +  nsresult rv = CanonicalizeHostname(host, _retval);
   1.111 +  NS_ENSURE_SUCCESS(rv, rv);
   1.112 +
   1.113 +  nsAutoCString path;
   1.114 +  rv = innerURI->GetPath(path);
   1.115 +  NS_ENSURE_SUCCESS(rv, rv);
   1.116 +
   1.117 +  // strip out anchors
   1.118 +  int32_t ref = path.FindChar('#');
   1.119 +  if (ref != kNotFound)
   1.120 +    path.SetLength(ref);
   1.121 +
   1.122 +  nsAutoCString temp;
   1.123 +  rv = CanonicalizePath(path, temp);
   1.124 +  NS_ENSURE_SUCCESS(rv, rv);
   1.125 +
   1.126 +  _retval.Append(temp);
   1.127 +
   1.128 +  return NS_OK;
   1.129 +}
   1.130 +
   1.131 +/////////////////////////////////////////////////////////////////////////////
   1.132 +// non-interface methods
   1.133 +
   1.134 +nsresult
   1.135 +nsUrlClassifierUtils::CanonicalizeHostname(const nsACString & hostname,
   1.136 +                                           nsACString & _retval)
   1.137 +{
   1.138 +  nsAutoCString unescaped;
   1.139 +  if (!NS_UnescapeURL(PromiseFlatCString(hostname).get(),
   1.140 +                      PromiseFlatCString(hostname).Length(),
   1.141 +                      0, unescaped)) {
   1.142 +    unescaped.Assign(hostname);
   1.143 +  }
   1.144 +
   1.145 +  nsAutoCString cleaned;
   1.146 +  CleanupHostname(unescaped, cleaned);
   1.147 +
   1.148 +  nsAutoCString temp;
   1.149 +  ParseIPAddress(cleaned, temp);
   1.150 +  if (!temp.IsEmpty()) {
   1.151 +    cleaned.Assign(temp);
   1.152 +  }
   1.153 +
   1.154 +  ToLowerCase(cleaned);
   1.155 +  SpecialEncode(cleaned, false, _retval);
   1.156 +
   1.157 +  return NS_OK;
   1.158 +}
   1.159 +
   1.160 +
   1.161 +nsresult
   1.162 +nsUrlClassifierUtils::CanonicalizePath(const nsACString & path,
   1.163 +                                       nsACString & _retval)
   1.164 +{
   1.165 +  _retval.Truncate();
   1.166 +
   1.167 +  nsAutoCString decodedPath(path);
   1.168 +  nsAutoCString temp;
   1.169 +  while (NS_UnescapeURL(decodedPath.get(), decodedPath.Length(), 0, temp)) {
   1.170 +    decodedPath.Assign(temp);
   1.171 +    temp.Truncate();
   1.172 +  }
   1.173 +
   1.174 +  SpecialEncode(decodedPath, true, _retval);
   1.175 +  // XXX: lowercase the path?
   1.176 +
   1.177 +  return NS_OK;
   1.178 +}
   1.179 +
   1.180 +void
   1.181 +nsUrlClassifierUtils::CleanupHostname(const nsACString & hostname,
   1.182 +                                      nsACString & _retval)
   1.183 +{
   1.184 +  _retval.Truncate();
   1.185 +
   1.186 +  const char* curChar = hostname.BeginReading();
   1.187 +  const char* end = hostname.EndReading();
   1.188 +  char lastChar = '\0';
   1.189 +  while (curChar != end) {
   1.190 +    unsigned char c = static_cast<unsigned char>(*curChar);
   1.191 +    if (c == '.' && (lastChar == '\0' || lastChar == '.')) {
   1.192 +      // skip
   1.193 +    } else {
   1.194 +      _retval.Append(*curChar);
   1.195 +    }
   1.196 +    lastChar = c;
   1.197 +    ++curChar;
   1.198 +  }
   1.199 +
   1.200 +  // cut off trailing dots
   1.201 +  while (_retval.Length() > 0 && _retval[_retval.Length() - 1] == '.') {
   1.202 +    _retval.SetLength(_retval.Length() - 1);
   1.203 +  }
   1.204 +}
   1.205 +
   1.206 +void
   1.207 +nsUrlClassifierUtils::ParseIPAddress(const nsACString & host,
   1.208 +                                     nsACString & _retval)
   1.209 +{
   1.210 +  _retval.Truncate();
   1.211 +  nsACString::const_iterator iter, end;
   1.212 +  host.BeginReading(iter);
   1.213 +  host.EndReading(end);
   1.214 +
   1.215 +  if (host.Length() <= 15) {
   1.216 +    // The Windows resolver allows a 4-part dotted decimal IP address to
   1.217 +    // have a space followed by any old rubbish, so long as the total length
   1.218 +    // of the string doesn't get above 15 characters. So, "10.192.95.89 xy"
   1.219 +    // is resolved to 10.192.95.89.
   1.220 +    // If the string length is greater than 15 characters, e.g.
   1.221 +    // "10.192.95.89 xy.wildcard.example.com", it will be resolved through
   1.222 +    // DNS.
   1.223 +
   1.224 +    if (FindCharInReadable(' ', iter, end)) {
   1.225 +      end = iter;
   1.226 +    }
   1.227 +  }
   1.228 +
   1.229 +  for (host.BeginReading(iter); iter != end; iter++) {
   1.230 +    if (!(isxdigit(*iter) || *iter == 'x' || *iter == 'X' || *iter == '.')) {
   1.231 +      // not an IP
   1.232 +      return;
   1.233 +    }
   1.234 +  }
   1.235 +
   1.236 +  host.BeginReading(iter);
   1.237 +  nsTArray<nsCString> parts;
   1.238 +  ParseString(PromiseFlatCString(Substring(iter, end)), '.', parts);
   1.239 +  if (parts.Length() > 4) {
   1.240 +    return;
   1.241 +  }
   1.242 +
   1.243 +  // If any potentially-octal numbers (start with 0 but not hex) have
   1.244 +  // non-octal digits, no part of the ip can be in octal
   1.245 +  // XXX: this came from the old javascript implementation, is it really
   1.246 +  // supposed to be like this?
   1.247 +  bool allowOctal = true;
   1.248 +  uint32_t i;
   1.249 +
   1.250 +  for (i = 0; i < parts.Length(); i++) {
   1.251 +    const nsCString& part = parts[i];
   1.252 +    if (part[0] == '0') {
   1.253 +      for (uint32_t j = 1; j < part.Length(); j++) {
   1.254 +        if (part[j] == 'x') {
   1.255 +          break;
   1.256 +        }
   1.257 +        if (part[j] == '8' || part[j] == '9') {
   1.258 +          allowOctal = false;
   1.259 +          break;
   1.260 +        }
   1.261 +      }
   1.262 +    }
   1.263 +  }
   1.264 +
   1.265 +  for (i = 0; i < parts.Length(); i++) {
   1.266 +    nsAutoCString canonical;
   1.267 +
   1.268 +    if (i == parts.Length() - 1) {
   1.269 +      CanonicalNum(parts[i], 5 - parts.Length(), allowOctal, canonical);
   1.270 +    } else {
   1.271 +      CanonicalNum(parts[i], 1, allowOctal, canonical);
   1.272 +    }
   1.273 +
   1.274 +    if (canonical.IsEmpty()) {
   1.275 +      _retval.Truncate();
   1.276 +      return;
   1.277 +    }
   1.278 +
   1.279 +    if (_retval.IsEmpty()) {
   1.280 +      _retval.Assign(canonical);
   1.281 +    } else {
   1.282 +      _retval.Append('.');
   1.283 +      _retval.Append(canonical);
   1.284 +    }
   1.285 +  }
   1.286 +  return;
   1.287 +}
   1.288 +
   1.289 +void
   1.290 +nsUrlClassifierUtils::CanonicalNum(const nsACString& num,
   1.291 +                                   uint32_t bytes,
   1.292 +                                   bool allowOctal,
   1.293 +                                   nsACString& _retval)
   1.294 +{
   1.295 +  _retval.Truncate();
   1.296 +
   1.297 +  if (num.Length() < 1) {
   1.298 +    return;
   1.299 +  }
   1.300 +
   1.301 +  uint32_t val;
   1.302 +  if (allowOctal && IsOctal(num)) {
   1.303 +    if (PR_sscanf(PromiseFlatCString(num).get(), "%o", &val) != 1) {
   1.304 +      return;
   1.305 +    }
   1.306 +  } else if (IsDecimal(num)) {
   1.307 +    if (PR_sscanf(PromiseFlatCString(num).get(), "%u", &val) != 1) {
   1.308 +      return;
   1.309 +    }
   1.310 +  } else if (IsHex(num)) {
   1.311 +  if (PR_sscanf(PromiseFlatCString(num).get(), num[1] == 'X' ? "0X%x" : "0x%x",
   1.312 +                &val) != 1) {
   1.313 +      return;
   1.314 +    }
   1.315 +  } else {
   1.316 +    return;
   1.317 +  }
   1.318 +
   1.319 +  while (bytes--) {
   1.320 +    char buf[20];
   1.321 +    PR_snprintf(buf, sizeof(buf), "%u", val & 0xff);
   1.322 +    if (_retval.IsEmpty()) {
   1.323 +      _retval.Assign(buf);
   1.324 +    } else {
   1.325 +      _retval = nsDependentCString(buf) + NS_LITERAL_CSTRING(".") + _retval;
   1.326 +    }
   1.327 +    val >>= 8;
   1.328 +  }
   1.329 +}
   1.330 +
   1.331 +// This function will encode all "special" characters in typical url
   1.332 +// encoding, that is %hh where h is a valid hex digit.  It will also fold
   1.333 +// any duplicated slashes.
   1.334 +bool
   1.335 +nsUrlClassifierUtils::SpecialEncode(const nsACString & url,
   1.336 +                                    bool foldSlashes,
   1.337 +                                    nsACString & _retval)
   1.338 +{
   1.339 +  bool changed = false;
   1.340 +  const char* curChar = url.BeginReading();
   1.341 +  const char* end = url.EndReading();
   1.342 +
   1.343 +  unsigned char lastChar = '\0';
   1.344 +  while (curChar != end) {
   1.345 +    unsigned char c = static_cast<unsigned char>(*curChar);
   1.346 +    if (ShouldURLEscape(c)) {
   1.347 +      _retval.Append('%');
   1.348 +      _retval.Append(int_to_hex_digit(c / 16));
   1.349 +      _retval.Append(int_to_hex_digit(c % 16));
   1.350 +
   1.351 +      changed = true;
   1.352 +    } else if (foldSlashes && (c == '/' && lastChar == '/')) {
   1.353 +      // skip
   1.354 +    } else {
   1.355 +      _retval.Append(*curChar);
   1.356 +    }
   1.357 +    lastChar = c;
   1.358 +    curChar++;
   1.359 +  }
   1.360 +  return changed;
   1.361 +}
   1.362 +
   1.363 +bool
   1.364 +nsUrlClassifierUtils::ShouldURLEscape(const unsigned char c) const
   1.365 +{
   1.366 +  return c <= 32 || c == '%' || c >=127;
   1.367 +}

mercurial