testing/tools/proxyserver/proxyserver.py

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/testing/tools/proxyserver/proxyserver.py	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,269 @@
     1.4 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.5 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.7 +
     1.8 +"""
     1.9 +Caching HTTP Proxy for use with the Talos pageload tests
    1.10 +Author: Rob Arnold
    1.11 +
    1.12 +This file implements a multithreaded caching http 1.1 proxy. HEAD and GET
    1.13 +methods are supported; POST is not yet.
    1.14 +   
    1.15 +Each incoming request is put onto a new thread; python does not have a thread
    1.16 +pool library, so a new thread is spawned for each request. I have tried to use
    1.17 +the python 2.4 standard library wherever possible.
    1.18 +
    1.19 +Caching:
    1.20 +The cache is implemented in the Cache class. Items can only be added to the
    1.21 +cache. The only way to remove items from the cache is to blow it all away,
    1.22 +either by deleting the file (default: proxy_cache.db) or passing the -c or
    1.23 +--clear-cache flags on the command line. It is technically possible to remove
    1.24 +items individually from the cache, but there has been no need to do so so far.
    1.25 +
    1.26 +The cache is implemented with the shelve module. The key is the combination of
    1.27 +host, port and request (path + params + fragment) and the values stored are the
    1.28 +http status code, headers and content that were received from the remote server.
    1.29 +
    1.30 +Access to the cache is guarded by a semaphore which allows concurrent read
    1.31 +access. The semaphore is guarded by a simple mutex which prevents a deadlock
    1.32 +from occurring when two threads try to add an item to the cache at the same time.
    1.33 +
    1.34 +Memory usage is kept to a minimum by the shelve module; only items in the cache
    1.35 +that are currently being served stay in memory.
    1.36 +
    1.37 +Proxy:
    1.38 +The BaseHTTPServer.BaseHTTPRequestHandler takes care of parsing incoming
    1.39 +requests and managing the socket connection. See the documentation of the
    1.40 +BaseHTTPServer module for more information. When do_HEAD or do_GET is called,
    1.41 +the url that we are supposed to fetch is in self.path.
    1.42 +
    1.43 +TODO:
    1.44 +* Implement POST requests. This requires implementing the do_POST method and
    1.45 +  passing the post data along.
    1.46 +* Implement different cache policies
    1.47 +* Add an interface to allow administrators to probe the cache and remove
    1.48 +  items from the database and such.
    1.49 +"""
    1.50 +
    1.51 +__version__ = "0.1"
    1.52 +
    1.53 +import os
    1.54 +import sys
    1.55 +import time
    1.56 +import threading
    1.57 +import shelve
    1.58 +from optparse import OptionParser, OptionValueError
    1.59 +
    1.60 +import SocketServer
    1.61 +import BaseHTTPServer
    1.62 +import socket
    1.63 +import httplib
    1.64 +from urlparse import urlsplit, urlunsplit
    1.65 +
    1.66 +class HTTPRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    1.67 +  server_version = "TalosProxy/" + __version__
    1.68 +  protocol_version = "HTTP/1.1"
    1.69 +
    1.70 +  def do_GET(self):
    1.71 +    content = self.send_head()
    1.72 +    if content:
    1.73 +      try:
    1.74 +        self.wfile.write(content)
    1.75 +      except socket.error, e:
    1.76 +        if options.verbose:
    1.77 +          print "Got socket error %s" % e
    1.78 +    #self.close_connection = 1
    1.79 +  def do_HEAD(self):
    1.80 +    self.send_head()
    1.81 +
    1.82 +  def getHeaders(self):
    1.83 +    h = {}
    1.84 +    for name in self.headers.keys():
    1.85 +      h[name] = self.headers[name]
    1.86 +
    1.87 +    return h
    1.88 +
    1.89 +  def send_head(self, method="GET"): 
    1.90 +    o = urlsplit(self.path)
    1.91 +
    1.92 +    #sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    1.93 +
    1.94 +    headers = self.getHeaders()
    1.95 +    for k in "Proxy-Connection", "Connection":
    1.96 +      if k in headers:
    1.97 +        headers[k] = "Close"
    1.98 +    if "Keep-Alive" in headers:
    1.99 +      del headers["Keep-Alive"]
   1.100 +
   1.101 +    reqstring = urlunsplit(('','',o.path, o.query, o.fragment))
   1.102 +
   1.103 +    if options.no_cache:
   1.104 +      cache_result = None
   1.105 +    else:
   1.106 +      cache_result = cache.get(o.hostname, o.port, reqstring)
   1.107 +
   1.108 +    if not cache_result:
   1.109 +      if options.localonly:
   1.110 +        self.send_error(404, "Object not in cache")
   1.111 +        return None
   1.112 +      else:
   1.113 +        if options.verbose:
   1.114 +          print "Object %s was not in the cache" % self.path
   1.115 +        conn = httplib.HTTPConnection(o.netloc)
   1.116 +        conn.request("GET", reqstring, headers=headers)
   1.117 +        res = conn.getresponse()
   1.118 +
   1.119 +        content = res.read()
   1.120 +        conn.close()
   1.121 +
   1.122 +        status, headers = res.status, res.getheaders()
   1.123 +
   1.124 +        if not options.no_cache:
   1.125 +          cache.add(o.hostname, o.port, reqstring, status, headers, content)
   1.126 +    else:
   1.127 +      status, headers, content = cache_result
   1.128 +
   1.129 +    try:
   1.130 +      self.send_response(status)
   1.131 +      for name, value in headers:
   1.132 +        # kill the transfer-encoding header because we don't support it when
   1.133 +        # we send data to the client
   1.134 +        if name not in ('transfer-encoding',):
   1.135 +          self.send_header(name, value)
   1.136 +      if "Content-Length" not in headers:
   1.137 +        self.send_header("Content-Length", str(len(content)))
   1.138 +      self.end_headers()
   1.139 +    except socket.error, e:
   1.140 +      if options.verbose:
   1.141 +        print "Got socket error %s" % e
   1.142 +      return None
   1.143 +    return content
   1.144 +  def log_message(self, format, *args):
   1.145 +    if options.verbose:
   1.146 +      BaseHTTPServer.BaseHTTPRequestHandler.log_message(self, format, *args)
   1.147 +
   1.148 +class HTTPServer(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
   1.149 +  def __init__(self, address, handler):
   1.150 +    BaseHTTPServer.HTTPServer.__init__(self, address, handler)
   1.151 +
   1.152 +class Cache(object):
   1.153 +  """Multithreaded cache uses the shelve module to store pages"""
   1.154 +  # 20 concurrent threads ought to be enough for one browser
   1.155 +  max_concurrency = 20
   1.156 +  def __init__(self, name='', max_concurrency=20):
   1.157 +    name = name or options.cache or "proxy_cache.db"
   1.158 +    self.name = name
   1.159 +    self.max_concurrency = max_concurrency
   1.160 +    self.entries = {}
   1.161 +    self.sem = threading.Semaphore(self.max_concurrency)
   1.162 +    self.semlock = threading.Lock()
   1.163 +    if options.clear_cache:
   1.164 +      flag = 'n'
   1.165 +    else:
   1.166 +      flag = 'c'
   1.167 +    self.db = shelve.DbfilenameShelf(name, flag)
   1.168 +
   1.169 +  def __del__(self):
   1.170 +    if hasattr(self, 'db'):
   1.171 +      self.db.close()
   1.172 +
   1.173 +  def get_key(self, host, port, resource):
   1.174 +    return '%s:%s/%s' % (host, port, resource)
   1.175 +
   1.176 +  def get(self, host, port, resource):
   1.177 +    key = self.get_key(host, port, resource)
   1.178 +    self.semlock.acquire()
   1.179 +    self.sem.acquire()
   1.180 +    self.semlock.release()
   1.181 +    try:
   1.182 +      if not self.db.has_key(key):
   1.183 +        return None
   1.184 +      # returns status, headers, content
   1.185 +      return self.db[key]
   1.186 +    finally:
   1.187 +      self.sem.release()
   1.188 +  def add(self, host, port, resource, status, headers, content):
   1.189 +    key = self.get_key(host, port, resource)
   1.190 +    self.semlock.acquire()
   1.191 +    for i in range(self.max_concurrency):
   1.192 +      self.sem.acquire()
   1.193 +    self.semlock.release()
   1.194 +    try:
   1.195 +      self.db[key] = (status, headers, content)
   1.196 +      self.db.sync()
   1.197 +    finally:
   1.198 +      for i in range(self.max_concurrency):
   1.199 +        self.sem.release()
   1.200 +
   1.201 +class Options(object):
   1.202 +  port = 8000
   1.203 +  localonly = False
   1.204 +  clear_cache = False
   1.205 +  no_cache = False
   1.206 +  cache = 'proxy_cache.db'
   1.207 +  verbose = False
   1.208 +
   1.209 +def _parseOptions():
   1.210 +  def port_callback(option, opt, value, parser):
   1.211 +    if value > 0 and value < (2 ** 16 - 1):
   1.212 +      setattr(parser.values, option.dest, value)
   1.213 +    else:
   1.214 +      raise OptionValueError("Port number is out of range")
   1.215 +
   1.216 +  global options
   1.217 +  parser = OptionParser(version="Talos Proxy " + __version__)
   1.218 +  parser.add_option("-p", "--port", dest="port",
   1.219 +    help="The port to run the proxy server on", metavar="PORT", type="int",
   1.220 +    action="callback", callback=port_callback)
   1.221 +  parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
   1.222 +    help="Include additional debugging information")
   1.223 +  parser.add_option("-l", "--localonly", action="store_true", dest="localonly",
   1.224 +    help="Only serve pages from the local database")
   1.225 +  parser.add_option("-c", "--clear", action="store_true", dest="clear_cache",
   1.226 +    help="Clear the cache on startup")
   1.227 +  parser.add_option("-n", "--no-cache", action="store_true", dest="no_cache",
   1.228 +    help="Do not use a cache")
   1.229 +  parser.add_option("-u", "--use-cache", dest="cache",
   1.230 +    help="The filename of the cache to use", metavar="NAME.db")
   1.231 +  parser.set_defaults(verbose=Options.verbose,
   1.232 +                      port=Options.port,
   1.233 +                      localonly=Options.localonly,
   1.234 +                      clear_cache=Options.clear_cache,
   1.235 +                      no_cache=Options.no_cache,
   1.236 +                      cache=Options.cache)
   1.237 +  options, args = parser.parse_args()
   1.238 +
   1.239 +"""Configures the proxy server. This should be called before run_proxy. It can be
   1.240 +called afterwards, but note that it is not threadsafe and some options (namely
   1.241 +port) will not take effect"""
   1.242 +def configure_proxy(**kwargs):
   1.243 +  global options
   1.244 +  options = Options()
   1.245 +  for key in kwargs:
   1.246 +    setattr(options, key, kwargs[key])
   1.247 +
   1.248 +def _run():
   1.249 +  global cache
   1.250 +  cache = Cache()
   1.251 +  server_address = ('', options.port)
   1.252 +  httpd = HTTPServer(server_address, HTTPRequestHandler)
   1.253 +  httpd.serve_forever()
   1.254 +
   1.255 +"""Starts the proxy; it runs on a separate daemon thread"""
   1.256 +def run_proxy():
   1.257 +  thr = threading.Thread(target=_run)
   1.258 +  # now when we die, the daemon thread will die too
   1.259 +  thr.setDaemon(1)
   1.260 +  thr.start()
   1.261 +
   1.262 +if __name__ == '__main__':
   1.263 +  _parseOptions()
   1.264 +  try:
   1.265 +    run_proxy()
   1.266 +    # thr.join() doesn't terminate on keyboard interrupt
   1.267 +    while 1: time.sleep(1)
   1.268 +  except KeyboardInterrupt:
   1.269 +    if options.verbose:
   1.270 +      print "Quittin' time..."
   1.271 +
   1.272 +__all__ = ['run_proxy', 'configure_proxy']

mercurial