#!/usr/bin/env python

#tooltool is a lookaside cache implemented in Python
#Copyright (C) 2011 John H. Ford
#
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation version 2
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# A manifest file specifies files in its directory that are stored
# elsewhere. It should only list files in the directory in which the
# manifest file resides, and it should be called 'manifest.manifest'
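#
# For illustration only, a manifest written by this tool is a JSON list of
# objects like the following (the filename, size and digest are made up):
#
# [
# {
# "algorithm": "sha512",
# "digest": "0123456789abcdef...<a 128-character hex digest>...",
# "filename": "setup.exe",
# "size": 1024
# }
# ]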
__version__ = '1'

import json
import os
import optparse
import logging
import hashlib
import urllib2
import ConfigParser

log = logging.getLogger(__name__)

class FileRecordJSONEncoderException(Exception): pass
class InvalidManifest(Exception): pass
class ExceptionWithFilename(Exception):
    def __init__(self, filename):
        Exception.__init__(self)
        self.filename = filename

class DigestMismatchException(ExceptionWithFilename): pass
class MissingFileException(ExceptionWithFilename): pass

class FileRecord(object):
    def __init__(self, filename, size, digest, algorithm):
        object.__init__(self)
        self.filename = filename
        self.size = size
        self.digest = digest
        self.algorithm = algorithm
        log.debug("creating %s 0x%x" % (self.__class__.__name__, id(self)))

    def __eq__(self, other):
        if self is other:
            return True
        if self.filename == other.filename and \
           self.size == other.size and \
           self.digest == other.digest and \
           self.algorithm == other.algorithm:
            return True
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return repr(self)

    def __repr__(self):
        return "%s.%s(filename='%s', size='%s', digest='%s', algorithm='%s')" % (
            __name__, self.__class__.__name__,
            self.filename, self.size, self.digest, self.algorithm)

    def present(self):
        # Doesn't check validity
        return os.path.exists(self.filename)

    def validate_size(self):
        if self.present():
            return self.size == os.path.getsize(self.filename)
        else:
            log.debug("trying to validate size on a missing file, %s", self.filename)
            raise MissingFileException(filename=self.filename)

    def validate_digest(self):
        if self.present():
            with open(self.filename, 'rb') as f:
                return self.digest == digest_file(f, self.algorithm)
        else:
            log.debug("trying to validate digest on a missing file, %s", self.filename)
            raise MissingFileException(filename=self.filename)

    def validate(self):
        if self.validate_size():
            if self.validate_digest():
                return True
        return False

    def describe(self):
        if self.present() and self.validate():
            return "'%s' is present and valid" % self.filename
        elif self.present():
            return "'%s' is present and invalid" % self.filename
        else:
            return "'%s' is absent" % self.filename


def create_file_record(filename, algorithm):
    fo = open(filename, 'rb')
    stored_filename = os.path.split(filename)[1]
    fr = FileRecord(stored_filename, os.path.getsize(filename),
                    digest_file(fo, algorithm), algorithm)
    fo.close()
    return fr


class FileRecordJSONEncoder(json.JSONEncoder):
    def encode_file_record(self, obj):
        if not issubclass(type(obj), FileRecord):
            err = "FileRecordJSONEncoder is only for FileRecord and lists of FileRecords, not %s" % obj.__class__.__name__
            log.warn(err)
            raise FileRecordJSONEncoderException(err)
        else:
            return {'filename': obj.filename, 'size': obj.size,
                    'algorithm': obj.algorithm, 'digest': obj.digest}

    def default(self, f):
        if issubclass(type(f), list):
            record_list = []
            for i in f:
                record_list.append(self.encode_file_record(i))
            return record_list
        else:
            return self.encode_file_record(f)


class FileRecordJSONDecoder(json.JSONDecoder):
    """I help the json module materialize a FileRecord from
    a JSON file. I understand FileRecords and lists of
    FileRecords. I ignore things that I don't expect for now"""
    # TODO: make this more explicit in what it's looking for
    # and error out on unexpected things
    def process_file_records(self, obj):
        if isinstance(obj, list):
            record_list = []
            for i in obj:
                record = self.process_file_records(i)
                if issubclass(type(record), FileRecord):
                    record_list.append(record)
            return record_list
        if isinstance(obj, dict) and \
           len(obj.keys()) == 4 and \
           obj.has_key('filename') and \
           obj.has_key('size') and \
           obj.has_key('algorithm') and \
           obj.has_key('digest'):
            rv = FileRecord(obj['filename'], obj['size'], obj['digest'], obj['algorithm'])
            log.debug("materialized %s" % rv)
            return rv
        return obj

    def decode(self, s):
        decoded = json.JSONDecoder.decode(self, s)
        rv = self.process_file_records(decoded)
        return rv

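# Illustrative round trip through the two JSON helpers above; the values in the
# FileRecord are made up and this is not part of the normal control flow:
#
#   records = [FileRecord('setup.exe', 1024, '0123abcd...', 'sha512')]
#   as_json = json.dumps(records, cls=FileRecordJSONEncoder)
#   same_records = json.loads(as_json, cls=FileRecordJSONDecoder)
#
# FileRecordJSONDecoder only materializes dicts that carry exactly the four
# expected keys; anything else is passed through unchanged.
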
class Manifest(object):

    valid_formats = ('json',)

    def __init__(self, file_records=None):
        # Note: a mutable default argument ([]) would be shared between
        # Manifest instances, so build a fresh list when none is given
        self.file_records = file_records if file_records is not None else []

    def __eq__(self, other):
        if self is other:
            return True
        if len(self.file_records) != len(other.file_records):
            log.debug('Manifests differ in number of files')
            return False
        #TODO: Lists in a different order should be equal
        for record in range(0, len(self.file_records)):
            if self.file_records[record] != other.file_records[record]:
                log.debug('FileRecords differ, %s vs %s' % (self.file_records[record],
                                                            other.file_records[record]))
                return False
        return True

    def __deepcopy__(self, memo):
        # This is required for a deep copy
        return Manifest(self.file_records[:])

    def __copy__(self):
        return Manifest(self.file_records)

    def copy(self):
        return Manifest(self.file_records[:])

    def present(self):
        return all(i.present() for i in self.file_records)

    def validate_sizes(self):
        return all(i.validate_size() for i in self.file_records)

    def validate_digests(self):
        return all(i.validate_digest() for i in self.file_records)

    def validate(self):
        return all(i.validate() for i in self.file_records)

    def sort(self):
        #TODO: WRITE TESTS
        self.file_records.sort(key=lambda x: x.size)

    def load(self, data_file, fmt='json'):
        assert fmt in self.valid_formats
        if fmt == 'json':
            try:
                self.file_records.extend(json.load(data_file, cls=FileRecordJSONDecoder))
                self.sort()
            except ValueError:
                raise InvalidManifest("trying to read invalid manifest file")

    def loads(self, data_string, fmt='json'):
        assert fmt in self.valid_formats
        if fmt == 'json':
            try:
                self.file_records.extend(json.loads(data_string, cls=FileRecordJSONDecoder))
                self.sort()
            except ValueError:
                raise InvalidManifest("trying to read invalid manifest file")

    def dump(self, output_file, fmt='json'):
        assert fmt in self.valid_formats
        self.sort()
        if fmt == 'json':
            rv = json.dump(self.file_records, output_file, indent=0, cls=FileRecordJSONEncoder)
            print >> output_file, ''
            return rv

    def dumps(self, fmt='json'):
        assert fmt in self.valid_formats
        self.sort()
        if fmt == 'json':
            return json.dumps(self.file_records, cls=FileRecordJSONEncoder)

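# A minimal sketch of using Manifest programmatically (the manifest path is an
# example only); load() raises InvalidManifest on malformed JSON and validate()
# raises MissingFileException for files that are listed but absent:
#
#   manifest = Manifest()
#   with open('manifest.tt') as f:
#       manifest.load(f)
#   ok = manifest.present() and manifest.validate()
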
def digest_file(f, a):
    """I take a file like object 'f' and return a hex string containing
    the result of the algorithm 'a' applied to 'f'."""
    h = hashlib.new(a)
    chunk_size = 1024*10
    data = f.read(chunk_size)
    while data:
        h.update(data)
        data = f.read(chunk_size)
    if hasattr(f, 'name'):
        log.debug('hashed %s with %s to be %s', f.name, a, h.hexdigest())
    else:
        log.debug('hashed a file with %s to be %s', a, h.hexdigest())
    return h.hexdigest()
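# For example (the file name is made up), digest_file gives the same answer as
# hashing the whole file at once with hashlib; it just reads in 10kB chunks so
# large files never have to fit in memory:
#
#   with open('setup.exe', 'rb') as f:
#       assert digest_file(f, 'sha512') == \
#           hashlib.sha512(open('setup.exe', 'rb').read()).hexdigest()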
# TODO: write tests for this function
def open_manifest(manifest_file):
    """I know how to take a filename and load it into a Manifest object"""
    if os.path.exists(manifest_file):
        manifest = Manifest()
        with open(manifest_file) as f:
            manifest.load(f)
            log.debug("loaded manifest from file '%s'" % manifest_file)
        return manifest
    else:
        log.debug("tried to load absent file '%s' as manifest" % manifest_file)
        raise InvalidManifest("manifest file '%s' does not exist" % manifest_file)

# TODO: write tests for this function
def list_manifest(manifest_file):
    """I know how to print all the files in a manifest"""
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    for f in manifest.file_records:
        print "%s\t%s\t%s" % ("P" if f.present() else "-",
                              "V" if f.present() and f.validate() else "-",
                              f.filename)
    return True

def validate_manifest(manifest_file):
    """I validate that all files in a manifest are present and valid but
    don't fetch or delete them if they aren't"""
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    invalid_files = []
    absent_files = []
    for f in manifest.file_records:
        if not f.present():
            absent_files.append(f)
        else:
            if not f.validate():
                invalid_files.append(f)
    if len(invalid_files + absent_files) == 0:
        return True
    else:
        return False

# TODO: write tests for this function
def add_files(manifest_file, algorithm, filenames):
    # Returns True if all files were successfully added, False if not,
    # and doesn't catch library exceptions. If any files are already
    # tracked in the manifest, the return value will be False because
    # they weren't added
    all_files_added = True
    # Create an old_manifest object to add to
    if os.path.exists(manifest_file):
        old_manifest = open_manifest(manifest_file)
    else:
        old_manifest = Manifest()
        log.debug("creating a new manifest file")
    new_manifest = Manifest() # use a different manifest for the output
    for filename in filenames:
        log.debug("adding %s" % filename)
        path, name = os.path.split(filename)
        new_fr = create_file_record(filename, algorithm)
        log.debug("appending a new file record to manifest file")
        add = True
        for fr in old_manifest.file_records:
            log.debug("manifest file has '%s'" % "', ".join([x.filename for x in old_manifest.file_records]))
            if new_fr == fr and new_fr.validate():
                # TODO: Decide if this case should really cause a False return
                log.info("file already in old_manifest file and matches")
                add = False
            elif new_fr == fr and not new_fr.validate():
                log.error("file already in old_manifest file but is invalid")
                add = False
            if filename == fr.filename:
                log.error("manifest already contains file named %s" % filename)
                add = False
        if add:
            new_manifest.file_records.append(new_fr)
            log.debug("added '%s' to manifest" % filename)
        else:
            all_files_added = False
    with open(manifest_file, 'wb') as output:
        new_manifest.dump(output, fmt='json')
    return all_files_added

# TODO: write tests for this function
def fetch_file(base_url, file_record, overwrite=False, grabchunk=1024*4):
    # A file which is requested to be fetched that exists locally will be hashed.
    # If the hash matches the requested file's hash, nothing will be done and the
    # function will return. If the function is told to overwrite and there is a
    # digest mismatch, the existing file will be overwritten
    if file_record.present():
        if file_record.validate():
            log.info("existing '%s' is valid, not fetching" % file_record.filename)
            return True
        if overwrite:
            log.info("overwriting '%s' as requested" % file_record.filename)
        else:
            # All of the following is for a useful error message
            with open(file_record.filename, 'rb') as f:
                d = digest_file(f, file_record.algorithm)
            log.error("digest mismatch between manifest(%s...) and local file(%s...)" % \
                      (file_record.digest[:8], d[:8]))
            log.debug("full digests: manifest (%s) local file (%s)" % (file_record.digest, d))
            # Let's bail!
            return False

    # Generate the URL for the file on the server side
    url = "%s/%s/%s" % (base_url, file_record.algorithm, file_record.digest)

    log.debug("fetching from '%s'" % url)

    # TODO: This should be abstracted to make generic retrieval protocol handling easy
    # Well, the file doesn't exist locally. Let's fetch it.
    try:
        f = urllib2.urlopen(url)
        log.debug("opened %s for reading" % url)
        with open(file_record.filename, 'wb') as out:
            k = True
            size = 0
            while k:
                # TODO: print statistics as file transfers happen both for info and to stop
                # buildbot timeouts
                indata = f.read(grabchunk)
                out.write(indata)
                size += len(indata)
                if indata == '':
                    k = False
            if size != file_record.size:
                log.error("transfer from %s to %s failed due to a difference of %d bytes" % (
                          url, file_record.filename, file_record.size - size))
                return False
            log.info("fetched %s" % file_record.filename)
    except (urllib2.URLError, urllib2.HTTPError) as e:
        log.error("failed to fetch '%s': %s" % (file_record.filename, e),
                  exc_info=True)
        return False
    except IOError:
        log.error("failed to write to '%s'" % file_record.filename,
                  exc_info=True)
        return False
    return True


# TODO: write tests for this function
def fetch_files(manifest_file, base_url, overwrite, filenames=[]):
    # Let's load the manifest file
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    # We want to track files that fail to be fetched as well as
    # files that are fetched
    failed_files = []

    # Let's go through the manifest and fetch the files that we want
    fetched_files = []
    for f in manifest.file_records:
        if f.filename in filenames or len(filenames) == 0:
            log.debug("fetching %s" % f.filename)
            if fetch_file(base_url, f, overwrite):
                fetched_files.append(f)
            else:
                failed_files.append(f.filename)
        else:
            log.debug("skipping %s" % f.filename)

    # Even if we get the file, let's ensure that it matches what the
    # manifest specified
    for localfile in fetched_files:
        if not localfile.validate():
            log.error("'%s'" % localfile.describe())

    # If we failed to fetch or validate a file, we need to fail
    if len(failed_files) > 0:
        log.error("The following files failed: '%s'" % "', ".join(failed_files))
        return False
    return True


# TODO: write tests for this function
def process_command(options, args):
    """I know how to take a list of program arguments and
    start doing the right thing with them"""
    cmd = args[0]
    cmd_args = args[1:]
    log.debug("processing '%s' command with args '%s'" % (cmd, '", "'.join(cmd_args)))
    log.debug("using options: %s" % options)
    if cmd == 'list':
        return list_manifest(options['manifest'])
    if cmd == 'validate':
        return validate_manifest(options['manifest'])
    elif cmd == 'add':
        return add_files(options['manifest'], options['algorithm'], cmd_args)
    elif cmd == 'fetch':
        if not options.has_key('base_url') or options.get('base_url') is None:
            log.critical('fetch command requires url option')
            return False
        return fetch_files(options['manifest'], options['base_url'], options['overwrite'], cmd_args)
    else:
        log.critical('command "%s" is not implemented' % cmd)
        return False

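# Illustrative command lines, assuming this script is saved as tooltool.py
# (the file name and URL below are examples only):
#
#   python tooltool.py -m manifest.tt add setup.exe
#   python tooltool.py -m manifest.tt list
#   python tooltool.py -m manifest.tt validate
#   python tooltool.py -m manifest.tt --url http://tooltool.example.com fetch
#   python tooltool.py -m manifest.tt --url http://tooltool.example.com fetch setup.exe
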
# fetching api:
#   http://hostname/algorithm/hash
#   example: http://people.mozilla.org/sha1/1234567890abcedf
# This will make it possible to have the server allow clients to
# use different algorithms than what was uploaded to the server

# TODO: Implement the following features:
#   -optimization: do small files first, justification is that they are faster
#    and cause a faster failure if they are invalid
#   -store permissions
#   -local renames i.e. call the file one thing on the server and
#    something different locally
#   -deal with the cases:
#     -local data matches file requested with different filename
#     -two different files with same name, different hash
#     -?only ever locally to digest as filename, symlink to real name
#   -?maybe deal with files as a dir of the filename with all files in that dir as the versions of that file
#    - e.g. ./python-2.6.7.dmg/0123456789abcdef and ./python-2.6.7.dmg/abcdef0123456789

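# main() below also reads config files at /etc/tooltool, ~/.tooltool and
# ./.tooltool unless --ignore-config-files is given; 'base_url' and 'algorithm'
# are looked up in a [general] section. A minimal example (the URL is made up):
#
#   [general]
#   base_url = http://tooltool.example.com
#   algorithm = sha512
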
def main():
    # Set up logging, for now just to the console
    ch = logging.StreamHandler()
    cf = logging.Formatter("%(levelname)s - %(message)s")
    ch.setFormatter(cf)

    # Set up option parsing
    parser = optparse.OptionParser()
    # I wish there was a way to say "only allow args to be
    # sequential and at the end of the argv.
    # OH! i could step through sys.argv and check for things starting without -/-- before things starting with them
    parser.add_option('-q', '--quiet', default=False,
                      dest='quiet', action='store_true')
    parser.add_option('-v', '--verbose', default=False,
                      dest='verbose', action='store_true')
    parser.add_option('-m', '--manifest', default='manifest.tt',
                      dest='manifest', action='store',
                      help='specify the manifest file to be operated on')
    parser.add_option('-d', '--algorithm', default='sha512',
                      dest='algorithm', action='store',
                      help='openssl hashing algorithm to use')
    parser.add_option('-o', '--overwrite', default=False,
                      dest='overwrite', action='store_true',
                      help='if fetching, remote copy will overwrite a local copy that is different.')
    parser.add_option('--url', dest='base_url', action='store',
                      help='base url for fetching files')
    parser.add_option('--ignore-config-files', action='store_true', default=False,
                      dest='ignore_cfg_files')
    (options_obj, args) = parser.parse_args()
    # Dictionaries are easier to work with
    options = vars(options_obj)

    # Use some of the option parser to figure out application
    # log level
    if options.get('verbose'):
        ch.setLevel(logging.DEBUG)
    elif options.get('quiet'):
        ch.setLevel(logging.ERROR)
    else:
        ch.setLevel(logging.INFO)
    log.addHandler(ch)

    cfg_file = ConfigParser.SafeConfigParser()
    if not options.get("ignore_cfg_files"):
        read_files = cfg_file.read(['/etc/tooltool', os.path.expanduser('~/.tooltool'),
                                    os.path.join(os.getcwd(), '.tooltool')])
        log.debug("read in the config files '%s'" % '", '.join(read_files))
    else:
        log.debug("skipping config files")

    for option in ('base_url', 'algorithm'):
        if not options.get(option):
            try:
                options[option] = cfg_file.get('general', option)
                log.debug("read '%s' as '%s' from cfg_file" % (option, options[option]))
            except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as e:
                log.debug("%s in config file" % e, exc_info=True)

    if not options.has_key('manifest'):
        parser.error("no manifest file specified")

    if len(args) < 1:
        parser.error('You must specify a command')
    exit(0 if process_command(options, args) else 1)

if __name__ == "__main__":
    main()
else:
    log.addHandler(logging.NullHandler())
    #log.addHandler(logging.StreamHandler())