|
1 #!/usr/bin/env python |
|
2 |
|
3 #tooltool is a lookaside cache implemented in Python |
|
4 #Copyright (C) 2011 John H. Ford <john@johnford.info> |
|
5 # |
|
6 #This program is free software; you can redistribute it and/or |
|
7 #modify it under the terms of the GNU General Public License |
|
8 #as published by the Free Software Foundation version 2 |
|
9 # |
|
10 #This program is distributed in the hope that it will be useful, |
|
11 #but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 #GNU General Public License for more details. |
|
14 # |
|
15 #You should have received a copy of the GNU General Public License |
|
16 #along with this program; if not, write to the Free Software |
|
17 #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
|
18 |
|
# A manifest file specifies files in that directory that are stored
# elsewhere. This file should only contain files in the directory
# which the manifest file resides in and it should be called 'manifest.manifest'
|
22 |
|
23 __version__ = '1' |
|
24 |
|
25 import json |
|
26 import os |
|
27 import optparse |
|
28 import logging |
|
29 import hashlib |
|
30 import urllib2 |
|
31 import ConfigParser |
|
32 |
|
33 log = logging.getLogger(__name__) |
|
34 |
|
class FileRecordJSONEncoderException(Exception):
    """Raised when the JSON encoder is handed something that is not a
    FileRecord (or a list of FileRecords)."""
    pass


class InvalidManifest(Exception):
    """Raised when a manifest file is absent or cannot be parsed."""
    pass


class ExceptionWithFilename(Exception):
    """Base class for exceptions that pertain to one specific file."""

    def __init__(self, filename):
        Exception.__init__(self)
        # Remember which file triggered the error so callers can report it.
        self.filename = filename


class DigestMismatchException(ExceptionWithFilename):
    """Raised when a file's digest does not match its manifest record."""
    pass


class MissingFileException(ExceptionWithFilename):
    """Raised when an operation needs a file that is locally absent."""
    pass
|
44 |
|
class FileRecord(object):
    """One file tracked by a manifest: its basename, its size in bytes, and
    a digest computed with a named hashlib algorithm."""

    def __init__(self, filename, size, digest, algorithm):
        object.__init__(self)
        self.filename = filename
        self.size = size
        self.digest = digest
        self.algorithm = algorithm
        log.debug("creating %s 0x%x", self.__class__.__name__, id(self))

    def __eq__(self, other):
        """Two records are equal when all four fields match."""
        if self is other:
            return True
        return (self.filename == other.filename and
                self.size == other.size and
                self.digest == other.digest and
                self.algorithm == other.algorithm)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return repr(self)

    def __repr__(self):
        return "%s.%s(filename='%s', size='%s', digest='%s', algorithm='%s')" % (
            __name__, self.__class__.__name__,
            self.filename, self.size, self.digest, self.algorithm)

    def present(self):
        """Return True if the file exists locally.  Doesn't check validity."""
        return os.path.exists(self.filename)

    def validate_size(self):
        """Return True if the local file's size matches the record.

        Raises MissingFileException if the file is absent."""
        if self.present():
            return self.size == os.path.getsize(self.filename)
        log.debug("trying to validate size on a missing file, %s", self.filename)
        raise MissingFileException(filename=self.filename)

    def validate_digest(self):
        """Return True if the local file's digest matches the record.

        Raises MissingFileException if the file is absent."""
        if self.present():
            with open(self.filename, 'rb') as f:
                return self.digest == digest_file(f, self.algorithm)
        # BUG FIX: the filename argument used to be inside the format string
        # ("...%s', self.filename"), so it was never interpolated.
        log.debug("trying to validate digest on a missing file, %s", self.filename)
        raise MissingFileException(filename=self.filename)

    def validate(self):
        """Return True only if both the size and the digest are valid."""
        return self.validate_size() and self.validate_digest()

    def describe(self):
        """Return a human-readable description of the file's local state."""
        if self.present() and self.validate():
            return "'%s' is present and valid" % self.filename
        elif self.present():
            return "'%s' is present and invalid" % self.filename
        else:
            return "'%s' is absent" % self.filename
|
108 |
|
109 |
|
def create_file_record(filename, algorithm):
    """Hash 'filename' with 'algorithm' and return a FileRecord for it.

    Only the basename is stored, because manifests describe files that live
    in the same directory as the manifest itself."""
    stored_filename = os.path.split(filename)[1]
    # BUG FIX: the file handle used to be opened and closed manually, so it
    # leaked if digest_file() raised; a context manager always closes it.
    with open(filename, 'rb') as fo:
        return FileRecord(stored_filename, os.path.getsize(filename),
                          digest_file(fo, algorithm), algorithm)
|
116 |
|
117 |
|
class FileRecordJSONEncoder(json.JSONEncoder):
    """JSON encoder that understands FileRecord objects and lists of them."""

    def encode_file_record(self, obj):
        """Return a plain four-key dict for one FileRecord.

        Raises FileRecordJSONEncoderException for anything else."""
        # isinstance is the idiomatic (and equivalent) form of
        # issubclass(type(obj), FileRecord).
        if not isinstance(obj, FileRecord):
            err = "FileRecordJSONEncoder is only for FileRecord and lists of FileRecords, not %s" % obj.__class__.__name__
            log.warn(err)
            raise FileRecordJSONEncoderException(err)
        return {'filename': obj.filename, 'size': obj.size,
                'algorithm': obj.algorithm, 'digest': obj.digest}

    def default(self, f):
        # json calls default() for objects it cannot serialize natively;
        # handle both a single FileRecord and a list of them.
        if isinstance(f, list):
            return [self.encode_file_record(i) for i in f]
        return self.encode_file_record(f)
|
135 |
|
136 |
|
class FileRecordJSONDecoder(json.JSONDecoder):
    """I help the json module materialize a FileRecord from
    a JSON file. I understand FileRecords and lists of
    FileRecords. I ignore things that I don't expect for now"""
    # TODO: make this more explicit in what it's looking for
    # and error out on unexpected things
    def process_file_records(self, obj):
        """Recursively turn decoded JSON into FileRecords where possible.

        Lists keep only the entries that materialized into FileRecords;
        anything unrecognized is returned unchanged."""
        if isinstance(obj, list):
            return [record
                    for record in (self.process_file_records(i) for i in obj)
                    if isinstance(record, FileRecord)]
        # A dict with exactly these four keys is taken to be a FileRecord.
        # BUG FIX: dict.has_key() is deprecated (and removed in Python 3);
        # the 'in' operator is equivalent on both versions.
        required_keys = ('filename', 'size', 'algorithm', 'digest')
        if isinstance(obj, dict) and len(obj) == 4 and \
           all(k in obj for k in required_keys):
            rv = FileRecord(obj['filename'], obj['size'], obj['digest'], obj['algorithm'])
            log.debug("materialized %s" % rv)
            return rv
        return obj

    def decode(self, s):
        """Decode the JSON string 's', materializing FileRecords."""
        decoded = json.JSONDecoder.decode(self, s)
        return self.process_file_records(decoded)
|
166 |
|
167 |
|
class Manifest(object):
    """An ordered collection of FileRecords, kept sorted by file size, with
    JSON (de)serialization support."""

    # Serialization formats this class knows how to read and write.
    valid_formats = ('json',)

    def __init__(self, file_records=None):
        # BUG FIX: the default used to be a mutable [] that was shared by
        # every Manifest constructed without arguments, so records leaked
        # between instances.  Use None as the sentinel instead.
        self.file_records = [] if file_records is None else file_records

    def __eq__(self, other):
        """Two manifests are equal when their records match pairwise."""
        if self is other:
            return True
        if len(self.file_records) != len(other.file_records):
            log.debug('Manifests differ in number of files')
            return False
        #TODO: Lists in a different order should be equal
        for mine, theirs in zip(self.file_records, other.file_records):
            if mine != theirs:
                log.debug('FileRecords differ, %s vs %s' % (mine, theirs))
                return False
        return True

    def __deepcopy__(self, memo):
        # This is required for a deep copy
        return Manifest(self.file_records[:])

    def __copy__(self):
        return Manifest(self.file_records)

    def copy(self):
        """Return a new Manifest over a shallow copy of the record list."""
        return Manifest(self.file_records[:])

    def present(self):
        """True if every file in the manifest exists locally."""
        return all(i.present() for i in self.file_records)

    def validate_sizes(self):
        """True if every file's size matches its record."""
        return all(i.validate_size() for i in self.file_records)

    def validate_digests(self):
        """True if every file's digest matches its record."""
        return all(i.validate_digest() for i in self.file_records)

    def validate(self):
        """True if every file is fully valid (size and digest)."""
        return all(i.validate() for i in self.file_records)

    def sort(self):
        #TODO: WRITE TESTS
        self.file_records.sort(key=lambda x: x.size)

    def load(self, data_file, fmt='json'):
        """Extend this manifest with records read from an open file.

        Raises InvalidManifest if the data cannot be parsed."""
        assert fmt in self.valid_formats
        if fmt == 'json':
            try:
                self.file_records.extend(json.load(data_file, cls=FileRecordJSONDecoder))
                self.sort()
            except ValueError:
                raise InvalidManifest("trying to read invalid manifest file")

    def loads(self, data_string, fmt='json'):
        """Extend this manifest with records parsed from a string.

        Raises InvalidManifest if the data cannot be parsed."""
        assert fmt in self.valid_formats
        if fmt == 'json':
            try:
                self.file_records.extend(json.loads(data_string, cls=FileRecordJSONDecoder))
                self.sort()
            except ValueError:
                raise InvalidManifest("trying to read invalid manifest file")

    def dump(self, output_file, fmt='json'):
        """Serialize this manifest to an open file, followed by a newline."""
        assert fmt in self.valid_formats
        self.sort()
        if fmt == 'json':
            rv = json.dump(self.file_records, output_file, indent=0, cls=FileRecordJSONEncoder)
            # Write the trailing newline directly instead of relying on the
            # Python 2-only "print >> file" statement (same bytes emitted).
            output_file.write('\n')
            return rv

    def dumps(self, fmt='json'):
        """Return this manifest serialized as a string."""
        assert fmt in self.valid_formats
        self.sort()
        if fmt == 'json':
            return json.dumps(self.file_records, cls=FileRecordJSONEncoder)
|
246 |
|
247 |
|
def digest_file(f, a):
    """I take a file like object 'f' and return a hex-string containing
    of the result of the algorithm 'a' applied to 'f'."""
    hasher = hashlib.new(a)
    chunk_size = 1024 * 10
    # Stream the file through the hash in fixed-size chunks so that large
    # files never need to be held in memory all at once.
    while True:
        chunk = f.read(chunk_size)
        if not chunk:
            break
        hasher.update(chunk)
    if hasattr(f, 'name'):
        log.debug('hashed %s with %s to be %s', f.name, a, hasher.hexdigest())
    else:
        log.debug('hashed a file with %s to be %s', a, hasher.hexdigest())
    return hasher.hexdigest()
|
262 |
|
263 # TODO: write tests for this function |
|
def open_manifest(manifest_file):
    """I know how to take a filename and load it into a Manifest object"""
    # Guard clause: an absent file can never be a valid manifest.
    if not os.path.exists(manifest_file):
        log.debug("tried to load absent file '%s' as manifest" % manifest_file)
        raise InvalidManifest("manifest file '%s' does not exist" % manifest_file)
    manifest = Manifest()
    with open(manifest_file) as f:
        manifest.load(f)
    log.debug("loaded manifest from file '%s'" % manifest_file)
    return manifest
|
275 |
|
276 # TODO: write tests for this function |
|
def list_manifest(manifest_file):
    """Print one status line per file tracked by 'manifest_file'.

    Each line shows presence ('P' or '-'), validity ('V' or '-') and the
    filename.  Returns False if the manifest cannot be loaded, True
    otherwise."""
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    for f in manifest.file_records:
        # A parenthesized single-argument print behaves identically as a
        # Python 2 statement and a Python 3 function call.
        print("%s\t%s\t%s" % ("P" if f.present() else "-",
                              "V" if f.present() and f.validate() else "-",
                              f.filename))
    return True
|
289 |
|
def validate_manifest(manifest_file):
    """I validate that all files in a manifest are present and valid but
    don't fetch or delete them if they aren't.

    Returns True only when every file is present and valid; False when the
    manifest can't be loaded or any file is absent or invalid."""
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    # The original built 'invalid_files' and 'absent_files' lists only to
    # test their combined emptiness; a single all() expresses that directly.
    return all(f.present() and f.validate() for f in manifest.file_records)
|
310 |
|
311 # TODO: write tests for this function |
|
def add_files(manifest_file, algorithm, filenames):
    """Hash each file in 'filenames' with 'algorithm' and record it in
    'manifest_file'.

    Returns True if all files were successfully added, False if not,
    and doesn't catch library Exceptions.  If any files are already
    tracked in the manifest, the return will be False because they
    weren't added.
    """
    all_files_added = True
    # Start from the existing manifest if there is one, so already-tracked
    # records can be compared against the new candidates.
    if os.path.exists(manifest_file):
        old_manifest = open_manifest(manifest_file)
    else:
        old_manifest = Manifest()
        log.debug("creating a new manifest file")
    new_manifest = Manifest() # use a different manifest for the output
    for filename in filenames:
        log.debug("adding %s" % filename)
        # NOTE(review): 'path' and 'name' are computed but never used below
        # — confirm before removing.
        path, name = os.path.split(filename)
        new_fr = create_file_record(filename, algorithm)
        log.debug("appending a new file record to manifest file")
        add = True
        # Compare the candidate against every already-tracked record; any
        # match (identical record, valid or not, or just a name collision)
        # suppresses the add and makes the overall return False.
        for fr in old_manifest.file_records:
            log.debug("manifest file has '%s'" % "', ".join([x.filename for x in old_manifest.file_records]))
            if new_fr == fr and new_fr.validate():
                # TODO: Decide if this case should really cause a False return
                log.info("file already in old_manifest file and matches")
                add = False
            elif new_fr == fr and not new_fr.validate():
                log.error("file already in old_manifest file but is invalid")
                add = False
            if filename == fr.filename:
                log.error("manifest already contains file named %s" % filename)
                add = False
        if add:
            new_manifest.file_records.append(new_fr)
            log.debug("added '%s' to manifest" % filename)
        else:
            all_files_added = False
    # Note that only the newly-added records are written out; records from
    # the old manifest that were not re-specified are dropped.
    with open(manifest_file, 'wb') as output:
        new_manifest.dump(output, fmt='json')
    return all_files_added
|
351 |
|
352 |
|
353 # TODO: write tests for this function |
|
def fetch_file(base_url, file_record, overwrite=False, grabchunk=1024*4):
    """Fetch the file described by 'file_record' from 'base_url'.

    A file which is requested to be fetched that exists locally will be
    hashed.  If the hash matches the requested file's hash, nothing will
    be done and the function will return.  If the function is told to
    overwrite and there is a digest mismatch, the existing file will be
    overwritten.

    'grabchunk' is the network read size in bytes.  Returns True on
    success (including "local copy already valid"), False otherwise.
    """
    if file_record.present():
        if file_record.validate():
            log.info("existing '%s' is valid, not fetching" % file_record.filename)
            return True
        if overwrite:
            log.info("overwriting '%s' as requested" % file_record.filename)
        else:
            # All of the following is for a useful error message
            with open(file_record.filename, 'rb') as f:
                d = digest_file(f, file_record.algorithm)
            log.error("digest mismatch between manifest(%s...) and local file(%s...)" % \
                    (file_record.digest[:8], d[:8]))
            log.debug("full digests: manifest (%s) local file (%s)" % (file_record.digest, d))
            # Let's bail!
            return False

    # Generate the URL for the file on the server side
    url = "%s/%s/%s" % (base_url, file_record.algorithm, file_record.digest)

    log.debug("fetching from '%s'" % url)

    # TODO: This should be abstracted to make generic retrieval protocol handling easy
    # Well, the file doesn't exist locally. Lets fetch it.
    try:
        f = urllib2.urlopen(url)
        log.debug("opened %s for reading" % url)
        with open(file_record.filename, 'wb') as out:
            k = True
            size = 0
            while k:
                # TODO: print statistics as file transfers happen both for info and to stop
                # buildbot timeouts
                indata = f.read(grabchunk)
                out.write(indata)
                size += len(indata)
                # An empty read means end-of-stream; stop the loop.
                if indata == '':
                    k = False
            # Guard against truncated (or over-long) transfers by comparing
            # the byte count against the manifest's recorded size.
            if size != file_record.size:
                log.error("transfer from %s to %s failed due to a difference of %d bytes" % (url,
                    file_record.filename, file_record.size - size))
                return False
        log.info("fetched %s" % file_record.filename)
    except (urllib2.URLError, urllib2.HTTPError) as e:
        log.error("failed to fetch '%s': %s" % (file_record.filename, e),
                exc_info=True)
        return False
    except IOError:
        log.error("failed to write to '%s'" % file_record.filename,
                exc_info=True)
        return False
    return True
|
410 |
|
411 |
|
412 # TODO: write tests for this function |
|
def fetch_files(manifest_file, base_url, overwrite, filenames=None):
    """Fetch the files listed in 'manifest_file' from 'base_url'.

    If 'filenames' is a non-empty list, only those files are fetched;
    otherwise every file in the manifest is.  Each fetched file is then
    validated against its manifest record.  Returns True only when every
    requested file was fetched and validated successfully.
    """
    # BUG FIX: avoid a mutable default argument; None/empty means "all".
    if filenames is None:
        filenames = []
    # Lets load the manifest file
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    # We want to track files that fail to be fetched as well as
    # files that are fetched
    failed_files = []

    # Lets go through the manifest and fetch the files that we want
    fetched_files = []
    for f in manifest.file_records:
        if f.filename in filenames or len(filenames) == 0:
            log.debug("fetching %s" % f.filename)
            if fetch_file(base_url, f, overwrite):
                fetched_files.append(f)
            else:
                failed_files.append(f.filename)
        else:
            log.debug("skipping %s" % f.filename)

    # Even if we get the file, lets ensure that it matches what the
    # manifest specified
    for localfile in fetched_files:
        if not localfile.validate():
            log.error("'%s'" % localfile.describe())
            # BUG FIX: a fetched-but-invalid file used to be logged but not
            # recorded, so the function returned True despite the
            # "fetch or validate" failure intent stated below.
            failed_files.append(localfile.filename)

    # If we failed to fetch or validate a file, we need to fail
    if len(failed_files) > 0:
        log.error("The following files failed: '%s'" % "', ".join(failed_files))
        return False
    return True
|
447 |
|
448 |
|
449 # TODO: write tests for this function |
|
def process_command(options, args):
    """ I know how to take a list of program arguments and
    start doing the right thing with them.

    'args[0]' is the command name, the rest are its arguments.  Returns
    the boolean result of the dispatched operation, or False for an
    unknown command."""
    cmd = args[0]
    cmd_args = args[1:]
    log.debug("processing '%s' command with args '%s'" % (cmd, '", "'.join(cmd_args)))
    log.debug("using options: %s" % options)
    if cmd == 'list':
        return list_manifest(options['manifest'])
    elif cmd == 'validate':
        return validate_manifest(options['manifest'])
    elif cmd == 'add':
        return add_files(options['manifest'], options['algorithm'], cmd_args)
    elif cmd == 'fetch':
        # dict.has_key() is deprecated (removed in Python 3); .get()
        # covers both the missing-key and the None-valued cases.
        if options.get('base_url') is None:
            log.critical('fetch command requires url option')
            return False
        return fetch_files(options['manifest'], options['base_url'], options['overwrite'], cmd_args)
    else:
        log.critical('command "%s" is not implemented' % cmd)
        return False
|
471 |
|
472 # fetching api: |
|
473 # http://hostname/algorithm/hash |
|
474 # example: http://people.mozilla.org/sha1/1234567890abcedf |
|
475 # This will make it possible to have the server allow clients to |
|
476 # use different algorithms than what was uploaded to the server |
|
477 |
|
478 # TODO: Implement the following features: |
|
479 # -optimization: do small files first, justification is that they are faster |
|
480 # and cause a faster failure if they are invalid |
|
481 # -store permissions |
|
482 # -local renames i.e. call the file one thing on the server and |
|
483 # something different locally |
|
484 # -deal with the cases: |
|
485 # -local data matches file requested with different filename |
|
486 # -two different files with same name, different hash |
|
487 # -?only ever locally to digest as filename, symlink to real name |
|
488 # -?maybe deal with files as a dir of the filename with all files in that dir as the versions of that file |
|
489 # - e.g. ./python-2.6.7.dmg/0123456789abcdef and ./python-2.6.7.dmg/abcdef0123456789 |
|
490 |
|
def main():
    """Parse command line options and config files, configure logging, and
    dispatch to process_command(); exits 0 on success, 1 on failure."""
    # Set up logging, for now just to the console
    ch = logging.StreamHandler()
    cf = logging.Formatter("%(levelname)s - %(message)s")
    ch.setFormatter(cf)

    # Set up option parsing
    parser = optparse.OptionParser()
    # I wish there was a way to say "only allow args to be
    # sequential and at the end of the argv.
    # OH! i could step through sys.argv and check for things starting without -/-- before things starting with them
    parser.add_option('-q', '--quiet', default=False,
            dest='quiet', action='store_true')
    parser.add_option('-v', '--verbose', default=False,
            dest='verbose', action='store_true')
    parser.add_option('-m', '--manifest', default='manifest.tt',
            dest='manifest', action='store',
            help='specify the manifest file to be operated on')
    parser.add_option('-d', '--algorithm', default='sha512',
            dest='algorithm', action='store',
            help='openssl hashing algorithm to use')
    parser.add_option('-o', '--overwrite', default=False,
            dest='overwrite', action='store_true',
            help='if fetching, remote copy will overwrite a local copy that is different. ')
    parser.add_option('--url', dest='base_url', action='store',
            help='base url for fetching files')
    parser.add_option('--ignore-config-files', action='store_true', default=False,
            dest='ignore_cfg_files')
    (options_obj, args) = parser.parse_args()
    # Dictionaries are easier to work with
    options = vars(options_obj)

    # Use some of the option parser to figure out application
    # log level
    if options.get('verbose'):
        ch.setLevel(logging.DEBUG)
    elif options.get('quiet'):
        ch.setLevel(logging.ERROR)
    else:
        ch.setLevel(logging.INFO)
    log.addHandler(ch)

    # Unless told otherwise, layer in settings from the system-wide, user
    # and current-directory config files.
    cfg_file = ConfigParser.SafeConfigParser()
    if not options.get("ignore_cfg_files"):
        read_files = cfg_file.read(['/etc/tooltool', os.path.expanduser('~/.tooltool'),
                os.path.join(os.getcwd(), '.tooltool')])
        log.debug("read in the config files '%s'" % '", '.join(read_files))
    else:
        log.debug("skipping config files")

    # Command line options win; fall back to the [general] section of the
    # config files for anything not specified on the command line.
    for option in ('base_url', 'algorithm'):
        if not options.get(option):
            try:
                options[option] = cfg_file.get('general', option)
                log.debug("read '%s' as '%s' from cfg_file" % (option, options[option]))
            except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as e:
                log.debug("%s in config file" % e, exc_info=True)

    # dict.has_key() is deprecated (and removed in Python 3); 'in' is the
    # equivalent, portable form.
    if 'manifest' not in options:
        parser.error("no manifest file specified")

    if len(args) < 1:
        parser.error('You must specify a command')
    exit(0 if process_command(options, args) else 1)
|
556 |
|
if __name__ == "__main__":
    main()
else:
    # When imported as a library, attach a no-op handler so that importers
    # who haven't configured logging don't get "no handlers" warnings.
    log.addHandler(logging.NullHandler())
    #log.addHandler(logging.StreamHandler())