diff -r 000000000000 -r 6474c204b198 python/mozbuild/mozpack/mozjar.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python/mozbuild/mozpack/mozjar.py	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,804 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from io import BytesIO
+import struct
+import zlib
+import os
+from zipfile import (
+    ZIP_STORED,
+    ZIP_DEFLATED,
+)
+from collections import OrderedDict
+from urlparse import urlparse, ParseResult
+import mozpack.path
+
+JAR_STORED = ZIP_STORED
+JAR_DEFLATED = ZIP_DEFLATED
+MAX_WBITS = 15
+
+
+class JarReaderError(Exception):
+    '''Error type for Jar reader errors.'''
+
+
+class JarWriterError(Exception):
+    '''Error type for Jar writer errors.'''
+
+
+class JarStruct(object):
+    '''
+    Helper used to define ZIP archive raw data structures. Data structures
+    handled by this helper all start with a magic number, defined in
+    subclasses' MAGIC field as a 32-bit unsigned integer, followed by data
+    structured as described in subclasses' STRUCT field.
+
+    The STRUCT field contains a list of (name, type) pairs where name is a
+    field name, and the type can be one of 'uint32', 'uint16' or one of the
+    field names. In the latter case, the field is considered to be a string
+    buffer with a length given in that field.
+    For example,
+        STRUCT = [
+            ('version', 'uint32'),
+            ('filename_size', 'uint16'),
+            ('filename', 'filename_size')
+        ]
+    describes a structure with a 'version' 32-bit unsigned integer field,
+    followed by a 'filename_size' 16-bit unsigned integer field, followed by
+    a filename_size-long string buffer 'filename'.
+
+    Fields that are used as other fields' sizes are not stored in objects.
+    In the above example, an instance of such a subclass would only have two
+    attributes:
+        obj['version']
+        obj['filename']
+    filename_size would be obtained with len(obj['filename']).
+
+    JarStruct subclass instances can be either initialized from existing
+    data (deserialized), or with empty fields.
+    '''
+
+    TYPE_MAPPING = {'uint32': ('I', 4), 'uint16': ('H', 2)}
+
+    def __init__(self, data=None):
+        '''
+        Create an instance from the given data. Data may be omitted to
+        create an instance with empty fields.
+        '''
+        assert self.MAGIC and isinstance(self.STRUCT, OrderedDict)
+        self.size_fields = set(t for t in self.STRUCT.itervalues()
+                               if not t in JarStruct.TYPE_MAPPING)
+        self._values = {}
+        if data:
+            self._init_data(data)
+        else:
+            self._init_empty()
+
+    def _init_data(self, data):
+        '''
+        Initialize an instance from data, following the data structure
+        described in self.STRUCT. The self.MAGIC signature is expected at
+        data[:4].
+        '''
+        assert data is not None
+        self.signature, size = JarStruct.get_data('uint32', data)
+        if self.signature != self.MAGIC:
+            raise JarReaderError('Bad magic')
+        offset = size
+        # For all fields used as other fields' sizes, keep track of their
+        # value separately.
+        sizes = dict((t, 0) for t in self.size_fields)
+        for name, t in self.STRUCT.iteritems():
+            if t in JarStruct.TYPE_MAPPING:
+                value, size = JarStruct.get_data(t, data[offset:])
+            else:
+                size = sizes[t]
+                value = data[offset:offset + size]
+                if isinstance(value, memoryview):
+                    value = value.tobytes()
+            if not name in sizes:
+                self._values[name] = value
+            else:
+                sizes[name] = value
+            offset += size
+
+    def _init_empty(self):
+        '''
+        Initialize an instance with empty fields.
+        '''
+        self.signature = self.MAGIC
+        for name, t in self.STRUCT.iteritems():
+            if name in self.size_fields:
+                continue
+            self._values[name] = 0 if t in JarStruct.TYPE_MAPPING else ''
+
+    @staticmethod
+    def get_data(type, data):
+        '''
+        Deserialize a single field of given type (must be one of
+        JarStruct.TYPE_MAPPING) from the start of the given data.
+        '''
+        assert type in JarStruct.TYPE_MAPPING
+        assert data is not None
+        format, size = JarStruct.TYPE_MAPPING[type]
+        data = data[:size]
+        if isinstance(data, memoryview):
+            data = data.tobytes()
+        return struct.unpack('<' + format, data)[0], size
+
+    def serialize(self):
+        '''
+        Serialize the data structure according to the data structure
+        definition from self.STRUCT.
+        '''
+        serialized = struct.pack('<I', self.signature)
+        sizes = dict((t, name) for name, t in self.STRUCT.iteritems()
+                     if not t in JarStruct.TYPE_MAPPING)
+        for name, t in self.STRUCT.iteritems():
+            if t in JarStruct.TYPE_MAPPING:
+                format, size = JarStruct.TYPE_MAPPING[t]
+                if name in sizes:
+                    value = len(self[sizes[name]])
+                else:
+                    value = self[name]
+                serialized += struct.pack('<' + format, value)
+            else:
+                serialized += self[name]
+        return serialized
+
+    @property
+    def size(self):
+        '''
+        Return the size of the data structure, given the current values of
+        all variable length fields.
+        '''
+        size = JarStruct.TYPE_MAPPING['uint32'][1]
+        for name, type in self.STRUCT.iteritems():
+            if type in JarStruct.TYPE_MAPPING:
+                size += JarStruct.TYPE_MAPPING[type][1]
+            else:
+                size += len(self[name])
+        return size
+
+    def __getitem__(self, key):
+        return self._values[key]
+
+    def __setitem__(self, key, value):
+        if not key in self.STRUCT:
+            raise KeyError(key)
+        if key in self.size_fields:
+            raise AttributeError("can't set attribute")
+        self._values[key] = value
+
+    def __contains__(self, key):
+        return key in self._values
+
+    def __iter__(self):
+        return self._values.iteritems()
+
+    def __repr__(self):
+        return "<%s %s>" % (self.__class__.__name__,
+                            ' '.join('%s=%s' % (n, v) for n, v in self))
+
+
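To make the STRUCT machinery concrete, here is a minimal sketch (not part of the patch; the subclass, its MAGIC and its field names are made up) showing how a definition round-trips through serialization and parsing. Size fields are derived, so only 'name' is set directly:

    class Example(JarStruct):
        MAGIC = 0x01020304
        STRUCT = OrderedDict([
            ('name_size', 'uint16'),
            ('name', 'name_size'),
        ])

    ex = Example()
    ex['name'] = 'hello'
    data = ex.serialize()  # '\x04\x03\x02\x01' + '\x05\x00' + 'hello'
    assert Example(data)['name'] == 'hello'
    assert ex.size == 4 + 2 + 5  # uint32 magic + uint16 size + buffer
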
+class JarCdirEnd(JarStruct):
+    '''
+    End of central directory record.
+    '''
+    MAGIC = 0x06054b50
+    STRUCT = OrderedDict([
+        ('disk_num', 'uint16'),
+        ('cdir_disk', 'uint16'),
+        ('disk_entries', 'uint16'),
+        ('cdir_entries', 'uint16'),
+        ('cdir_size', 'uint32'),
+        ('cdir_offset', 'uint32'),
+        ('comment_size', 'uint16'),
+        ('comment', 'comment_size'),
+    ])
+
+CDIR_END_SIZE = JarCdirEnd().size
+
+
+class JarCdirEntry(JarStruct):
+    '''
+    Central directory file header.
+    '''
+    MAGIC = 0x02014b50
+    STRUCT = OrderedDict([
+        ('creator_version', 'uint16'),
+        ('min_version', 'uint16'),
+        ('general_flag', 'uint16'),
+        ('compression', 'uint16'),
+        ('lastmod_time', 'uint16'),
+        ('lastmod_date', 'uint16'),
+        ('crc32', 'uint32'),
+        ('compressed_size', 'uint32'),
+        ('uncompressed_size', 'uint32'),
+        ('filename_size', 'uint16'),
+        ('extrafield_size', 'uint16'),
+        ('filecomment_size', 'uint16'),
+        ('disknum', 'uint16'),
+        ('internal_attr', 'uint16'),
+        ('external_attr', 'uint32'),
+        ('offset', 'uint32'),
+        ('filename', 'filename_size'),
+        ('extrafield', 'extrafield_size'),
+        ('filecomment', 'filecomment_size'),
+    ])
+
+
+class JarLocalFileHeader(JarStruct):
+    '''
+    Local file header.
+    '''
+    MAGIC = 0x04034b50
+    STRUCT = OrderedDict([
+        ('min_version', 'uint16'),
+        ('general_flag', 'uint16'),
+        ('compression', 'uint16'),
+        ('lastmod_time', 'uint16'),
+        ('lastmod_date', 'uint16'),
+        ('crc32', 'uint32'),
+        ('compressed_size', 'uint32'),
+        ('uncompressed_size', 'uint32'),
+        ('filename_size', 'uint16'),
+        ('extra_field_size', 'uint16'),
+        ('filename', 'filename_size'),
+        ('extra_field', 'extra_field_size'),
+    ])
+
+
+class JarFileReader(object):
+    '''
+    File-like class for use by JarReader to give access to individual files
+    within a Jar archive.
+    '''
+    def __init__(self, header, data):
+        '''
+        Initialize a JarFileReader. header is the local file header
+        corresponding to the file in the jar archive, data a buffer
+        containing the file data.
+        '''
+        assert header['compression'] in [JAR_DEFLATED, JAR_STORED]
+        self._data = data
+        # Copy some local file header fields.
+        for name in ['filename', 'compressed_size',
+                     'uncompressed_size', 'crc32']:
+            setattr(self, name, header[name])
+        self.compressed = header['compression'] == JAR_DEFLATED
+
+    def read(self, length=-1):
+        '''
+        Read some amount of uncompressed data.
+        '''
+        return self.uncompressed_data.read(length)
+
+    def readlines(self):
+        '''
+        Return a list containing all the lines of data in the uncompressed
+        data.
+        '''
+        return self.read().splitlines(True)
+
+    def __iter__(self):
+        '''
+        Iterator, to support the "for line in fileobj" constructs.
+        '''
+        return iter(self.readlines())
+
+    def seek(self, pos, whence=os.SEEK_SET):
+        '''
+        Change the current position in the uncompressed data. Subsequent
+        reads will start from there.
+        '''
+        return self.uncompressed_data.seek(pos, whence)
+
+    def close(self):
+        '''
+        Free the uncompressed data buffer.
+        '''
+        self.uncompressed_data.close()
+
+    @property
+    def compressed_data(self):
+        '''
+        Return the raw compressed data.
+        '''
+        return self._data[:self.compressed_size]
+
+    @property
+    def uncompressed_data(self):
+        '''
+        Return the uncompressed data.
+        '''
+        if hasattr(self, '_uncompressed_data'):
+            return self._uncompressed_data
+        data = self.compressed_data
+        if self.compressed:
+            data = zlib.decompress(data.tobytes(), -MAX_WBITS)
+        else:
+            data = data.tobytes()
+        if len(data) != self.uncompressed_size:
+            raise JarReaderError('Corrupted file? %s' % self.filename)
+        self._uncompressed_data = BytesIO(data)
+        return self._uncompressed_data
+
+
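uncompressed_data passes -MAX_WBITS to zlib because ZIP members are stored as raw DEFLATE streams, without the two-byte zlib header and the trailing Adler-32 checksum. A quick sketch of the equivalence (independent of the patch, using only the zlib calls above):

    # Stripping the zlib header (2 bytes) and Adler-32 trailer (4 bytes)
    # from zlib output yields a raw DEFLATE stream, which negative window
    # bits decode.
    raw = zlib.compress('hello world')[2:-4]
    assert zlib.decompress(raw, -MAX_WBITS) == 'hello world'
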
+class JarReader(object):
+    '''
+    Class with methods to read Jar files. Can open standard jar files as
+    well as Mozilla jar files (see further details in the JarWriter
+    documentation).
+    '''
+    def __init__(self, file=None, fileobj=None):
+        '''
+        Open the given file as a Jar archive. Use the given file-like object
+        if one is given instead of opening the given file name.
+        '''
+        if fileobj:
+            data = fileobj.read()
+        else:
+            data = open(file, 'rb').read()
+        self._data = memoryview(data)
+        # The End of Central Directory Record has a variable size because of
+        # comments it may contain, so scan for it from the end of the file.
+        offset = -CDIR_END_SIZE
+        while True:
+            signature = JarStruct.get_data('uint32', self._data[offset:])[0]
+            if signature == JarCdirEnd.MAGIC:
+                break
+            if offset == -len(self._data):
+                raise JarReaderError('Not a jar?')
+            offset -= 1
+        self._cdir_end = JarCdirEnd(self._data[offset:])
+
+    def close(self):
+        '''
+        Free some resources associated with the Jar.
+        '''
+        del self._data
+
+    @property
+    def entries(self):
+        '''
+        Return an ordered dict of central directory entries, indexed by
+        filename, in the order they appear in the Jar archive central
+        directory. Directory entries are skipped.
+        '''
+        if hasattr(self, '_entries'):
+            return self._entries
+        preload = 0
+        if self.is_optimized:
+            preload = JarStruct.get_data('uint32', self._data)[0]
+        entries = OrderedDict()
+        offset = self._cdir_end['cdir_offset']
+        for e in xrange(self._cdir_end['cdir_entries']):
+            entry = JarCdirEntry(self._data[offset:])
+            offset += entry.size
+            # Creator host system. 0 is MSDOS, 3 is Unix.
+            host = entry['creator_version'] >> 8
+            # External attributes values depend on host above. On Unix the
+            # higher bits are the stat.st_mode value. On MSDOS, the lower
+            # bits are the FAT attributes.
+            xattr = entry['external_attr']
+            # Skip directories.
+            if (host == 0 and xattr & 0x10) or (host == 3 and
+                                                xattr & (040000 << 16)):
+                continue
+            entries[entry['filename']] = entry
+            if entry['offset'] < preload:
+                self._last_preloaded = entry['filename']
+        self._entries = entries
+        return entries
+
+    @property
+    def is_optimized(self):
+        '''
+        Return whether the jar archive is optimized.
+        '''
+        # In optimized jars, the central directory is at the beginning of
+        # the file, after a single 32-bit value, which is the length of data
+        # preloaded.
+        return self._cdir_end['cdir_offset'] == \
+            JarStruct.TYPE_MAPPING['uint32'][1]
+
+    @property
+    def last_preloaded(self):
+        '''
+        Return the name of the last file that is set to be preloaded.
+        See the JarWriter documentation for more details on preloading.
+        '''
+        if hasattr(self, '_last_preloaded'):
+            return self._last_preloaded
+        self._last_preloaded = None
+        self.entries
+        return self._last_preloaded
+
+    def _getreader(self, entry):
+        '''
+        Helper to create a JarFileReader corresponding to the given central
+        directory entry.
+        '''
+        header = JarLocalFileHeader(self._data[entry['offset']:])
+        for key, value in entry:
+            if key in header and header[key] != value:
+                raise JarReaderError('Central directory and file header ' +
+                                     'mismatch. Corrupted archive?')
+        return JarFileReader(header,
+                             self._data[entry['offset'] + header.size:])
+
+    def __iter__(self):
+        '''
+        Iterate over all files in the Jar archive, in the form of
+        JarFileReaders.
+            for file in jarReader:
+                ...
+        '''
+        for entry in self.entries.itervalues():
+            yield self._getreader(entry)
+
+    def __getitem__(self, name):
+        '''
+        Get a JarFileReader for the given file name.
+        '''
+        return self._getreader(self.entries[name])
+
+    def __contains__(self, name):
+        '''
+        Return whether the given file name appears in the Jar archive.
+        '''
+        return name in self.entries
+
+
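Putting the reader pieces together, a typical session looks like the following sketch ('foo.jar' and the member name are placeholders, not from the patch):

    jar = JarReader('foo.jar')
    for f in jar:                      # yields JarFileReaders
        print f.filename, f.uncompressed_size
    if 'chrome.manifest' in jar:       # __contains__ checks the directory
        data = jar['chrome.manifest'].read()
    jar.close()
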
+class JarWriter(object):
+    '''
+    Class with methods to write Jar files. Can write more-or-less standard
+    jar archives as well as jar archives optimized for Gecko. See the
+    documentation for the finish() member function for a description of both
+    layouts.
+    '''
+    def __init__(self, file=None, fileobj=None, compress=True, optimize=True):
+        '''
+        Initialize a Jar archive in the given file. Use the given file-like
+        object if one is given instead of opening the given file name.
+        The compress option determines the default behavior for storing data
+        in the jar archive. The optimize option determines whether the jar
+        archive should be optimized for Gecko or not.
+        '''
+        if fileobj:
+            self._data = fileobj
+        else:
+            self._data = open(file, 'wb')
+        self._compress = compress
+        self._contents = OrderedDict()
+        self._last_preloaded = None
+        self._optimize = optimize
+
+    def __enter__(self):
+        '''
+        Context manager __enter__ method for JarWriter.
+        '''
+        return self
+
+    def __exit__(self, type, value, tb):
+        '''
+        Context manager __exit__ method for JarWriter.
+        '''
+        self.finish()
+
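The context manager guarantees finish() runs even if the block raises. A usage sketch (the file name and entry are placeholders, and the add()/preload() methods are assumed to be defined further down in this file, beyond this excerpt):

    with JarWriter('out.jar', compress=True, optimize=True) as jar:
        jar.add('content/foo.js', 'alert("hello");')  # assumed API
        jar.preload(['content/foo.js'])               # assumed API
    # __exit__ called jar.finish(), producing the layout described below.
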
+    def finish(self):
+        '''
+        Flush and close the Jar archive.
+
+        Standard jar archives are laid out like the following:
+            - Local file header 1
+            - File data 1
+            - Local file header 2
+            - File data 2
+            - (...)
+            - Central directory entry pointing at Local file header 1
+            - Central directory entry pointing at Local file header 2
+            - (...)
+            - End of central directory, pointing at first central directory
+              entry.
+
+        Jar archives optimized for Gecko are laid out like the following:
+            - 32-bit unsigned integer giving the amount of data to preload.
+            - Central directory entry pointing at Local file header 1
+            - Central directory entry pointing at Local file header 2
+            - (...)
+            - End of central directory, pointing at first central directory
+              entry.
+            - Local file header 1
+            - File data 1
+            - Local file header 2
+            - File data 2
+            - (...)
+            - End of central directory, pointing at first central directory
+              entry.
+        The duplication of the End of central directory is to accommodate
+        some Zip reading tools that want an end of central directory
+        structure to follow the central directory entries.
+        '''
+        offset = 0
+        headers = {}
+        preload_size = 0
+        # Prepare central directory entries.
+        for entry, content in self._contents.itervalues():
+            header = JarLocalFileHeader()
+            for name in entry.STRUCT:
+                if name in header:
+                    header[name] = entry[name]
+            entry['offset'] = offset
+            offset += len(content) + header.size
+            if entry['filename'] == self._last_preloaded:
+                preload_size = offset
+            headers[entry] = header
+        # Prepare end of central directory.
+        end = JarCdirEnd()
+        end['disk_entries'] = len(self._contents)
+        end['cdir_entries'] = end['disk_entries']
+        end['cdir_size'] = reduce(lambda x, y: x + y[0].size,
+                                  self._contents.values(), 0)
+        # On optimized archives, store the preloaded size and the central
+        # directory entries, followed by the first end of central directory.
+        if self._optimize:
+            end['cdir_offset'] = 4
+            offset = end['cdir_size'] + end['cdir_offset'] + end.size
+            if preload_size:
+                preload_size += offset
+            self._data.write(struct.pack('<I', preload_size))
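On the reading side, that leading 4-byte preload field is exactly what JarReader.is_optimized keys off of. A sketch of inspecting an archive written with optimize=True (the file name is a placeholder):

    jar = JarReader('omni.ja')
    if jar.is_optimized:
        # Entries whose data lies below the preload boundary are read
        # eagerly by Gecko; last_preloaded names the final such entry.
        print jar.last_preloaded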