diff -r 000000000000 -r 6474c204b198 js/src/vm/make_unicode.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/js/src/vm/make_unicode.py Wed Dec 31 06:09:35 2014 +0100 @@ -0,0 +1,377 @@ +# -*- coding: utf-8 -*- +# Based upon makeunicodedata.py +# (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py) +# written by Fredrik Lundh (fredrik@pythonware.com) +# +# Copyright (C) 2011 Tom Schuster +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import print_function +import csv +import sys + +# ECMAScript 5 $ 7.2 +whitespace = [ + # python doesn't support using control character names :( + 0x9, # CHARACTER TABULATION + 0xb, # LINE TABULATION + 0xc, # FORM FEED + ord(u'\N{SPACE}'), + ord(u'\N{NO-BREAK SPACE}'), + ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM +] + +# $ 7.3 +line_terminator = [ + 0xa, # LINE FEED + 0xd, # CARRIAGE RETURN + ord(u'\N{LINE SEPARATOR}'), + ord(u'\N{PARAGRAPH SEPARATOR}'), +] + +# These are also part of IdentifierPart $7.6 +ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}') +ZWJ = ord(u'\N{ZERO WIDTH JOINER}') + +FLAG_SPACE = 1 << 0 +FLAG_LETTER = 1 << 1 +FLAG_IDENTIFIER_PART = 1 << 2 + +MAX = 0xffff + +public_domain = """ +/* + * Any copyright is dedicated to the Public Domain. + * http://creativecommons.org/licenses/publicdomain/ + */ +""" + +def read_unicode_data(unicode_file): + """ + If you want to understand how this wonderful file format works checkout + Unicode Standard Annex #44 - Unicode Character Database + http://www.unicode.org/reports/tr44/ + """ + + reader = csv.reader(unicode_data, delimiter=';') + + while True: + row = reader.next() + name = row[1] + + # We need to expand the UAX #44 4.2.3 Code Point Range + if name.startswith('<') and name.endswith('First>'): + next_row = reader.next() + + for i in range(int(row[0], 16), int(next_row[0], 16) + 1): + row[0] = i + row[1] = name[1:-8] + + yield row + else: + row[0] = int(row[0], 16) + yield row + +def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space): + dummy = (0, 0, 0) + table = [dummy] + cache = {dummy: 0} + index = [0] * (MAX + 1) + test_table = {} + test_space_table = [] + + for row in read_unicode_data(unicode_data): + code = row[0] + name = row[1] + category = row[2] + alias = row[-5] + uppercase = row[-3] + lowercase = row[-2] + flags = 0 + + if code > MAX: + break + + # we combine whitespace and lineterminators because in pratice we don't need them separated + if category == 'Zs' or code in whitespace or code in line_terminator: + flags |= FLAG_SPACE + test_space_table.append(code) + if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter) + flags |= FLAG_LETTER + if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart) + flags |= FLAG_IDENTIFIER_PART + + if uppercase: + upper = int(uppercase, 16) + else: + upper = code + + if lowercase: + lower = int(lowercase, 16) + else: + lower = code + + test_table[code] = (upper, lower, name, alias) + + up_d = upper - code + low_d = lower - code + + assert up_d > -65535 and up_d < 65535 + assert low_d > -65535 and low_d < 65535 + + upper = up_d & 0xffff + lower = low_d & 0xffff + + item = (upper, lower, flags) + + i = cache.get(item) + if i is None: + assert item not in table + cache[item] = i = len(table) + table.append(item) + index[code] = i + + test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') + test_mapping.write(public_domain) + test_mapping.write('var mapping = [\n') + for code in range(0, MAX + 1): + entry = test_table.get(code) + + if entry: + upper, lower, name, alias = entry + test_mapping.write(' [' + hex(upper) + ', ' + hex(lower) + '], /* ' + + name + (' (' + alias + ')' if alias else '') + ' */\n') + else: + test_mapping.write(' [' + hex(code) + ', ' + hex(code) + '],\n') + test_mapping.write('];') + test_mapping.write(""" +assertEq(mapping.length, 0x10000); +for (var i = 0; i <= 0xffff; i++) { + var char = String.fromCharCode(i); + var info = mapping[i]; + + assertEq(char.toUpperCase().charCodeAt(0), info[0]); + assertEq(char.toLowerCase().charCodeAt(0), info[1]); +} + +if (typeof reportCompare === "function") + reportCompare(true, true); +""") + + test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') + test_space.write(public_domain) + test_space.write('var onlySpace = String.fromCharCode(' + + ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n') + test_space.write(""" +assertEq(onlySpace.trim(), ""); +assertEq((onlySpace + 'aaaa').trim(), 'aaaa'); +assertEq(('aaaa' + onlySpace).trim(), 'aaaa'); +assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa'); + +if (typeof reportCompare === "function") + reportCompare(true, true); +""") + + index1, index2, shift = splitbins(index) + + # Don't forget to update CharInfo in Unicode.cpp if you need to change this + assert shift == 5 + + # verify correctness + for char in index: + test = table[index[char]] + + idx = index1[char >> shift] + idx = index2[(idx << shift) + (char & ((1 << shift) - 1))] + + assert test == table[idx] + + + comment = """ +/* + * So how does indexing work? + * First let's have a look at a jschar, 16-bits: + * [................] + * Step 1: + * Extracting the upper 11 bits from the jschar. + * upper = char >> 5 ([***********.....]) + * Step 2: + * Using these bits to get an reduced index from index1. + * index = index1[upper] + * Step 3: + * Combining the index and the bottom 5 bits of the original jschar. + * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++]) + * + * The advantage here is that the biggest number in index1 doesn't need 10 bits, + * but 7 and we save some memory. + * + * Step 4: + * Get the character informations by looking up real_index in js_charinfo. + * + * Pseudocode of generation: + * + * let table be the mapping of jschar => js_charinfo_index + * let index1 be an empty array + * let index2 be an empty array + * let cache be a hash map + * + * while shift is less then maximal amount you can shift 0xffff before it's 0 + * let chunks be table split in chunks of size 2**shift + * + * for every chunk in chunks + * if chunk is in cache + * let index be cache[chunk] + * else + * let index be the max key of index2 + 1 + * for element in chunk + * push element to index2 + * put index as chunk in cache + * + * push index >> shift to index1 + * + * increase shift + * stop if you found the best shift + */ +""" + data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') + data_file.write(public_domain) + data_file.write('#include "vm/Unicode.h"\n\n') + data_file.write('using namespace js;\n') + data_file.write('using namespace js::unicode;\n') + data_file.write(comment) + data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n') + for d in table: + data_file.write(' {') + data_file.write(', '.join((str(e) for e in d))) + data_file.write('},\n') + data_file.write('};\n') + data_file.write('\n') + + def dump(data, name, file): + file.write('const uint8_t unicode::' + name + '[] = {\n') + + line = pad = ' ' * 4 + lines = [] + for entry in data: + assert entry < 256 + s = str(entry) + s = s.rjust(3) + + if len(line + s) + 5 > 99: + lines.append(line.rstrip()) + line = pad + s + ', ' + else: + line = line + s + ', ' + lines.append(line.rstrip()) + + file.write('\n'.join(lines)) + file.write('\n};\n') + + dump(index1, 'index1', data_file) + data_file.write('\n') + dump(index2, 'index2', data_file) + data_file.write('\n') + + data_file.write('\n') + +def getsize(data): + """ return smallest possible integer size for the given array """ + maxdata = max(data) + assert maxdata < 2**32 + + if maxdata < 256: + return 1 + elif maxdata < 65536: + return 2 + else: + return 4 + +def splitbins(t): + """t -> (t1, t2, shift). Split a table to save space. + + t is a sequence of ints. This function can be useful to save space if + many of the ints are the same. t1 and t2 are lists of ints, and shift + is an int, chosen to minimize the combined size of t1 and t2 (in C + code), and where for each i in range(len(t)), + t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] + where mask is a bitmask isolating the last "shift" bits. + """ + + def dump(t1, t2, shift, bytes): + print("%d+%d bins at shift %d; %d bytes" % ( + len(t1), len(t2), shift, bytes), file=sys.stderr) + print("Size of original table:", len(t)*getsize(t), \ + "bytes", file=sys.stderr) + n = len(t)-1 # last valid index + maxshift = 0 # the most we can shift n and still have something left + if n > 0: + while n >> 1: + n >>= 1 + maxshift += 1 + del n + bytes = sys.maxsize # smallest total size so far + t = tuple(t) # so slices can be dict keys + for shift in range(maxshift + 1): + t1 = [] + t2 = [] + size = 2**shift + bincache = {} + + for i in range(0, len(t), size): + bin = t[i:i + size] + + index = bincache.get(bin) + if index is None: + index = len(t2) + bincache[bin] = index + t2.extend(bin) + t1.append(index >> shift) + + # determine memory size + b = len(t1) * getsize(t1) + len(t2) * getsize(t2) + if b < bytes: + best = t1, t2, shift + bytes = b + t1, t2, shift = best + + print("Best:", end=' ', file=sys.stderr) + dump(t1, t2, shift, bytes) + + # exhaustively verify that the decomposition is correct + mask = 2**shift - 1 + for i in range(len(t)): + assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] + return best + +if __name__ == '__main__': + import urllib2 + + if len(sys.argv) > 1: + print('Always make sure you have the newest UnicodeData.txt!') + unicode_data = open(sys.argv[1], 'r') + else: + print('Downloading...') + reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt') + data = reader.read() + reader.close() + unicode_data = open('UnicodeData.txt', 'w+') + unicode_data.write(data) + unicode_data.seek(0) + + print('Generating...') + generate_unicode_stuff(unicode_data, + open('Unicode.cpp', 'w'), + open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'), + open('../tests/ecma_5/String/string-space-trim.js', 'w'))