michael@0: # -*- coding: utf-8 -*- michael@0: # Based upon makeunicodedata.py michael@0: # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py) michael@0: # written by Fredrik Lundh (fredrik@pythonware.com) michael@0: # michael@0: # Copyright (C) 2011 Tom Schuster michael@0: # michael@0: # This program is free software: you can redistribute it and/or modify michael@0: # it under the terms of the GNU General Public License as published by michael@0: # the Free Software Foundation, either version 3 of the License, or michael@0: # (at your option) any later version. michael@0: # michael@0: # This program is distributed in the hope that it will be useful, michael@0: # but WITHOUT ANY WARRANTY; without even the implied warranty of michael@0: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the michael@0: # GNU General Public License for more details. michael@0: # michael@0: # You should have received a copy of the GNU General Public License michael@0: # along with this program. If not, see . michael@0: michael@0: from __future__ import print_function michael@0: import csv michael@0: import sys michael@0: michael@0: # ECMAScript 5 $ 7.2 michael@0: whitespace = [ michael@0: # python doesn't support using control character names :( michael@0: 0x9, # CHARACTER TABULATION michael@0: 0xb, # LINE TABULATION michael@0: 0xc, # FORM FEED michael@0: ord(u'\N{SPACE}'), michael@0: ord(u'\N{NO-BREAK SPACE}'), michael@0: ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM michael@0: ] michael@0: michael@0: # $ 7.3 michael@0: line_terminator = [ michael@0: 0xa, # LINE FEED michael@0: 0xd, # CARRIAGE RETURN michael@0: ord(u'\N{LINE SEPARATOR}'), michael@0: ord(u'\N{PARAGRAPH SEPARATOR}'), michael@0: ] michael@0: michael@0: # These are also part of IdentifierPart $7.6 michael@0: ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}') michael@0: ZWJ = ord(u'\N{ZERO WIDTH JOINER}') michael@0: michael@0: FLAG_SPACE = 1 << 0 michael@0: FLAG_LETTER = 1 << 1 michael@0: FLAG_IDENTIFIER_PART = 1 << 2 michael@0: michael@0: MAX = 0xffff michael@0: michael@0: public_domain = """ michael@0: /* michael@0: * Any copyright is dedicated to the Public Domain. michael@0: * http://creativecommons.org/licenses/publicdomain/ michael@0: */ michael@0: """ michael@0: michael@0: def read_unicode_data(unicode_file): michael@0: """ michael@0: If you want to understand how this wonderful file format works checkout michael@0: Unicode Standard Annex #44 - Unicode Character Database michael@0: http://www.unicode.org/reports/tr44/ michael@0: """ michael@0: michael@0: reader = csv.reader(unicode_data, delimiter=';') michael@0: michael@0: while True: michael@0: row = reader.next() michael@0: name = row[1] michael@0: michael@0: # We need to expand the UAX #44 4.2.3 Code Point Range michael@0: if name.startswith('<') and name.endswith('First>'): michael@0: next_row = reader.next() michael@0: michael@0: for i in range(int(row[0], 16), int(next_row[0], 16) + 1): michael@0: row[0] = i michael@0: row[1] = name[1:-8] michael@0: michael@0: yield row michael@0: else: michael@0: row[0] = int(row[0], 16) michael@0: yield row michael@0: michael@0: def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space): michael@0: dummy = (0, 0, 0) michael@0: table = [dummy] michael@0: cache = {dummy: 0} michael@0: index = [0] * (MAX + 1) michael@0: test_table = {} michael@0: test_space_table = [] michael@0: michael@0: for row in read_unicode_data(unicode_data): michael@0: code = row[0] michael@0: name = row[1] michael@0: category = row[2] michael@0: alias = row[-5] michael@0: uppercase = row[-3] michael@0: lowercase = row[-2] michael@0: flags = 0 michael@0: michael@0: if code > MAX: michael@0: break michael@0: michael@0: # we combine whitespace and lineterminators because in pratice we don't need them separated michael@0: if category == 'Zs' or code in whitespace or code in line_terminator: michael@0: flags |= FLAG_SPACE michael@0: test_space_table.append(code) michael@0: if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter) michael@0: flags |= FLAG_LETTER michael@0: if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart) michael@0: flags |= FLAG_IDENTIFIER_PART michael@0: michael@0: if uppercase: michael@0: upper = int(uppercase, 16) michael@0: else: michael@0: upper = code michael@0: michael@0: if lowercase: michael@0: lower = int(lowercase, 16) michael@0: else: michael@0: lower = code michael@0: michael@0: test_table[code] = (upper, lower, name, alias) michael@0: michael@0: up_d = upper - code michael@0: low_d = lower - code michael@0: michael@0: assert up_d > -65535 and up_d < 65535 michael@0: assert low_d > -65535 and low_d < 65535 michael@0: michael@0: upper = up_d & 0xffff michael@0: lower = low_d & 0xffff michael@0: michael@0: item = (upper, lower, flags) michael@0: michael@0: i = cache.get(item) michael@0: if i is None: michael@0: assert item not in table michael@0: cache[item] = i = len(table) michael@0: table.append(item) michael@0: index[code] = i michael@0: michael@0: test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') michael@0: test_mapping.write(public_domain) michael@0: test_mapping.write('var mapping = [\n') michael@0: for code in range(0, MAX + 1): michael@0: entry = test_table.get(code) michael@0: michael@0: if entry: michael@0: upper, lower, name, alias = entry michael@0: test_mapping.write(' [' + hex(upper) + ', ' + hex(lower) + '], /* ' + michael@0: name + (' (' + alias + ')' if alias else '') + ' */\n') michael@0: else: michael@0: test_mapping.write(' [' + hex(code) + ', ' + hex(code) + '],\n') michael@0: test_mapping.write('];') michael@0: test_mapping.write(""" michael@0: assertEq(mapping.length, 0x10000); michael@0: for (var i = 0; i <= 0xffff; i++) { michael@0: var char = String.fromCharCode(i); michael@0: var info = mapping[i]; michael@0: michael@0: assertEq(char.toUpperCase().charCodeAt(0), info[0]); michael@0: assertEq(char.toLowerCase().charCodeAt(0), info[1]); michael@0: } michael@0: michael@0: if (typeof reportCompare === "function") michael@0: reportCompare(true, true); michael@0: """) michael@0: michael@0: test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') michael@0: test_space.write(public_domain) michael@0: test_space.write('var onlySpace = String.fromCharCode(' + michael@0: ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n') michael@0: test_space.write(""" michael@0: assertEq(onlySpace.trim(), ""); michael@0: assertEq((onlySpace + 'aaaa').trim(), 'aaaa'); michael@0: assertEq(('aaaa' + onlySpace).trim(), 'aaaa'); michael@0: assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa'); michael@0: michael@0: if (typeof reportCompare === "function") michael@0: reportCompare(true, true); michael@0: """) michael@0: michael@0: index1, index2, shift = splitbins(index) michael@0: michael@0: # Don't forget to update CharInfo in Unicode.cpp if you need to change this michael@0: assert shift == 5 michael@0: michael@0: # verify correctness michael@0: for char in index: michael@0: test = table[index[char]] michael@0: michael@0: idx = index1[char >> shift] michael@0: idx = index2[(idx << shift) + (char & ((1 << shift) - 1))] michael@0: michael@0: assert test == table[idx] michael@0: michael@0: michael@0: comment = """ michael@0: /* michael@0: * So how does indexing work? michael@0: * First let's have a look at a jschar, 16-bits: michael@0: * [................] michael@0: * Step 1: michael@0: * Extracting the upper 11 bits from the jschar. michael@0: * upper = char >> 5 ([***********.....]) michael@0: * Step 2: michael@0: * Using these bits to get an reduced index from index1. michael@0: * index = index1[upper] michael@0: * Step 3: michael@0: * Combining the index and the bottom 5 bits of the original jschar. michael@0: * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++]) michael@0: * michael@0: * The advantage here is that the biggest number in index1 doesn't need 10 bits, michael@0: * but 7 and we save some memory. michael@0: * michael@0: * Step 4: michael@0: * Get the character informations by looking up real_index in js_charinfo. michael@0: * michael@0: * Pseudocode of generation: michael@0: * michael@0: * let table be the mapping of jschar => js_charinfo_index michael@0: * let index1 be an empty array michael@0: * let index2 be an empty array michael@0: * let cache be a hash map michael@0: * michael@0: * while shift is less then maximal amount you can shift 0xffff before it's 0 michael@0: * let chunks be table split in chunks of size 2**shift michael@0: * michael@0: * for every chunk in chunks michael@0: * if chunk is in cache michael@0: * let index be cache[chunk] michael@0: * else michael@0: * let index be the max key of index2 + 1 michael@0: * for element in chunk michael@0: * push element to index2 michael@0: * put index as chunk in cache michael@0: * michael@0: * push index >> shift to index1 michael@0: * michael@0: * increase shift michael@0: * stop if you found the best shift michael@0: */ michael@0: """ michael@0: data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') michael@0: data_file.write(public_domain) michael@0: data_file.write('#include "vm/Unicode.h"\n\n') michael@0: data_file.write('using namespace js;\n') michael@0: data_file.write('using namespace js::unicode;\n') michael@0: data_file.write(comment) michael@0: data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n') michael@0: for d in table: michael@0: data_file.write(' {') michael@0: data_file.write(', '.join((str(e) for e in d))) michael@0: data_file.write('},\n') michael@0: data_file.write('};\n') michael@0: data_file.write('\n') michael@0: michael@0: def dump(data, name, file): michael@0: file.write('const uint8_t unicode::' + name + '[] = {\n') michael@0: michael@0: line = pad = ' ' * 4 michael@0: lines = [] michael@0: for entry in data: michael@0: assert entry < 256 michael@0: s = str(entry) michael@0: s = s.rjust(3) michael@0: michael@0: if len(line + s) + 5 > 99: michael@0: lines.append(line.rstrip()) michael@0: line = pad + s + ', ' michael@0: else: michael@0: line = line + s + ', ' michael@0: lines.append(line.rstrip()) michael@0: michael@0: file.write('\n'.join(lines)) michael@0: file.write('\n};\n') michael@0: michael@0: dump(index1, 'index1', data_file) michael@0: data_file.write('\n') michael@0: dump(index2, 'index2', data_file) michael@0: data_file.write('\n') michael@0: michael@0: data_file.write('\n') michael@0: michael@0: def getsize(data): michael@0: """ return smallest possible integer size for the given array """ michael@0: maxdata = max(data) michael@0: assert maxdata < 2**32 michael@0: michael@0: if maxdata < 256: michael@0: return 1 michael@0: elif maxdata < 65536: michael@0: return 2 michael@0: else: michael@0: return 4 michael@0: michael@0: def splitbins(t): michael@0: """t -> (t1, t2, shift). Split a table to save space. michael@0: michael@0: t is a sequence of ints. This function can be useful to save space if michael@0: many of the ints are the same. t1 and t2 are lists of ints, and shift michael@0: is an int, chosen to minimize the combined size of t1 and t2 (in C michael@0: code), and where for each i in range(len(t)), michael@0: t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] michael@0: where mask is a bitmask isolating the last "shift" bits. michael@0: """ michael@0: michael@0: def dump(t1, t2, shift, bytes): michael@0: print("%d+%d bins at shift %d; %d bytes" % ( michael@0: len(t1), len(t2), shift, bytes), file=sys.stderr) michael@0: print("Size of original table:", len(t)*getsize(t), \ michael@0: "bytes", file=sys.stderr) michael@0: n = len(t)-1 # last valid index michael@0: maxshift = 0 # the most we can shift n and still have something left michael@0: if n > 0: michael@0: while n >> 1: michael@0: n >>= 1 michael@0: maxshift += 1 michael@0: del n michael@0: bytes = sys.maxsize # smallest total size so far michael@0: t = tuple(t) # so slices can be dict keys michael@0: for shift in range(maxshift + 1): michael@0: t1 = [] michael@0: t2 = [] michael@0: size = 2**shift michael@0: bincache = {} michael@0: michael@0: for i in range(0, len(t), size): michael@0: bin = t[i:i + size] michael@0: michael@0: index = bincache.get(bin) michael@0: if index is None: michael@0: index = len(t2) michael@0: bincache[bin] = index michael@0: t2.extend(bin) michael@0: t1.append(index >> shift) michael@0: michael@0: # determine memory size michael@0: b = len(t1) * getsize(t1) + len(t2) * getsize(t2) michael@0: if b < bytes: michael@0: best = t1, t2, shift michael@0: bytes = b michael@0: t1, t2, shift = best michael@0: michael@0: print("Best:", end=' ', file=sys.stderr) michael@0: dump(t1, t2, shift, bytes) michael@0: michael@0: # exhaustively verify that the decomposition is correct michael@0: mask = 2**shift - 1 michael@0: for i in range(len(t)): michael@0: assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] michael@0: return best michael@0: michael@0: if __name__ == '__main__': michael@0: import urllib2 michael@0: michael@0: if len(sys.argv) > 1: michael@0: print('Always make sure you have the newest UnicodeData.txt!') michael@0: unicode_data = open(sys.argv[1], 'r') michael@0: else: michael@0: print('Downloading...') michael@0: reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt') michael@0: data = reader.read() michael@0: reader.close() michael@0: unicode_data = open('UnicodeData.txt', 'w+') michael@0: unicode_data.write(data) michael@0: unicode_data.seek(0) michael@0: michael@0: print('Generating...') michael@0: generate_unicode_stuff(unicode_data, michael@0: open('Unicode.cpp', 'w'), michael@0: open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'), michael@0: open('../tests/ecma_5/String/string-space-trim.js', 'w'))