js/src/vm/make_unicode.py

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/src/vm/make_unicode.py	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,377 @@
     1.4 +# -*- coding: utf-8 -*-
     1.5 +# Based upon makeunicodedata.py
     1.6 +# (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
     1.7 +# written by Fredrik Lundh (fredrik@pythonware.com)
     1.8 +#
     1.9 +#    Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
    1.10 +#
    1.11 +#    This program is free software: you can redistribute it and/or modify
    1.12 +#    it under the terms of the GNU General Public License as published by
    1.13 +#    the Free Software Foundation, either version 3 of the License, or
    1.14 +#    (at your option) any later version.
    1.15 +#
    1.16 +#    This program is distributed in the hope that it will be useful,
    1.17 +#    but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.18 +#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    1.19 +#    GNU General Public License for more details.
    1.20 +#
    1.21 +#    You should have received a copy of the GNU General Public License
    1.22 +#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    1.23 +
    1.24 +from __future__ import print_function
    1.25 +import csv
    1.26 +import sys
    1.27 +
    1.28 +# ECMAScript 5 $ 7.2
    1.29 +whitespace = [
    1.30 +    # python doesn't support using control character names :(
    1.31 +    0x9, # CHARACTER TABULATION
    1.32 +    0xb, # LINE TABULATION
    1.33 +    0xc, # FORM FEED
    1.34 +    ord(u'\N{SPACE}'),
    1.35 +    ord(u'\N{NO-BREAK SPACE}'),
    1.36 +    ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM
    1.37 +]
    1.38 +
    1.39 +# $ 7.3
    1.40 +line_terminator = [
    1.41 +    0xa, # LINE FEED
    1.42 +    0xd, # CARRIAGE RETURN
    1.43 +    ord(u'\N{LINE SEPARATOR}'),
    1.44 +    ord(u'\N{PARAGRAPH SEPARATOR}'),
    1.45 +]
    1.46 +
    1.47 +# These are also part of IdentifierPart $7.6
    1.48 +ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}')
    1.49 +ZWJ = ord(u'\N{ZERO WIDTH JOINER}')
    1.50 +
    1.51 +FLAG_SPACE = 1 << 0
    1.52 +FLAG_LETTER = 1 << 1
    1.53 +FLAG_IDENTIFIER_PART = 1 << 2
    1.54 +
    1.55 +MAX = 0xffff
    1.56 +
    1.57 +public_domain = """
    1.58 +/*
    1.59 + * Any copyright is dedicated to the Public Domain.
    1.60 + * http://creativecommons.org/licenses/publicdomain/
    1.61 + */
    1.62 +"""
    1.63 +
    1.64 +def read_unicode_data(unicode_file):
    1.65 +    """
    1.66 +        If you want to understand how this wonderful file format works checkout
    1.67 +          Unicode Standard Annex #44 - Unicode Character Database
    1.68 +          http://www.unicode.org/reports/tr44/
    1.69 +    """
    1.70 +
    1.71 +    reader = csv.reader(unicode_data, delimiter=';')
    1.72 +
    1.73 +    while True:
    1.74 +        row = reader.next()
    1.75 +        name = row[1]
    1.76 +
    1.77 +        # We need to expand the UAX #44 4.2.3 Code Point Range
    1.78 +        if name.startswith('<') and name.endswith('First>'):
    1.79 +            next_row = reader.next()
    1.80 +
    1.81 +            for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
    1.82 +                row[0] = i
    1.83 +                row[1] = name[1:-8]
    1.84 +
    1.85 +                yield row
    1.86 +        else:
    1.87 +            row[0] = int(row[0], 16)
    1.88 +            yield row
    1.89 +
    1.90 +def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space):
    1.91 +    dummy = (0, 0, 0)
    1.92 +    table = [dummy]
    1.93 +    cache = {dummy: 0}
    1.94 +    index = [0] * (MAX + 1)
    1.95 +    test_table = {}
    1.96 +    test_space_table = []
    1.97 +
    1.98 +    for row in read_unicode_data(unicode_data):
    1.99 +        code = row[0]
   1.100 +        name = row[1]
   1.101 +        category = row[2]
   1.102 +        alias = row[-5]
   1.103 +        uppercase = row[-3]
   1.104 +        lowercase = row[-2]
   1.105 +        flags = 0
   1.106 +
   1.107 +        if code > MAX:
   1.108 +            break
   1.109 +
   1.110 +        # we combine whitespace and lineterminators because in pratice we don't need them separated
   1.111 +        if category == 'Zs' or code in whitespace or code in line_terminator:
   1.112 +            flags |= FLAG_SPACE
   1.113 +            test_space_table.append(code)
   1.114 +        if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)
   1.115 +            flags |= FLAG_LETTER
   1.116 +        if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)
   1.117 +            flags |= FLAG_IDENTIFIER_PART
   1.118 +
   1.119 +        if uppercase:
   1.120 +            upper = int(uppercase, 16)
   1.121 +        else:
   1.122 +            upper = code
   1.123 +
   1.124 +        if lowercase:
   1.125 +            lower = int(lowercase, 16)
   1.126 +        else:
   1.127 +            lower = code
   1.128 +
   1.129 +        test_table[code] = (upper, lower, name, alias)
   1.130 +
   1.131 +        up_d = upper - code
   1.132 +        low_d = lower - code
   1.133 +
   1.134 +        assert up_d > -65535 and up_d < 65535
   1.135 +        assert low_d > -65535 and low_d < 65535
   1.136 +
   1.137 +        upper = up_d & 0xffff
   1.138 +        lower = low_d & 0xffff
   1.139 +
   1.140 +        item = (upper, lower, flags)
   1.141 +
   1.142 +        i = cache.get(item)
   1.143 +        if i is None:
   1.144 +            assert item not in table
   1.145 +            cache[item] = i = len(table)
   1.146 +            table.append(item)
   1.147 +        index[code] = i
   1.148 +
   1.149 +    test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
   1.150 +    test_mapping.write(public_domain)
   1.151 +    test_mapping.write('var mapping = [\n')
   1.152 +    for code in range(0, MAX + 1):
   1.153 +        entry = test_table.get(code)
   1.154 +
   1.155 +        if entry:
   1.156 +            upper, lower, name, alias = entry
   1.157 +            test_mapping.write('  [' + hex(upper) + ', ' + hex(lower) + '], /* ' +
   1.158 +                       name + (' (' + alias + ')' if alias else '') + ' */\n')
   1.159 +        else:
   1.160 +            test_mapping.write('  [' + hex(code) + ', ' + hex(code) + '],\n')
   1.161 +    test_mapping.write('];')
   1.162 +    test_mapping.write("""
   1.163 +assertEq(mapping.length, 0x10000);
   1.164 +for (var i = 0; i <= 0xffff; i++) {
   1.165 +    var char = String.fromCharCode(i);
   1.166 +    var info = mapping[i];
   1.167 +
   1.168 +    assertEq(char.toUpperCase().charCodeAt(0), info[0]);
   1.169 +    assertEq(char.toLowerCase().charCodeAt(0), info[1]);
   1.170 +}
   1.171 +
   1.172 +if (typeof reportCompare === "function")
   1.173 +    reportCompare(true, true);
   1.174 +""")
   1.175 +
   1.176 +    test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
   1.177 +    test_space.write(public_domain)
   1.178 +    test_space.write('var onlySpace = String.fromCharCode(' +
   1.179 +                     ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n')
   1.180 +    test_space.write("""
   1.181 +assertEq(onlySpace.trim(), "");
   1.182 +assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
   1.183 +assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
   1.184 +assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');
   1.185 +
   1.186 +if (typeof reportCompare === "function")
   1.187 +    reportCompare(true, true);
   1.188 +""")
   1.189 +
   1.190 +    index1, index2, shift = splitbins(index)
   1.191 +
   1.192 +    # Don't forget to update CharInfo in Unicode.cpp if you need to change this
   1.193 +    assert shift == 5
   1.194 +
   1.195 +    # verify correctness
   1.196 +    for char in index:
   1.197 +        test = table[index[char]]
   1.198 +
   1.199 +        idx = index1[char >> shift]
   1.200 +        idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]
   1.201 +
   1.202 +        assert test == table[idx]
   1.203 +
   1.204 +
   1.205 +    comment = """
   1.206 +/*
   1.207 + * So how does indexing work?
   1.208 + * First let's have a look at a jschar, 16-bits:
   1.209 + *              [................]
   1.210 + * Step 1:
   1.211 + *  Extracting the upper 11 bits from the jschar.
   1.212 + *   upper = char >>  5 ([***********.....])
   1.213 + * Step 2:
   1.214 + *  Using these bits to get an reduced index from index1.
   1.215 + *   index = index1[upper]
   1.216 + * Step 3:
   1.217 + *  Combining the index and the bottom 5 bits of the original jschar.
   1.218 + *   real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
   1.219 + *
   1.220 + * The advantage here is that the biggest number in index1 doesn't need 10 bits,
   1.221 + * but 7 and we save some memory.
   1.222 + *
   1.223 + * Step 4:
   1.224 + *  Get the character informations by looking up real_index in js_charinfo.
   1.225 + *
   1.226 + * Pseudocode of generation:
   1.227 + *
   1.228 + * let table be the mapping of jschar => js_charinfo_index
   1.229 + * let index1 be an empty array
   1.230 + * let index2 be an empty array
   1.231 + * let cache be a hash map
   1.232 + *
   1.233 + * while shift is less then maximal amount you can shift 0xffff before it's 0
   1.234 + *  let chunks be table split in chunks of size 2**shift
   1.235 + *
   1.236 + *  for every chunk in chunks
   1.237 + *   if chunk is in cache
   1.238 + *    let index be cache[chunk]
   1.239 + *   else
   1.240 + *    let index be the max key of index2 + 1
   1.241 + *    for element in chunk
   1.242 + *     push element to index2
   1.243 + *    put index as chunk in cache
   1.244 + *
   1.245 + *   push index >> shift to index1
   1.246 + *
   1.247 + *  increase shift
   1.248 + *  stop if you found the best shift
   1.249 + */
   1.250 +"""
   1.251 +    data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
   1.252 +    data_file.write(public_domain)
   1.253 +    data_file.write('#include "vm/Unicode.h"\n\n')
   1.254 +    data_file.write('using namespace js;\n')
   1.255 +    data_file.write('using namespace js::unicode;\n')
   1.256 +    data_file.write(comment)
   1.257 +    data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n')
   1.258 +    for d in table:
   1.259 +        data_file.write('    {')
   1.260 +        data_file.write(', '.join((str(e) for e in d)))
   1.261 +        data_file.write('},\n')
   1.262 +    data_file.write('};\n')
   1.263 +    data_file.write('\n')
   1.264 +
   1.265 +    def dump(data, name, file):
   1.266 +        file.write('const uint8_t unicode::' + name + '[] = {\n')
   1.267 +
   1.268 +        line = pad = ' ' * 4
   1.269 +        lines = []
   1.270 +        for entry in data:
   1.271 +            assert entry < 256
   1.272 +            s = str(entry)
   1.273 +            s = s.rjust(3)
   1.274 +
   1.275 +            if len(line + s) + 5 > 99:
   1.276 +                lines.append(line.rstrip())
   1.277 +                line = pad + s + ', '
   1.278 +            else:
   1.279 +                line = line + s + ', '
   1.280 +        lines.append(line.rstrip())
   1.281 +
   1.282 +        file.write('\n'.join(lines))
   1.283 +        file.write('\n};\n')
   1.284 +
   1.285 +    dump(index1, 'index1', data_file)
   1.286 +    data_file.write('\n')
   1.287 +    dump(index2, 'index2', data_file)
   1.288 +    data_file.write('\n')
   1.289 +
   1.290 +    data_file.write('\n')
   1.291 +
   1.292 +def getsize(data):
   1.293 +    """ return smallest possible integer size for the given array """
   1.294 +    maxdata = max(data)
   1.295 +    assert maxdata < 2**32
   1.296 +
   1.297 +    if maxdata < 256:
   1.298 +        return 1
   1.299 +    elif maxdata < 65536:
   1.300 +        return 2
   1.301 +    else:
   1.302 +        return 4
   1.303 +
   1.304 +def splitbins(t):
   1.305 +    """t -> (t1, t2, shift).  Split a table to save space.
   1.306 +
   1.307 +    t is a sequence of ints.  This function can be useful to save space if
   1.308 +    many of the ints are the same.  t1 and t2 are lists of ints, and shift
   1.309 +    is an int, chosen to minimize the combined size of t1 and t2 (in C
   1.310 +    code), and where for each i in range(len(t)),
   1.311 +        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
   1.312 +    where mask is a bitmask isolating the last "shift" bits.
   1.313 +    """
   1.314 +
   1.315 +    def dump(t1, t2, shift, bytes):
   1.316 +        print("%d+%d bins at shift %d; %d bytes" % (
   1.317 +            len(t1), len(t2), shift, bytes), file=sys.stderr)
   1.318 +        print("Size of original table:", len(t)*getsize(t), \
   1.319 +            "bytes", file=sys.stderr)
   1.320 +    n = len(t)-1    # last valid index
   1.321 +    maxshift = 0    # the most we can shift n and still have something left
   1.322 +    if n > 0:
   1.323 +        while n >> 1:
   1.324 +            n >>= 1
   1.325 +            maxshift += 1
   1.326 +    del n
   1.327 +    bytes = sys.maxsize  # smallest total size so far
   1.328 +    t = tuple(t)    # so slices can be dict keys
   1.329 +    for shift in range(maxshift + 1):
   1.330 +        t1 = []
   1.331 +        t2 = []
   1.332 +        size = 2**shift
   1.333 +        bincache = {}
   1.334 +
   1.335 +        for i in range(0, len(t), size):
   1.336 +            bin = t[i:i + size]
   1.337 +
   1.338 +            index = bincache.get(bin)
   1.339 +            if index is None:
   1.340 +                index = len(t2)
   1.341 +                bincache[bin] = index
   1.342 +                t2.extend(bin)
   1.343 +            t1.append(index >> shift)
   1.344 +
   1.345 +        # determine memory size
   1.346 +        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
   1.347 +        if b < bytes:
   1.348 +            best = t1, t2, shift
   1.349 +            bytes = b
   1.350 +    t1, t2, shift = best
   1.351 +
   1.352 +    print("Best:", end=' ', file=sys.stderr)
   1.353 +    dump(t1, t2, shift, bytes)
   1.354 +
   1.355 +    # exhaustively verify that the decomposition is correct
   1.356 +    mask = 2**shift - 1
   1.357 +    for i in range(len(t)):
   1.358 +        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
   1.359 +    return best
   1.360 +
   1.361 +if __name__ == '__main__':
   1.362 +    import urllib2
   1.363 +
   1.364 +    if len(sys.argv) > 1:
   1.365 +        print('Always make sure you have the newest UnicodeData.txt!')
   1.366 +        unicode_data = open(sys.argv[1], 'r')
   1.367 +    else:
   1.368 +        print('Downloading...')
   1.369 +        reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt')
   1.370 +        data = reader.read()
   1.371 +        reader.close()
   1.372 +        unicode_data = open('UnicodeData.txt', 'w+')
   1.373 +        unicode_data.write(data)
   1.374 +        unicode_data.seek(0)
   1.375 +
   1.376 +    print('Generating...')
   1.377 +    generate_unicode_stuff(unicode_data,
   1.378 +        open('Unicode.cpp', 'w'),
   1.379 +        open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'),
   1.380 +        open('../tests/ecma_5/String/string-space-trim.js', 'w'))

mercurial