js/src/vm/make_unicode.py

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 # -*- coding: utf-8 -*-
     2 # Based upon makeunicodedata.py
     3 # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
     4 # written by Fredrik Lundh (fredrik@pythonware.com)
     5 #
     6 #    Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
     7 #
     8 #    This program is free software: you can redistribute it and/or modify
     9 #    it under the terms of the GNU General Public License as published by
    10 #    the Free Software Foundation, either version 3 of the License, or
    11 #    (at your option) any later version.
    12 #
    13 #    This program is distributed in the hope that it will be useful,
    14 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
    15 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    16 #    GNU General Public License for more details.
    17 #
    18 #    You should have received a copy of the GNU General Public License
    19 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    21 from __future__ import print_function
    22 import csv
    23 import sys
    25 # ECMAScript 5 $ 7.2
    26 whitespace = [
    27     # python doesn't support using control character names :(
    28     0x9, # CHARACTER TABULATION
    29     0xb, # LINE TABULATION
    30     0xc, # FORM FEED
    31     ord(u'\N{SPACE}'),
    32     ord(u'\N{NO-BREAK SPACE}'),
    33     ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM
    34 ]
    36 # $ 7.3
    37 line_terminator = [
    38     0xa, # LINE FEED
    39     0xd, # CARRIAGE RETURN
    40     ord(u'\N{LINE SEPARATOR}'),
    41     ord(u'\N{PARAGRAPH SEPARATOR}'),
    42 ]
    44 # These are also part of IdentifierPart $7.6
    45 ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}')
    46 ZWJ = ord(u'\N{ZERO WIDTH JOINER}')
    48 FLAG_SPACE = 1 << 0
    49 FLAG_LETTER = 1 << 1
    50 FLAG_IDENTIFIER_PART = 1 << 2
    52 MAX = 0xffff
    54 public_domain = """
    55 /*
    56  * Any copyright is dedicated to the Public Domain.
    57  * http://creativecommons.org/licenses/publicdomain/
    58  */
    59 """
    61 def read_unicode_data(unicode_file):
    62     """
    63         If you want to understand how this wonderful file format works checkout
    64           Unicode Standard Annex #44 - Unicode Character Database
    65           http://www.unicode.org/reports/tr44/
    66     """
    68     reader = csv.reader(unicode_data, delimiter=';')
    70     while True:
    71         row = reader.next()
    72         name = row[1]
    74         # We need to expand the UAX #44 4.2.3 Code Point Range
    75         if name.startswith('<') and name.endswith('First>'):
    76             next_row = reader.next()
    78             for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
    79                 row[0] = i
    80                 row[1] = name[1:-8]
    82                 yield row
    83         else:
    84             row[0] = int(row[0], 16)
    85             yield row
    87 def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space):
    88     dummy = (0, 0, 0)
    89     table = [dummy]
    90     cache = {dummy: 0}
    91     index = [0] * (MAX + 1)
    92     test_table = {}
    93     test_space_table = []
    95     for row in read_unicode_data(unicode_data):
    96         code = row[0]
    97         name = row[1]
    98         category = row[2]
    99         alias = row[-5]
   100         uppercase = row[-3]
   101         lowercase = row[-2]
   102         flags = 0
   104         if code > MAX:
   105             break
   107         # we combine whitespace and lineterminators because in pratice we don't need them separated
   108         if category == 'Zs' or code in whitespace or code in line_terminator:
   109             flags |= FLAG_SPACE
   110             test_space_table.append(code)
   111         if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)
   112             flags |= FLAG_LETTER
   113         if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)
   114             flags |= FLAG_IDENTIFIER_PART
   116         if uppercase:
   117             upper = int(uppercase, 16)
   118         else:
   119             upper = code
   121         if lowercase:
   122             lower = int(lowercase, 16)
   123         else:
   124             lower = code
   126         test_table[code] = (upper, lower, name, alias)
   128         up_d = upper - code
   129         low_d = lower - code
   131         assert up_d > -65535 and up_d < 65535
   132         assert low_d > -65535 and low_d < 65535
   134         upper = up_d & 0xffff
   135         lower = low_d & 0xffff
   137         item = (upper, lower, flags)
   139         i = cache.get(item)
   140         if i is None:
   141             assert item not in table
   142             cache[item] = i = len(table)
   143             table.append(item)
   144         index[code] = i
   146     test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
   147     test_mapping.write(public_domain)
   148     test_mapping.write('var mapping = [\n')
   149     for code in range(0, MAX + 1):
   150         entry = test_table.get(code)
   152         if entry:
   153             upper, lower, name, alias = entry
   154             test_mapping.write('  [' + hex(upper) + ', ' + hex(lower) + '], /* ' +
   155                        name + (' (' + alias + ')' if alias else '') + ' */\n')
   156         else:
   157             test_mapping.write('  [' + hex(code) + ', ' + hex(code) + '],\n')
   158     test_mapping.write('];')
   159     test_mapping.write("""
   160 assertEq(mapping.length, 0x10000);
   161 for (var i = 0; i <= 0xffff; i++) {
   162     var char = String.fromCharCode(i);
   163     var info = mapping[i];
   165     assertEq(char.toUpperCase().charCodeAt(0), info[0]);
   166     assertEq(char.toLowerCase().charCodeAt(0), info[1]);
   167 }
   169 if (typeof reportCompare === "function")
   170     reportCompare(true, true);
   171 """)
   173     test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
   174     test_space.write(public_domain)
   175     test_space.write('var onlySpace = String.fromCharCode(' +
   176                      ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n')
   177     test_space.write("""
   178 assertEq(onlySpace.trim(), "");
   179 assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
   180 assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
   181 assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');
   183 if (typeof reportCompare === "function")
   184     reportCompare(true, true);
   185 """)
   187     index1, index2, shift = splitbins(index)
   189     # Don't forget to update CharInfo in Unicode.cpp if you need to change this
   190     assert shift == 5
   192     # verify correctness
   193     for char in index:
   194         test = table[index[char]]
   196         idx = index1[char >> shift]
   197         idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]
   199         assert test == table[idx]
   202     comment = """
   203 /*
   204  * So how does indexing work?
   205  * First let's have a look at a jschar, 16-bits:
   206  *              [................]
   207  * Step 1:
   208  *  Extracting the upper 11 bits from the jschar.
   209  *   upper = char >>  5 ([***********.....])
   210  * Step 2:
   211  *  Using these bits to get an reduced index from index1.
   212  *   index = index1[upper]
   213  * Step 3:
   214  *  Combining the index and the bottom 5 bits of the original jschar.
   215  *   real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
   216  *
   217  * The advantage here is that the biggest number in index1 doesn't need 10 bits,
   218  * but 7 and we save some memory.
   219  *
   220  * Step 4:
   221  *  Get the character informations by looking up real_index in js_charinfo.
   222  *
   223  * Pseudocode of generation:
   224  *
   225  * let table be the mapping of jschar => js_charinfo_index
   226  * let index1 be an empty array
   227  * let index2 be an empty array
   228  * let cache be a hash map
   229  *
   230  * while shift is less then maximal amount you can shift 0xffff before it's 0
   231  *  let chunks be table split in chunks of size 2**shift
   232  *
   233  *  for every chunk in chunks
   234  *   if chunk is in cache
   235  *    let index be cache[chunk]
   236  *   else
   237  *    let index be the max key of index2 + 1
   238  *    for element in chunk
   239  *     push element to index2
   240  *    put index as chunk in cache
   241  *
   242  *   push index >> shift to index1
   243  *
   244  *  increase shift
   245  *  stop if you found the best shift
   246  */
   247 """
   248     data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
   249     data_file.write(public_domain)
   250     data_file.write('#include "vm/Unicode.h"\n\n')
   251     data_file.write('using namespace js;\n')
   252     data_file.write('using namespace js::unicode;\n')
   253     data_file.write(comment)
   254     data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n')
   255     for d in table:
   256         data_file.write('    {')
   257         data_file.write(', '.join((str(e) for e in d)))
   258         data_file.write('},\n')
   259     data_file.write('};\n')
   260     data_file.write('\n')
   262     def dump(data, name, file):
   263         file.write('const uint8_t unicode::' + name + '[] = {\n')
   265         line = pad = ' ' * 4
   266         lines = []
   267         for entry in data:
   268             assert entry < 256
   269             s = str(entry)
   270             s = s.rjust(3)
   272             if len(line + s) + 5 > 99:
   273                 lines.append(line.rstrip())
   274                 line = pad + s + ', '
   275             else:
   276                 line = line + s + ', '
   277         lines.append(line.rstrip())
   279         file.write('\n'.join(lines))
   280         file.write('\n};\n')
   282     dump(index1, 'index1', data_file)
   283     data_file.write('\n')
   284     dump(index2, 'index2', data_file)
   285     data_file.write('\n')
   287     data_file.write('\n')
   289 def getsize(data):
   290     """ return smallest possible integer size for the given array """
   291     maxdata = max(data)
   292     assert maxdata < 2**32
   294     if maxdata < 256:
   295         return 1
   296     elif maxdata < 65536:
   297         return 2
   298     else:
   299         return 4
   301 def splitbins(t):
   302     """t -> (t1, t2, shift).  Split a table to save space.
   304     t is a sequence of ints.  This function can be useful to save space if
   305     many of the ints are the same.  t1 and t2 are lists of ints, and shift
   306     is an int, chosen to minimize the combined size of t1 and t2 (in C
   307     code), and where for each i in range(len(t)),
   308         t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
   309     where mask is a bitmask isolating the last "shift" bits.
   310     """
   312     def dump(t1, t2, shift, bytes):
   313         print("%d+%d bins at shift %d; %d bytes" % (
   314             len(t1), len(t2), shift, bytes), file=sys.stderr)
   315         print("Size of original table:", len(t)*getsize(t), \
   316             "bytes", file=sys.stderr)
   317     n = len(t)-1    # last valid index
   318     maxshift = 0    # the most we can shift n and still have something left
   319     if n > 0:
   320         while n >> 1:
   321             n >>= 1
   322             maxshift += 1
   323     del n
   324     bytes = sys.maxsize  # smallest total size so far
   325     t = tuple(t)    # so slices can be dict keys
   326     for shift in range(maxshift + 1):
   327         t1 = []
   328         t2 = []
   329         size = 2**shift
   330         bincache = {}
   332         for i in range(0, len(t), size):
   333             bin = t[i:i + size]
   335             index = bincache.get(bin)
   336             if index is None:
   337                 index = len(t2)
   338                 bincache[bin] = index
   339                 t2.extend(bin)
   340             t1.append(index >> shift)
   342         # determine memory size
   343         b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
   344         if b < bytes:
   345             best = t1, t2, shift
   346             bytes = b
   347     t1, t2, shift = best
   349     print("Best:", end=' ', file=sys.stderr)
   350     dump(t1, t2, shift, bytes)
   352     # exhaustively verify that the decomposition is correct
   353     mask = 2**shift - 1
   354     for i in range(len(t)):
   355         assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
   356     return best
   358 if __name__ == '__main__':
   359     import urllib2
   361     if len(sys.argv) > 1:
   362         print('Always make sure you have the newest UnicodeData.txt!')
   363         unicode_data = open(sys.argv[1], 'r')
   364     else:
   365         print('Downloading...')
   366         reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt')
   367         data = reader.read()
   368         reader.close()
   369         unicode_data = open('UnicodeData.txt', 'w+')
   370         unicode_data.write(data)
   371         unicode_data.seek(0)
   373     print('Generating...')
   374     generate_unicode_stuff(unicode_data,
   375         open('Unicode.cpp', 'w'),
   376         open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'),
   377         open('../tests/ecma_5/String/string-space-trim.js', 'w'))

mercurial