The Tor Browser: js/src/vm/make_unicode.py@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 # -*- coding: utf-8 -*-

     2 # Based upon makeunicodedata.py

     3 # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)

     4 # written by Fredrik Lundh (fredrik@pythonware.com)

5 #

     6 #    Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>

7 #

     8 #    This program is free software: you can redistribute it and/or modify

     9 #    it under the terms of the GNU General Public License as published by

    10 #    the Free Software Foundation, either version 3 of the License, or

    11 #    (at your option) any later version.

    12 #

    13 #    This program is distributed in the hope that it will be useful,

    14 #    but WITHOUT ANY WARRANTY; without even the implied warranty of

    15 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

    16 #    GNU General Public License for more details.

    17 #

    18 #    You should have received a copy of the GNU General Public License

    19 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    21 from __future__ import print_function

    22 import csv

    23 import sys

    25 # ECMAScript 5 $ 7.2

    26 whitespace = [

    27     # python doesn't support using control character names :(

    28     0x9, # CHARACTER TABULATION

    29     0xb, # LINE TABULATION

    30     0xc, # FORM FEED

    31     ord(u'\N{SPACE}'),

    32     ord(u'\N{NO-BREAK SPACE}'),

    33     ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM

    34 ]

    36 # $ 7.3

    37 line_terminator = [

    38     0xa, # LINE FEED

    39     0xd, # CARRIAGE RETURN

    40     ord(u'\N{LINE SEPARATOR}'),

    41     ord(u'\N{PARAGRAPH SEPARATOR}'),

    42 ]

    44 # These are also part of IdentifierPart $7.6

    45 ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}')

    46 ZWJ = ord(u'\N{ZERO WIDTH JOINER}')

    48 FLAG_SPACE = 1 << 0

    49 FLAG_LETTER = 1 << 1

    50 FLAG_IDENTIFIER_PART = 1 << 2

    52 MAX = 0xffff

    54 public_domain = """

    55 /*

    56  * Any copyright is dedicated to the Public Domain.

    57  * http://creativecommons.org/licenses/publicdomain/

    58  */

    59 """

    61 def read_unicode_data(unicode_file):

    62     """

    63         If you want to understand how this wonderful file format works checkout

    64           Unicode Standard Annex #44 - Unicode Character Database

    65           http://www.unicode.org/reports/tr44/

    66     """

    68     reader = csv.reader(unicode_data, delimiter=';')

    70     while True:

    71         row = reader.next()

    72         name = row[1]

    74         # We need to expand the UAX #44 4.2.3 Code Point Range

    75         if name.startswith('<') and name.endswith('First>'):

    76             next_row = reader.next()

    78             for i in range(int(row[0], 16), int(next_row[0], 16) + 1):

    79                 row[0] = i

    80                 row[1] = name[1:-8]

    82                 yield row

    83         else:

    84             row[0] = int(row[0], 16)

    85             yield row

    87 def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space):

    88     dummy = (0, 0, 0)

    89     table = [dummy]

    90     cache = {dummy: 0}

    91     index = [0] * (MAX + 1)

    92     test_table = {}

    93     test_space_table = []

    95     for row in read_unicode_data(unicode_data):

    96         code = row[0]

    97         name = row[1]

    98         category = row[2]

    99         alias = row[-5]

   100         uppercase = row[-3]

   101         lowercase = row[-2]

   102         flags = 0

   104         if code > MAX:

   105             break

   107         # we combine whitespace and lineterminators because in pratice we don't need them separated

   108         if category == 'Zs' or code in whitespace or code in line_terminator:

   109             flags |= FLAG_SPACE

   110             test_space_table.append(code)

   111         if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)

   112             flags |= FLAG_LETTER

   113         if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)

   114             flags |= FLAG_IDENTIFIER_PART

   116         if uppercase:

   117             upper = int(uppercase, 16)

   118         else:

   119             upper = code

   121         if lowercase:

   122             lower = int(lowercase, 16)

   123         else:

   124             lower = code

   126         test_table[code] = (upper, lower, name, alias)

   128         up_d = upper - code

   129         low_d = lower - code

   131         assert up_d > -65535 and up_d < 65535

   132         assert low_d > -65535 and low_d < 65535

   134         upper = up_d & 0xffff

   135         lower = low_d & 0xffff

   137         item = (upper, lower, flags)

   139         i = cache.get(item)

   140         if i is None:

   141             assert item not in table

   142             cache[item] = i = len(table)

   143             table.append(item)

   144         index[code] = i

   146     test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')

   147     test_mapping.write(public_domain)

   148     test_mapping.write('var mapping = [\n')

   149     for code in range(0, MAX + 1):

   150         entry = test_table.get(code)

   152         if entry:

   153             upper, lower, name, alias = entry

   154             test_mapping.write('  [' + hex(upper) + ', ' + hex(lower) + '], /* ' +

   155                        name + (' (' + alias + ')' if alias else '') + ' */\n')

   156         else:

   157             test_mapping.write('  [' + hex(code) + ', ' + hex(code) + '],\n')

   158     test_mapping.write('];')

   159     test_mapping.write("""

   160 assertEq(mapping.length, 0x10000);

   161 for (var i = 0; i <= 0xffff; i++) {

   162     var char = String.fromCharCode(i);

   163     var info = mapping[i];

   165     assertEq(char.toUpperCase().charCodeAt(0), info[0]);

   166     assertEq(char.toLowerCase().charCodeAt(0), info[1]);

   167 }

   169 if (typeof reportCompare === "function")

   170     reportCompare(true, true);

   171 """)

   173     test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')

   174     test_space.write(public_domain)

   175     test_space.write('var onlySpace = String.fromCharCode(' +

   176                      ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n')

   177     test_space.write("""

   178 assertEq(onlySpace.trim(), "");

   179 assertEq((onlySpace + 'aaaa').trim(), 'aaaa');

   180 assertEq(('aaaa' + onlySpace).trim(), 'aaaa');

   181 assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');

   183 if (typeof reportCompare === "function")

   184     reportCompare(true, true);

   185 """)

   187     index1, index2, shift = splitbins(index)

   189     # Don't forget to update CharInfo in Unicode.cpp if you need to change this

   190     assert shift == 5

   192     # verify correctness

   193     for char in index:

   194         test = table[index[char]]

   196         idx = index1[char >> shift]

   197         idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]

   199         assert test == table[idx]

   202     comment = """

   203 /*

   204  * So how does indexing work?

   205  * First let's have a look at a jschar, 16-bits:

   206  *              [................]

   207  * Step 1:

   208  *  Extracting the upper 11 bits from the jschar.

   209  *   upper = char >>  5 ([***********.....])

   210  * Step 2:

   211  *  Using these bits to get an reduced index from index1.

   212  *   index = index1[upper]

   213  * Step 3:

   214  *  Combining the index and the bottom 5 bits of the original jschar.

   215  *   real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])

   216  *

   217  * The advantage here is that the biggest number in index1 doesn't need 10 bits,

   218  * but 7 and we save some memory.

   219  *

   220  * Step 4:

   221  *  Get the character informations by looking up real_index in js_charinfo.

   222  *

   223  * Pseudocode of generation:

   224  *

   225  * let table be the mapping of jschar => js_charinfo_index

   226  * let index1 be an empty array

   227  * let index2 be an empty array

   228  * let cache be a hash map

   229  *

   230  * while shift is less then maximal amount you can shift 0xffff before it's 0

   231  *  let chunks be table split in chunks of size 2**shift

   232  *

   233  *  for every chunk in chunks

   234  *   if chunk is in cache

   235  *    let index be cache[chunk]

   236  *   else

   237  *    let index be the max key of index2 + 1

   238  *    for element in chunk

   239  *     push element to index2

   240  *    put index as chunk in cache

   241  *

   242  *   push index >> shift to index1

   243  *

   244  *  increase shift

   245  *  stop if you found the best shift

   246  */

   247 """

   248     data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')

   249     data_file.write(public_domain)

   250     data_file.write('#include "vm/Unicode.h"\n\n')

   251     data_file.write('using namespace js;\n')

   252     data_file.write('using namespace js::unicode;\n')

   253     data_file.write(comment)

   254     data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n')

   255     for d in table:

   256         data_file.write('    {')

   257         data_file.write(', '.join((str(e) for e in d)))

   258         data_file.write('},\n')

   259     data_file.write('};\n')

   260     data_file.write('\n')

   262     def dump(data, name, file):

   263         file.write('const uint8_t unicode::' + name + '[] = {\n')

   265         line = pad = ' ' * 4

   266         lines = []

   267         for entry in data:

   268             assert entry < 256

   269             s = str(entry)

   270             s = s.rjust(3)

   272             if len(line + s) + 5 > 99:

   273                 lines.append(line.rstrip())

   274                 line = pad + s + ', '

   275             else:

   276                 line = line + s + ', '

   277         lines.append(line.rstrip())

   279         file.write('\n'.join(lines))

   280         file.write('\n};\n')

   282     dump(index1, 'index1', data_file)

   283     data_file.write('\n')

   284     dump(index2, 'index2', data_file)

   285     data_file.write('\n')

   287     data_file.write('\n')

   289 def getsize(data):

   290     """ return smallest possible integer size for the given array """

   291     maxdata = max(data)

   292     assert maxdata < 2**32

   294     if maxdata < 256:

   295         return 1

   296     elif maxdata < 65536:

   297         return 2

   298     else:

   299         return 4

   301 def splitbins(t):

   302     """t -> (t1, t2, shift).  Split a table to save space.

   304     t is a sequence of ints.  This function can be useful to save space if

   305     many of the ints are the same.  t1 and t2 are lists of ints, and shift

   306     is an int, chosen to minimize the combined size of t1 and t2 (in C

   307     code), and where for each i in range(len(t)),

   308         t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

   309     where mask is a bitmask isolating the last "shift" bits.

   310     """

   312     def dump(t1, t2, shift, bytes):

   313         print("%d+%d bins at shift %d; %d bytes" % (

   314             len(t1), len(t2), shift, bytes), file=sys.stderr)

   315         print("Size of original table:", len(t)*getsize(t), \

   316             "bytes", file=sys.stderr)

   317     n = len(t)-1    # last valid index

   318     maxshift = 0    # the most we can shift n and still have something left

   319     if n > 0:

   320         while n >> 1:

   321             n >>= 1

   322             maxshift += 1

   323     del n

   324     bytes = sys.maxsize  # smallest total size so far

   325     t = tuple(t)    # so slices can be dict keys

   326     for shift in range(maxshift + 1):

   327         t1 = []

   328         t2 = []

   329         size = 2**shift

   330         bincache = {}

   332         for i in range(0, len(t), size):

   333             bin = t[i:i + size]

   335             index = bincache.get(bin)

   336             if index is None:

   337                 index = len(t2)

   338                 bincache[bin] = index

   339                 t2.extend(bin)

   340             t1.append(index >> shift)

   342         # determine memory size

   343         b = len(t1) * getsize(t1) + len(t2) * getsize(t2)

   344         if b < bytes:

   345             best = t1, t2, shift

   346             bytes = b

   347     t1, t2, shift = best

   349     print("Best:", end=' ', file=sys.stderr)

   350     dump(t1, t2, shift, bytes)

   352     # exhaustively verify that the decomposition is correct

   353     mask = 2**shift - 1

   354     for i in range(len(t)):

   355         assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

   356     return best

   358 if __name__ == '__main__':

   359     import urllib2

   361     if len(sys.argv) > 1:

   362         print('Always make sure you have the newest UnicodeData.txt!')

   363         unicode_data = open(sys.argv[1], 'r')

   364     else:

   365         print('Downloading...')

   366         reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt')

   367         data = reader.read()

   368         reader.close()

   369         unicode_data = open('UnicodeData.txt', 'w+')

   370         unicode_data.write(data)

   371         unicode_data.seek(0)

   373     print('Generating...')

   374     generate_unicode_stuff(unicode_data,

   375         open('Unicode.cpp', 'w'),

   376         open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'),

   377         open('../tests/ecma_5/String/string-space-trim.js', 'w'))

The Tor Browser / file revision

js/src/vm/make_unicode.py@6474c204b198

js/src/vm/make_unicode.py