michael@0: # -*- coding: utf-8 -*-
michael@0: # Based upon makeunicodedata.py
michael@0: # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
michael@0: # written by Fredrik Lundh (fredrik@pythonware.com)
michael@0: #
michael@0: #    Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
michael@0: #
michael@0: #    This program is free software: you can redistribute it and/or modify
michael@0: #    it under the terms of the GNU General Public License as published by
michael@0: #    the Free Software Foundation, either version 3 of the License, or
michael@0: #    (at your option) any later version.
michael@0: #
michael@0: #    This program is distributed in the hope that it will be useful,
michael@0: #    but WITHOUT ANY WARRANTY; without even the implied warranty of
michael@0: #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
michael@0: #    GNU General Public License for more details.
michael@0: #
michael@0: #    You should have received a copy of the GNU General Public License
michael@0: #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
michael@0: 
michael@0: from __future__ import print_function
michael@0: import csv
michael@0: import sys
michael@0: 
michael@0: # ECMAScript 5 $ 7.2
michael@0: whitespace = [
michael@0:     # python doesn't support using control character names :(
michael@0:     0x9, # CHARACTER TABULATION
michael@0:     0xb, # LINE TABULATION
michael@0:     0xc, # FORM FEED
michael@0:     ord(u'\N{SPACE}'),
michael@0:     ord(u'\N{NO-BREAK SPACE}'),
michael@0:     ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM
michael@0: ]
michael@0: 
michael@0: # $ 7.3
michael@0: line_terminator = [
michael@0:     0xa, # LINE FEED
michael@0:     0xd, # CARRIAGE RETURN
michael@0:     ord(u'\N{LINE SEPARATOR}'),
michael@0:     ord(u'\N{PARAGRAPH SEPARATOR}'),
michael@0: ]
michael@0: 
michael@0: # These are also part of IdentifierPart $7.6
michael@0: ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}')
michael@0: ZWJ = ord(u'\N{ZERO WIDTH JOINER}')
michael@0: 
michael@0: FLAG_SPACE = 1 << 0
michael@0: FLAG_LETTER = 1 << 1
michael@0: FLAG_IDENTIFIER_PART = 1 << 2
michael@0: 
michael@0: MAX = 0xffff
michael@0: 
michael@0: public_domain = """
michael@0: /*
michael@0:  * Any copyright is dedicated to the Public Domain.
michael@0:  * http://creativecommons.org/licenses/publicdomain/
michael@0:  */
michael@0: """
michael@0: 
michael@0: def read_unicode_data(unicode_file):
michael@0:     """
michael@0:         If you want to understand how this wonderful file format works checkout
michael@0:           Unicode Standard Annex #44 - Unicode Character Database
michael@0:           http://www.unicode.org/reports/tr44/
michael@0:     """
michael@0: 
michael@0:     reader = csv.reader(unicode_data, delimiter=';')
michael@0: 
michael@0:     while True:
michael@0:         row = reader.next()
michael@0:         name = row[1]
michael@0: 
michael@0:         # We need to expand the UAX #44 4.2.3 Code Point Range
michael@0:         if name.startswith('<') and name.endswith('First>'):
michael@0:             next_row = reader.next()
michael@0: 
michael@0:             for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
michael@0:                 row[0] = i
michael@0:                 row[1] = name[1:-8]
michael@0: 
michael@0:                 yield row
michael@0:         else:
michael@0:             row[0] = int(row[0], 16)
michael@0:             yield row
michael@0: 
michael@0: def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space):
michael@0:     dummy = (0, 0, 0)
michael@0:     table = [dummy]
michael@0:     cache = {dummy: 0}
michael@0:     index = [0] * (MAX + 1)
michael@0:     test_table = {}
michael@0:     test_space_table = []
michael@0: 
michael@0:     for row in read_unicode_data(unicode_data):
michael@0:         code = row[0]
michael@0:         name = row[1]
michael@0:         category = row[2]
michael@0:         alias = row[-5]
michael@0:         uppercase = row[-3]
michael@0:         lowercase = row[-2]
michael@0:         flags = 0
michael@0: 
michael@0:         if code > MAX:
michael@0:             break
michael@0: 
michael@0:         # we combine whitespace and lineterminators because in pratice we don't need them separated
michael@0:         if category == 'Zs' or code in whitespace or code in line_terminator:
michael@0:             flags |= FLAG_SPACE
michael@0:             test_space_table.append(code)
michael@0:         if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)
michael@0:             flags |= FLAG_LETTER
michael@0:         if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)
michael@0:             flags |= FLAG_IDENTIFIER_PART
michael@0: 
michael@0:         if uppercase:
michael@0:             upper = int(uppercase, 16)
michael@0:         else:
michael@0:             upper = code
michael@0: 
michael@0:         if lowercase:
michael@0:             lower = int(lowercase, 16)
michael@0:         else:
michael@0:             lower = code
michael@0: 
michael@0:         test_table[code] = (upper, lower, name, alias)
michael@0: 
michael@0:         up_d = upper - code
michael@0:         low_d = lower - code
michael@0: 
michael@0:         assert up_d > -65535 and up_d < 65535
michael@0:         assert low_d > -65535 and low_d < 65535
michael@0: 
michael@0:         upper = up_d & 0xffff
michael@0:         lower = low_d & 0xffff
michael@0: 
michael@0:         item = (upper, lower, flags)
michael@0: 
michael@0:         i = cache.get(item)
michael@0:         if i is None:
michael@0:             assert item not in table
michael@0:             cache[item] = i = len(table)
michael@0:             table.append(item)
michael@0:         index[code] = i
michael@0: 
michael@0:     test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
michael@0:     test_mapping.write(public_domain)
michael@0:     test_mapping.write('var mapping = [\n')
michael@0:     for code in range(0, MAX + 1):
michael@0:         entry = test_table.get(code)
michael@0: 
michael@0:         if entry:
michael@0:             upper, lower, name, alias = entry
michael@0:             test_mapping.write('  [' + hex(upper) + ', ' + hex(lower) + '], /* ' +
michael@0:                        name + (' (' + alias + ')' if alias else '') + ' */\n')
michael@0:         else:
michael@0:             test_mapping.write('  [' + hex(code) + ', ' + hex(code) + '],\n')
michael@0:     test_mapping.write('];')
michael@0:     test_mapping.write("""
michael@0: assertEq(mapping.length, 0x10000);
michael@0: for (var i = 0; i <= 0xffff; i++) {
michael@0:     var char = String.fromCharCode(i);
michael@0:     var info = mapping[i];
michael@0: 
michael@0:     assertEq(char.toUpperCase().charCodeAt(0), info[0]);
michael@0:     assertEq(char.toLowerCase().charCodeAt(0), info[1]);
michael@0: }
michael@0: 
michael@0: if (typeof reportCompare === "function")
michael@0:     reportCompare(true, true);
michael@0: """)
michael@0: 
michael@0:     test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
michael@0:     test_space.write(public_domain)
michael@0:     test_space.write('var onlySpace = String.fromCharCode(' +
michael@0:                      ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n')
michael@0:     test_space.write("""
michael@0: assertEq(onlySpace.trim(), "");
michael@0: assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
michael@0: assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
michael@0: assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');
michael@0: 
michael@0: if (typeof reportCompare === "function")
michael@0:     reportCompare(true, true);
michael@0: """)
michael@0: 
michael@0:     index1, index2, shift = splitbins(index)
michael@0: 
michael@0:     # Don't forget to update CharInfo in Unicode.cpp if you need to change this
michael@0:     assert shift == 5
michael@0: 
michael@0:     # verify correctness
michael@0:     for char in index:
michael@0:         test = table[index[char]]
michael@0: 
michael@0:         idx = index1[char >> shift]
michael@0:         idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]
michael@0: 
michael@0:         assert test == table[idx]
michael@0: 
michael@0: 
michael@0:     comment = """
michael@0: /*
michael@0:  * So how does indexing work?
michael@0:  * First let's have a look at a jschar, 16-bits:
michael@0:  *              [................]
michael@0:  * Step 1:
michael@0:  *  Extracting the upper 11 bits from the jschar.
michael@0:  *   upper = char >>  5 ([***********.....])
michael@0:  * Step 2:
michael@0:  *  Using these bits to get an reduced index from index1.
michael@0:  *   index = index1[upper]
michael@0:  * Step 3:
michael@0:  *  Combining the index and the bottom 5 bits of the original jschar.
michael@0:  *   real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
michael@0:  *
michael@0:  * The advantage here is that the biggest number in index1 doesn't need 10 bits,
michael@0:  * but 7 and we save some memory.
michael@0:  *
michael@0:  * Step 4:
michael@0:  *  Get the character informations by looking up real_index in js_charinfo.
michael@0:  *
michael@0:  * Pseudocode of generation:
michael@0:  *
michael@0:  * let table be the mapping of jschar => js_charinfo_index
michael@0:  * let index1 be an empty array
michael@0:  * let index2 be an empty array
michael@0:  * let cache be a hash map
michael@0:  *
michael@0:  * while shift is less then maximal amount you can shift 0xffff before it's 0
michael@0:  *  let chunks be table split in chunks of size 2**shift
michael@0:  *
michael@0:  *  for every chunk in chunks
michael@0:  *   if chunk is in cache
michael@0:  *    let index be cache[chunk]
michael@0:  *   else
michael@0:  *    let index be the max key of index2 + 1
michael@0:  *    for element in chunk
michael@0:  *     push element to index2
michael@0:  *    put index as chunk in cache
michael@0:  *
michael@0:  *   push index >> shift to index1
michael@0:  *
michael@0:  *  increase shift
michael@0:  *  stop if you found the best shift
michael@0:  */
michael@0: """
michael@0:     data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
michael@0:     data_file.write(public_domain)
michael@0:     data_file.write('#include "vm/Unicode.h"\n\n')
michael@0:     data_file.write('using namespace js;\n')
michael@0:     data_file.write('using namespace js::unicode;\n')
michael@0:     data_file.write(comment)
michael@0:     data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n')
michael@0:     for d in table:
michael@0:         data_file.write('    {')
michael@0:         data_file.write(', '.join((str(e) for e in d)))
michael@0:         data_file.write('},\n')
michael@0:     data_file.write('};\n')
michael@0:     data_file.write('\n')
michael@0: 
michael@0:     def dump(data, name, file):
michael@0:         file.write('const uint8_t unicode::' + name + '[] = {\n')
michael@0: 
michael@0:         line = pad = ' ' * 4
michael@0:         lines = []
michael@0:         for entry in data:
michael@0:             assert entry < 256
michael@0:             s = str(entry)
michael@0:             s = s.rjust(3)
michael@0: 
michael@0:             if len(line + s) + 5 > 99:
michael@0:                 lines.append(line.rstrip())
michael@0:                 line = pad + s + ', '
michael@0:             else:
michael@0:                 line = line + s + ', '
michael@0:         lines.append(line.rstrip())
michael@0: 
michael@0:         file.write('\n'.join(lines))
michael@0:         file.write('\n};\n')
michael@0: 
michael@0:     dump(index1, 'index1', data_file)
michael@0:     data_file.write('\n')
michael@0:     dump(index2, 'index2', data_file)
michael@0:     data_file.write('\n')
michael@0: 
michael@0:     data_file.write('\n')
michael@0: 
michael@0: def getsize(data):
michael@0:     """ return smallest possible integer size for the given array """
michael@0:     maxdata = max(data)
michael@0:     assert maxdata < 2**32
michael@0: 
michael@0:     if maxdata < 256:
michael@0:         return 1
michael@0:     elif maxdata < 65536:
michael@0:         return 2
michael@0:     else:
michael@0:         return 4
michael@0: 
michael@0: def splitbins(t):
michael@0:     """t -> (t1, t2, shift).  Split a table to save space.
michael@0: 
michael@0:     t is a sequence of ints.  This function can be useful to save space if
michael@0:     many of the ints are the same.  t1 and t2 are lists of ints, and shift
michael@0:     is an int, chosen to minimize the combined size of t1 and t2 (in C
michael@0:     code), and where for each i in range(len(t)),
michael@0:         t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
michael@0:     where mask is a bitmask isolating the last "shift" bits.
michael@0:     """
michael@0: 
michael@0:     def dump(t1, t2, shift, bytes):
michael@0:         print("%d+%d bins at shift %d; %d bytes" % (
michael@0:             len(t1), len(t2), shift, bytes), file=sys.stderr)
michael@0:         print("Size of original table:", len(t)*getsize(t), \
michael@0:             "bytes", file=sys.stderr)
michael@0:     n = len(t)-1    # last valid index
michael@0:     maxshift = 0    # the most we can shift n and still have something left
michael@0:     if n > 0:
michael@0:         while n >> 1:
michael@0:             n >>= 1
michael@0:             maxshift += 1
michael@0:     del n
michael@0:     bytes = sys.maxsize  # smallest total size so far
michael@0:     t = tuple(t)    # so slices can be dict keys
michael@0:     for shift in range(maxshift + 1):
michael@0:         t1 = []
michael@0:         t2 = []
michael@0:         size = 2**shift
michael@0:         bincache = {}
michael@0: 
michael@0:         for i in range(0, len(t), size):
michael@0:             bin = t[i:i + size]
michael@0: 
michael@0:             index = bincache.get(bin)
michael@0:             if index is None:
michael@0:                 index = len(t2)
michael@0:                 bincache[bin] = index
michael@0:                 t2.extend(bin)
michael@0:             t1.append(index >> shift)
michael@0: 
michael@0:         # determine memory size
michael@0:         b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
michael@0:         if b < bytes:
michael@0:             best = t1, t2, shift
michael@0:             bytes = b
michael@0:     t1, t2, shift = best
michael@0: 
michael@0:     print("Best:", end=' ', file=sys.stderr)
michael@0:     dump(t1, t2, shift, bytes)
michael@0: 
michael@0:     # exhaustively verify that the decomposition is correct
michael@0:     mask = 2**shift - 1
michael@0:     for i in range(len(t)):
michael@0:         assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
michael@0:     return best
michael@0: 
michael@0: if __name__ == '__main__':
michael@0:     import urllib2
michael@0: 
michael@0:     if len(sys.argv) > 1:
michael@0:         print('Always make sure you have the newest UnicodeData.txt!')
michael@0:         unicode_data = open(sys.argv[1], 'r')
michael@0:     else:
michael@0:         print('Downloading...')
michael@0:         reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt')
michael@0:         data = reader.read()
michael@0:         reader.close()
michael@0:         unicode_data = open('UnicodeData.txt', 'w+')
michael@0:         unicode_data.write(data)
michael@0:         unicode_data.seek(0)
michael@0: 
michael@0:     print('Generating...')
michael@0:     generate_unicode_stuff(unicode_data,
michael@0:         open('Unicode.cpp', 'w'),
michael@0:         open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'),
michael@0:         open('../tests/ecma_5/String/string-space-trim.js', 'w'))