1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/js/src/vm/make_unicode.py Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,377 @@ 1.4 +# -*- coding: utf-8 -*- 1.5 +# Based upon makeunicodedata.py 1.6 +# (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py) 1.7 +# written by Fredrik Lundh (fredrik@pythonware.com) 1.8 +# 1.9 +# Copyright (C) 2011 Tom Schuster <evilpies@gmail.com> 1.10 +# 1.11 +# This program is free software: you can redistribute it and/or modify 1.12 +# it under the terms of the GNU General Public License as published by 1.13 +# the Free Software Foundation, either version 3 of the License, or 1.14 +# (at your option) any later version. 1.15 +# 1.16 +# This program is distributed in the hope that it will be useful, 1.17 +# but WITHOUT ANY WARRANTY; without even the implied warranty of 1.18 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1.19 +# GNU General Public License for more details. 1.20 +# 1.21 +# You should have received a copy of the GNU General Public License 1.22 +# along with this program. If not, see <http://www.gnu.org/licenses/>. 1.23 + 1.24 +from __future__ import print_function 1.25 +import csv 1.26 +import sys 1.27 + 1.28 +# ECMAScript 5 $ 7.2 1.29 +whitespace = [ 1.30 + # python doesn't support using control character names :( 1.31 + 0x9, # CHARACTER TABULATION 1.32 + 0xb, # LINE TABULATION 1.33 + 0xc, # FORM FEED 1.34 + ord(u'\N{SPACE}'), 1.35 + ord(u'\N{NO-BREAK SPACE}'), 1.36 + ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM 1.37 +] 1.38 + 1.39 +# $ 7.3 1.40 +line_terminator = [ 1.41 + 0xa, # LINE FEED 1.42 + 0xd, # CARRIAGE RETURN 1.43 + ord(u'\N{LINE SEPARATOR}'), 1.44 + ord(u'\N{PARAGRAPH SEPARATOR}'), 1.45 +] 1.46 + 1.47 +# These are also part of IdentifierPart $7.6 1.48 +ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}') 1.49 +ZWJ = ord(u'\N{ZERO WIDTH JOINER}') 1.50 + 1.51 +FLAG_SPACE = 1 << 0 1.52 +FLAG_LETTER = 1 << 1 1.53 +FLAG_IDENTIFIER_PART = 1 << 2 1.54 + 1.55 +MAX = 0xffff 1.56 + 1.57 +public_domain = """ 1.58 +/* 1.59 + * Any copyright is dedicated to the Public Domain. 1.60 + * http://creativecommons.org/licenses/publicdomain/ 1.61 + */ 1.62 +""" 1.63 + 1.64 +def read_unicode_data(unicode_file): 1.65 + """ 1.66 + If you want to understand how this wonderful file format works checkout 1.67 + Unicode Standard Annex #44 - Unicode Character Database 1.68 + http://www.unicode.org/reports/tr44/ 1.69 + """ 1.70 + 1.71 + reader = csv.reader(unicode_data, delimiter=';') 1.72 + 1.73 + while True: 1.74 + row = reader.next() 1.75 + name = row[1] 1.76 + 1.77 + # We need to expand the UAX #44 4.2.3 Code Point Range 1.78 + if name.startswith('<') and name.endswith('First>'): 1.79 + next_row = reader.next() 1.80 + 1.81 + for i in range(int(row[0], 16), int(next_row[0], 16) + 1): 1.82 + row[0] = i 1.83 + row[1] = name[1:-8] 1.84 + 1.85 + yield row 1.86 + else: 1.87 + row[0] = int(row[0], 16) 1.88 + yield row 1.89 + 1.90 +def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space): 1.91 + dummy = (0, 0, 0) 1.92 + table = [dummy] 1.93 + cache = {dummy: 0} 1.94 + index = [0] * (MAX + 1) 1.95 + test_table = {} 1.96 + test_space_table = [] 1.97 + 1.98 + for row in read_unicode_data(unicode_data): 1.99 + code = row[0] 1.100 + name = row[1] 1.101 + category = row[2] 1.102 + alias = row[-5] 1.103 + uppercase = row[-3] 1.104 + lowercase = row[-2] 1.105 + flags = 0 1.106 + 1.107 + if code > MAX: 1.108 + break 1.109 + 1.110 + # we combine whitespace and lineterminators because in pratice we don't need them separated 1.111 + if category == 'Zs' or code in whitespace or code in line_terminator: 1.112 + flags |= FLAG_SPACE 1.113 + test_space_table.append(code) 1.114 + if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter) 1.115 + flags |= FLAG_LETTER 1.116 + if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart) 1.117 + flags |= FLAG_IDENTIFIER_PART 1.118 + 1.119 + if uppercase: 1.120 + upper = int(uppercase, 16) 1.121 + else: 1.122 + upper = code 1.123 + 1.124 + if lowercase: 1.125 + lower = int(lowercase, 16) 1.126 + else: 1.127 + lower = code 1.128 + 1.129 + test_table[code] = (upper, lower, name, alias) 1.130 + 1.131 + up_d = upper - code 1.132 + low_d = lower - code 1.133 + 1.134 + assert up_d > -65535 and up_d < 65535 1.135 + assert low_d > -65535 and low_d < 65535 1.136 + 1.137 + upper = up_d & 0xffff 1.138 + lower = low_d & 0xffff 1.139 + 1.140 + item = (upper, lower, flags) 1.141 + 1.142 + i = cache.get(item) 1.143 + if i is None: 1.144 + assert item not in table 1.145 + cache[item] = i = len(table) 1.146 + table.append(item) 1.147 + index[code] = i 1.148 + 1.149 + test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') 1.150 + test_mapping.write(public_domain) 1.151 + test_mapping.write('var mapping = [\n') 1.152 + for code in range(0, MAX + 1): 1.153 + entry = test_table.get(code) 1.154 + 1.155 + if entry: 1.156 + upper, lower, name, alias = entry 1.157 + test_mapping.write(' [' + hex(upper) + ', ' + hex(lower) + '], /* ' + 1.158 + name + (' (' + alias + ')' if alias else '') + ' */\n') 1.159 + else: 1.160 + test_mapping.write(' [' + hex(code) + ', ' + hex(code) + '],\n') 1.161 + test_mapping.write('];') 1.162 + test_mapping.write(""" 1.163 +assertEq(mapping.length, 0x10000); 1.164 +for (var i = 0; i <= 0xffff; i++) { 1.165 + var char = String.fromCharCode(i); 1.166 + var info = mapping[i]; 1.167 + 1.168 + assertEq(char.toUpperCase().charCodeAt(0), info[0]); 1.169 + assertEq(char.toLowerCase().charCodeAt(0), info[1]); 1.170 +} 1.171 + 1.172 +if (typeof reportCompare === "function") 1.173 + reportCompare(true, true); 1.174 +""") 1.175 + 1.176 + test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') 1.177 + test_space.write(public_domain) 1.178 + test_space.write('var onlySpace = String.fromCharCode(' + 1.179 + ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n') 1.180 + test_space.write(""" 1.181 +assertEq(onlySpace.trim(), ""); 1.182 +assertEq((onlySpace + 'aaaa').trim(), 'aaaa'); 1.183 +assertEq(('aaaa' + onlySpace).trim(), 'aaaa'); 1.184 +assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa'); 1.185 + 1.186 +if (typeof reportCompare === "function") 1.187 + reportCompare(true, true); 1.188 +""") 1.189 + 1.190 + index1, index2, shift = splitbins(index) 1.191 + 1.192 + # Don't forget to update CharInfo in Unicode.cpp if you need to change this 1.193 + assert shift == 5 1.194 + 1.195 + # verify correctness 1.196 + for char in index: 1.197 + test = table[index[char]] 1.198 + 1.199 + idx = index1[char >> shift] 1.200 + idx = index2[(idx << shift) + (char & ((1 << shift) - 1))] 1.201 + 1.202 + assert test == table[idx] 1.203 + 1.204 + 1.205 + comment = """ 1.206 +/* 1.207 + * So how does indexing work? 1.208 + * First let's have a look at a jschar, 16-bits: 1.209 + * [................] 1.210 + * Step 1: 1.211 + * Extracting the upper 11 bits from the jschar. 1.212 + * upper = char >> 5 ([***********.....]) 1.213 + * Step 2: 1.214 + * Using these bits to get an reduced index from index1. 1.215 + * index = index1[upper] 1.216 + * Step 3: 1.217 + * Combining the index and the bottom 5 bits of the original jschar. 1.218 + * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++]) 1.219 + * 1.220 + * The advantage here is that the biggest number in index1 doesn't need 10 bits, 1.221 + * but 7 and we save some memory. 1.222 + * 1.223 + * Step 4: 1.224 + * Get the character informations by looking up real_index in js_charinfo. 1.225 + * 1.226 + * Pseudocode of generation: 1.227 + * 1.228 + * let table be the mapping of jschar => js_charinfo_index 1.229 + * let index1 be an empty array 1.230 + * let index2 be an empty array 1.231 + * let cache be a hash map 1.232 + * 1.233 + * while shift is less then maximal amount you can shift 0xffff before it's 0 1.234 + * let chunks be table split in chunks of size 2**shift 1.235 + * 1.236 + * for every chunk in chunks 1.237 + * if chunk is in cache 1.238 + * let index be cache[chunk] 1.239 + * else 1.240 + * let index be the max key of index2 + 1 1.241 + * for element in chunk 1.242 + * push element to index2 1.243 + * put index as chunk in cache 1.244 + * 1.245 + * push index >> shift to index1 1.246 + * 1.247 + * increase shift 1.248 + * stop if you found the best shift 1.249 + */ 1.250 +""" 1.251 + data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') 1.252 + data_file.write(public_domain) 1.253 + data_file.write('#include "vm/Unicode.h"\n\n') 1.254 + data_file.write('using namespace js;\n') 1.255 + data_file.write('using namespace js::unicode;\n') 1.256 + data_file.write(comment) 1.257 + data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n') 1.258 + for d in table: 1.259 + data_file.write(' {') 1.260 + data_file.write(', '.join((str(e) for e in d))) 1.261 + data_file.write('},\n') 1.262 + data_file.write('};\n') 1.263 + data_file.write('\n') 1.264 + 1.265 + def dump(data, name, file): 1.266 + file.write('const uint8_t unicode::' + name + '[] = {\n') 1.267 + 1.268 + line = pad = ' ' * 4 1.269 + lines = [] 1.270 + for entry in data: 1.271 + assert entry < 256 1.272 + s = str(entry) 1.273 + s = s.rjust(3) 1.274 + 1.275 + if len(line + s) + 5 > 99: 1.276 + lines.append(line.rstrip()) 1.277 + line = pad + s + ', ' 1.278 + else: 1.279 + line = line + s + ', ' 1.280 + lines.append(line.rstrip()) 1.281 + 1.282 + file.write('\n'.join(lines)) 1.283 + file.write('\n};\n') 1.284 + 1.285 + dump(index1, 'index1', data_file) 1.286 + data_file.write('\n') 1.287 + dump(index2, 'index2', data_file) 1.288 + data_file.write('\n') 1.289 + 1.290 + data_file.write('\n') 1.291 + 1.292 +def getsize(data): 1.293 + """ return smallest possible integer size for the given array """ 1.294 + maxdata = max(data) 1.295 + assert maxdata < 2**32 1.296 + 1.297 + if maxdata < 256: 1.298 + return 1 1.299 + elif maxdata < 65536: 1.300 + return 2 1.301 + else: 1.302 + return 4 1.303 + 1.304 +def splitbins(t): 1.305 + """t -> (t1, t2, shift). Split a table to save space. 1.306 + 1.307 + t is a sequence of ints. This function can be useful to save space if 1.308 + many of the ints are the same. t1 and t2 are lists of ints, and shift 1.309 + is an int, chosen to minimize the combined size of t1 and t2 (in C 1.310 + code), and where for each i in range(len(t)), 1.311 + t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 1.312 + where mask is a bitmask isolating the last "shift" bits. 1.313 + """ 1.314 + 1.315 + def dump(t1, t2, shift, bytes): 1.316 + print("%d+%d bins at shift %d; %d bytes" % ( 1.317 + len(t1), len(t2), shift, bytes), file=sys.stderr) 1.318 + print("Size of original table:", len(t)*getsize(t), \ 1.319 + "bytes", file=sys.stderr) 1.320 + n = len(t)-1 # last valid index 1.321 + maxshift = 0 # the most we can shift n and still have something left 1.322 + if n > 0: 1.323 + while n >> 1: 1.324 + n >>= 1 1.325 + maxshift += 1 1.326 + del n 1.327 + bytes = sys.maxsize # smallest total size so far 1.328 + t = tuple(t) # so slices can be dict keys 1.329 + for shift in range(maxshift + 1): 1.330 + t1 = [] 1.331 + t2 = [] 1.332 + size = 2**shift 1.333 + bincache = {} 1.334 + 1.335 + for i in range(0, len(t), size): 1.336 + bin = t[i:i + size] 1.337 + 1.338 + index = bincache.get(bin) 1.339 + if index is None: 1.340 + index = len(t2) 1.341 + bincache[bin] = index 1.342 + t2.extend(bin) 1.343 + t1.append(index >> shift) 1.344 + 1.345 + # determine memory size 1.346 + b = len(t1) * getsize(t1) + len(t2) * getsize(t2) 1.347 + if b < bytes: 1.348 + best = t1, t2, shift 1.349 + bytes = b 1.350 + t1, t2, shift = best 1.351 + 1.352 + print("Best:", end=' ', file=sys.stderr) 1.353 + dump(t1, t2, shift, bytes) 1.354 + 1.355 + # exhaustively verify that the decomposition is correct 1.356 + mask = 2**shift - 1 1.357 + for i in range(len(t)): 1.358 + assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 1.359 + return best 1.360 + 1.361 +if __name__ == '__main__': 1.362 + import urllib2 1.363 + 1.364 + if len(sys.argv) > 1: 1.365 + print('Always make sure you have the newest UnicodeData.txt!') 1.366 + unicode_data = open(sys.argv[1], 'r') 1.367 + else: 1.368 + print('Downloading...') 1.369 + reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt') 1.370 + data = reader.read() 1.371 + reader.close() 1.372 + unicode_data = open('UnicodeData.txt', 'w+') 1.373 + unicode_data.write(data) 1.374 + unicode_data.seek(0) 1.375 + 1.376 + print('Generating...') 1.377 + generate_unicode_stuff(unicode_data, 1.378 + open('Unicode.cpp', 'w'), 1.379 + open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'), 1.380 + open('../tests/ecma_5/String/string-space-trim.js', 'w'))