js/src/vm/make_unicode.py

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

michael@0 1 # -*- coding: utf-8 -*-
michael@0 2 # Based upon makeunicodedata.py
michael@0 3 # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
michael@0 4 # written by Fredrik Lundh (fredrik@pythonware.com)
michael@0 5 #
michael@0 6 # Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
michael@0 7 #
michael@0 8 # This program is free software: you can redistribute it and/or modify
michael@0 9 # it under the terms of the GNU General Public License as published by
michael@0 10 # the Free Software Foundation, either version 3 of the License, or
michael@0 11 # (at your option) any later version.
michael@0 12 #
michael@0 13 # This program is distributed in the hope that it will be useful,
michael@0 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
michael@0 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
michael@0 16 # GNU General Public License for more details.
michael@0 17 #
michael@0 18 # You should have received a copy of the GNU General Public License
michael@0 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
michael@0 20
michael@0 21 from __future__ import print_function
michael@0 22 import csv
michael@0 23 import sys
michael@0 24
michael@0 25 # ECMAScript 5 $ 7.2
michael@0 26 whitespace = [
michael@0 27 # python doesn't support using control character names :(
michael@0 28 0x9, # CHARACTER TABULATION
michael@0 29 0xb, # LINE TABULATION
michael@0 30 0xc, # FORM FEED
michael@0 31 ord(u'\N{SPACE}'),
michael@0 32 ord(u'\N{NO-BREAK SPACE}'),
michael@0 33 ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM
michael@0 34 ]
michael@0 35
michael@0 36 # $ 7.3
michael@0 37 line_terminator = [
michael@0 38 0xa, # LINE FEED
michael@0 39 0xd, # CARRIAGE RETURN
michael@0 40 ord(u'\N{LINE SEPARATOR}'),
michael@0 41 ord(u'\N{PARAGRAPH SEPARATOR}'),
michael@0 42 ]
michael@0 43
michael@0 44 # These are also part of IdentifierPart $7.6
michael@0 45 ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}')
michael@0 46 ZWJ = ord(u'\N{ZERO WIDTH JOINER}')
michael@0 47
michael@0 48 FLAG_SPACE = 1 << 0
michael@0 49 FLAG_LETTER = 1 << 1
michael@0 50 FLAG_IDENTIFIER_PART = 1 << 2
michael@0 51
michael@0 52 MAX = 0xffff
michael@0 53
michael@0 54 public_domain = """
michael@0 55 /*
michael@0 56 * Any copyright is dedicated to the Public Domain.
michael@0 57 * http://creativecommons.org/licenses/publicdomain/
michael@0 58 */
michael@0 59 """
michael@0 60
michael@0 61 def read_unicode_data(unicode_file):
michael@0 62 """
michael@0 63 If you want to understand how this wonderful file format works checkout
michael@0 64 Unicode Standard Annex #44 - Unicode Character Database
michael@0 65 http://www.unicode.org/reports/tr44/
michael@0 66 """
michael@0 67
michael@0 68 reader = csv.reader(unicode_data, delimiter=';')
michael@0 69
michael@0 70 while True:
michael@0 71 row = reader.next()
michael@0 72 name = row[1]
michael@0 73
michael@0 74 # We need to expand the UAX #44 4.2.3 Code Point Range
michael@0 75 if name.startswith('<') and name.endswith('First>'):
michael@0 76 next_row = reader.next()
michael@0 77
michael@0 78 for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
michael@0 79 row[0] = i
michael@0 80 row[1] = name[1:-8]
michael@0 81
michael@0 82 yield row
michael@0 83 else:
michael@0 84 row[0] = int(row[0], 16)
michael@0 85 yield row
michael@0 86
michael@0 87 def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space):
michael@0 88 dummy = (0, 0, 0)
michael@0 89 table = [dummy]
michael@0 90 cache = {dummy: 0}
michael@0 91 index = [0] * (MAX + 1)
michael@0 92 test_table = {}
michael@0 93 test_space_table = []
michael@0 94
michael@0 95 for row in read_unicode_data(unicode_data):
michael@0 96 code = row[0]
michael@0 97 name = row[1]
michael@0 98 category = row[2]
michael@0 99 alias = row[-5]
michael@0 100 uppercase = row[-3]
michael@0 101 lowercase = row[-2]
michael@0 102 flags = 0
michael@0 103
michael@0 104 if code > MAX:
michael@0 105 break
michael@0 106
michael@0 107 # we combine whitespace and lineterminators because in pratice we don't need them separated
michael@0 108 if category == 'Zs' or code in whitespace or code in line_terminator:
michael@0 109 flags |= FLAG_SPACE
michael@0 110 test_space_table.append(code)
michael@0 111 if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)
michael@0 112 flags |= FLAG_LETTER
michael@0 113 if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)
michael@0 114 flags |= FLAG_IDENTIFIER_PART
michael@0 115
michael@0 116 if uppercase:
michael@0 117 upper = int(uppercase, 16)
michael@0 118 else:
michael@0 119 upper = code
michael@0 120
michael@0 121 if lowercase:
michael@0 122 lower = int(lowercase, 16)
michael@0 123 else:
michael@0 124 lower = code
michael@0 125
michael@0 126 test_table[code] = (upper, lower, name, alias)
michael@0 127
michael@0 128 up_d = upper - code
michael@0 129 low_d = lower - code
michael@0 130
michael@0 131 assert up_d > -65535 and up_d < 65535
michael@0 132 assert low_d > -65535 and low_d < 65535
michael@0 133
michael@0 134 upper = up_d & 0xffff
michael@0 135 lower = low_d & 0xffff
michael@0 136
michael@0 137 item = (upper, lower, flags)
michael@0 138
michael@0 139 i = cache.get(item)
michael@0 140 if i is None:
michael@0 141 assert item not in table
michael@0 142 cache[item] = i = len(table)
michael@0 143 table.append(item)
michael@0 144 index[code] = i
michael@0 145
michael@0 146 test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
michael@0 147 test_mapping.write(public_domain)
michael@0 148 test_mapping.write('var mapping = [\n')
michael@0 149 for code in range(0, MAX + 1):
michael@0 150 entry = test_table.get(code)
michael@0 151
michael@0 152 if entry:
michael@0 153 upper, lower, name, alias = entry
michael@0 154 test_mapping.write(' [' + hex(upper) + ', ' + hex(lower) + '], /* ' +
michael@0 155 name + (' (' + alias + ')' if alias else '') + ' */\n')
michael@0 156 else:
michael@0 157 test_mapping.write(' [' + hex(code) + ', ' + hex(code) + '],\n')
michael@0 158 test_mapping.write('];')
michael@0 159 test_mapping.write("""
michael@0 160 assertEq(mapping.length, 0x10000);
michael@0 161 for (var i = 0; i <= 0xffff; i++) {
michael@0 162 var char = String.fromCharCode(i);
michael@0 163 var info = mapping[i];
michael@0 164
michael@0 165 assertEq(char.toUpperCase().charCodeAt(0), info[0]);
michael@0 166 assertEq(char.toLowerCase().charCodeAt(0), info[1]);
michael@0 167 }
michael@0 168
michael@0 169 if (typeof reportCompare === "function")
michael@0 170 reportCompare(true, true);
michael@0 171 """)
michael@0 172
michael@0 173 test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
michael@0 174 test_space.write(public_domain)
michael@0 175 test_space.write('var onlySpace = String.fromCharCode(' +
michael@0 176 ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n')
michael@0 177 test_space.write("""
michael@0 178 assertEq(onlySpace.trim(), "");
michael@0 179 assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
michael@0 180 assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
michael@0 181 assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');
michael@0 182
michael@0 183 if (typeof reportCompare === "function")
michael@0 184 reportCompare(true, true);
michael@0 185 """)
michael@0 186
michael@0 187 index1, index2, shift = splitbins(index)
michael@0 188
michael@0 189 # Don't forget to update CharInfo in Unicode.cpp if you need to change this
michael@0 190 assert shift == 5
michael@0 191
michael@0 192 # verify correctness
michael@0 193 for char in index:
michael@0 194 test = table[index[char]]
michael@0 195
michael@0 196 idx = index1[char >> shift]
michael@0 197 idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]
michael@0 198
michael@0 199 assert test == table[idx]
michael@0 200
michael@0 201
michael@0 202 comment = """
michael@0 203 /*
michael@0 204 * So how does indexing work?
michael@0 205 * First let's have a look at a jschar, 16-bits:
michael@0 206 * [................]
michael@0 207 * Step 1:
michael@0 208 * Extracting the upper 11 bits from the jschar.
michael@0 209 * upper = char >> 5 ([***********.....])
michael@0 210 * Step 2:
michael@0 211 * Using these bits to get an reduced index from index1.
michael@0 212 * index = index1[upper]
michael@0 213 * Step 3:
michael@0 214 * Combining the index and the bottom 5 bits of the original jschar.
michael@0 215 * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
michael@0 216 *
michael@0 217 * The advantage here is that the biggest number in index1 doesn't need 10 bits,
michael@0 218 * but 7 and we save some memory.
michael@0 219 *
michael@0 220 * Step 4:
michael@0 221 * Get the character informations by looking up real_index in js_charinfo.
michael@0 222 *
michael@0 223 * Pseudocode of generation:
michael@0 224 *
michael@0 225 * let table be the mapping of jschar => js_charinfo_index
michael@0 226 * let index1 be an empty array
michael@0 227 * let index2 be an empty array
michael@0 228 * let cache be a hash map
michael@0 229 *
michael@0 230 * while shift is less then maximal amount you can shift 0xffff before it's 0
michael@0 231 * let chunks be table split in chunks of size 2**shift
michael@0 232 *
michael@0 233 * for every chunk in chunks
michael@0 234 * if chunk is in cache
michael@0 235 * let index be cache[chunk]
michael@0 236 * else
michael@0 237 * let index be the max key of index2 + 1
michael@0 238 * for element in chunk
michael@0 239 * push element to index2
michael@0 240 * put index as chunk in cache
michael@0 241 *
michael@0 242 * push index >> shift to index1
michael@0 243 *
michael@0 244 * increase shift
michael@0 245 * stop if you found the best shift
michael@0 246 */
michael@0 247 """
michael@0 248 data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
michael@0 249 data_file.write(public_domain)
michael@0 250 data_file.write('#include "vm/Unicode.h"\n\n')
michael@0 251 data_file.write('using namespace js;\n')
michael@0 252 data_file.write('using namespace js::unicode;\n')
michael@0 253 data_file.write(comment)
michael@0 254 data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n')
michael@0 255 for d in table:
michael@0 256 data_file.write(' {')
michael@0 257 data_file.write(', '.join((str(e) for e in d)))
michael@0 258 data_file.write('},\n')
michael@0 259 data_file.write('};\n')
michael@0 260 data_file.write('\n')
michael@0 261
michael@0 262 def dump(data, name, file):
michael@0 263 file.write('const uint8_t unicode::' + name + '[] = {\n')
michael@0 264
michael@0 265 line = pad = ' ' * 4
michael@0 266 lines = []
michael@0 267 for entry in data:
michael@0 268 assert entry < 256
michael@0 269 s = str(entry)
michael@0 270 s = s.rjust(3)
michael@0 271
michael@0 272 if len(line + s) + 5 > 99:
michael@0 273 lines.append(line.rstrip())
michael@0 274 line = pad + s + ', '
michael@0 275 else:
michael@0 276 line = line + s + ', '
michael@0 277 lines.append(line.rstrip())
michael@0 278
michael@0 279 file.write('\n'.join(lines))
michael@0 280 file.write('\n};\n')
michael@0 281
michael@0 282 dump(index1, 'index1', data_file)
michael@0 283 data_file.write('\n')
michael@0 284 dump(index2, 'index2', data_file)
michael@0 285 data_file.write('\n')
michael@0 286
michael@0 287 data_file.write('\n')
michael@0 288
michael@0 289 def getsize(data):
michael@0 290 """ return smallest possible integer size for the given array """
michael@0 291 maxdata = max(data)
michael@0 292 assert maxdata < 2**32
michael@0 293
michael@0 294 if maxdata < 256:
michael@0 295 return 1
michael@0 296 elif maxdata < 65536:
michael@0 297 return 2
michael@0 298 else:
michael@0 299 return 4
michael@0 300
michael@0 301 def splitbins(t):
michael@0 302 """t -> (t1, t2, shift). Split a table to save space.
michael@0 303
michael@0 304 t is a sequence of ints. This function can be useful to save space if
michael@0 305 many of the ints are the same. t1 and t2 are lists of ints, and shift
michael@0 306 is an int, chosen to minimize the combined size of t1 and t2 (in C
michael@0 307 code), and where for each i in range(len(t)),
michael@0 308 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
michael@0 309 where mask is a bitmask isolating the last "shift" bits.
michael@0 310 """
michael@0 311
michael@0 312 def dump(t1, t2, shift, bytes):
michael@0 313 print("%d+%d bins at shift %d; %d bytes" % (
michael@0 314 len(t1), len(t2), shift, bytes), file=sys.stderr)
michael@0 315 print("Size of original table:", len(t)*getsize(t), \
michael@0 316 "bytes", file=sys.stderr)
michael@0 317 n = len(t)-1 # last valid index
michael@0 318 maxshift = 0 # the most we can shift n and still have something left
michael@0 319 if n > 0:
michael@0 320 while n >> 1:
michael@0 321 n >>= 1
michael@0 322 maxshift += 1
michael@0 323 del n
michael@0 324 bytes = sys.maxsize # smallest total size so far
michael@0 325 t = tuple(t) # so slices can be dict keys
michael@0 326 for shift in range(maxshift + 1):
michael@0 327 t1 = []
michael@0 328 t2 = []
michael@0 329 size = 2**shift
michael@0 330 bincache = {}
michael@0 331
michael@0 332 for i in range(0, len(t), size):
michael@0 333 bin = t[i:i + size]
michael@0 334
michael@0 335 index = bincache.get(bin)
michael@0 336 if index is None:
michael@0 337 index = len(t2)
michael@0 338 bincache[bin] = index
michael@0 339 t2.extend(bin)
michael@0 340 t1.append(index >> shift)
michael@0 341
michael@0 342 # determine memory size
michael@0 343 b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
michael@0 344 if b < bytes:
michael@0 345 best = t1, t2, shift
michael@0 346 bytes = b
michael@0 347 t1, t2, shift = best
michael@0 348
michael@0 349 print("Best:", end=' ', file=sys.stderr)
michael@0 350 dump(t1, t2, shift, bytes)
michael@0 351
michael@0 352 # exhaustively verify that the decomposition is correct
michael@0 353 mask = 2**shift - 1
michael@0 354 for i in range(len(t)):
michael@0 355 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
michael@0 356 return best
michael@0 357
michael@0 358 if __name__ == '__main__':
michael@0 359 import urllib2
michael@0 360
michael@0 361 if len(sys.argv) > 1:
michael@0 362 print('Always make sure you have the newest UnicodeData.txt!')
michael@0 363 unicode_data = open(sys.argv[1], 'r')
michael@0 364 else:
michael@0 365 print('Downloading...')
michael@0 366 reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt')
michael@0 367 data = reader.read()
michael@0 368 reader.close()
michael@0 369 unicode_data = open('UnicodeData.txt', 'w+')
michael@0 370 unicode_data.write(data)
michael@0 371 unicode_data.seek(0)
michael@0 372
michael@0 373 print('Generating...')
michael@0 374 generate_unicode_stuff(unicode_data,
michael@0 375 open('Unicode.cpp', 'w'),
michael@0 376 open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'),
michael@0 377 open('../tests/ecma_5/String/string-space-trim.js', 'w'))

mercurial