Sat, 03 Jan 2015 20:18:00 +0100
Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.
michael@0 | 1 | # -*- coding: utf-8 -*- |
michael@0 | 2 | # Based upon makeunicodedata.py |
michael@0 | 3 | # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py) |
michael@0 | 4 | # written by Fredrik Lundh (fredrik@pythonware.com) |
michael@0 | 5 | # |
michael@0 | 6 | # Copyright (C) 2011 Tom Schuster <evilpies@gmail.com> |
michael@0 | 7 | # |
michael@0 | 8 | # This program is free software: you can redistribute it and/or modify |
michael@0 | 9 | # it under the terms of the GNU General Public License as published by |
michael@0 | 10 | # the Free Software Foundation, either version 3 of the License, or |
michael@0 | 11 | # (at your option) any later version. |
michael@0 | 12 | # |
michael@0 | 13 | # This program is distributed in the hope that it will be useful, |
michael@0 | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
michael@0 | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
michael@0 | 16 | # GNU General Public License for more details. |
michael@0 | 17 | # |
michael@0 | 18 | # You should have received a copy of the GNU General Public License |
michael@0 | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
michael@0 | 20 | |
michael@0 | 21 | from __future__ import print_function |
michael@0 | 22 | import csv |
michael@0 | 23 | import sys |
michael@0 | 24 | |
michael@0 | 25 | # ECMAScript 5 $ 7.2 |
michael@0 | 26 | whitespace = [ |
michael@0 | 27 | # python doesn't support using control character names :( |
michael@0 | 28 | 0x9, # CHARACTER TABULATION |
michael@0 | 29 | 0xb, # LINE TABULATION |
michael@0 | 30 | 0xc, # FORM FEED |
michael@0 | 31 | ord(u'\N{SPACE}'), |
michael@0 | 32 | ord(u'\N{NO-BREAK SPACE}'), |
michael@0 | 33 | ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM |
michael@0 | 34 | ] |
michael@0 | 35 | |
michael@0 | 36 | # $ 7.3 |
michael@0 | 37 | line_terminator = [ |
michael@0 | 38 | 0xa, # LINE FEED |
michael@0 | 39 | 0xd, # CARRIAGE RETURN |
michael@0 | 40 | ord(u'\N{LINE SEPARATOR}'), |
michael@0 | 41 | ord(u'\N{PARAGRAPH SEPARATOR}'), |
michael@0 | 42 | ] |
michael@0 | 43 | |
michael@0 | 44 | # These are also part of IdentifierPart $7.6 |
michael@0 | 45 | ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}') |
michael@0 | 46 | ZWJ = ord(u'\N{ZERO WIDTH JOINER}') |
michael@0 | 47 | |
michael@0 | 48 | FLAG_SPACE = 1 << 0 |
michael@0 | 49 | FLAG_LETTER = 1 << 1 |
michael@0 | 50 | FLAG_IDENTIFIER_PART = 1 << 2 |
michael@0 | 51 | |
michael@0 | 52 | MAX = 0xffff |
michael@0 | 53 | |
michael@0 | 54 | public_domain = """ |
michael@0 | 55 | /* |
michael@0 | 56 | * Any copyright is dedicated to the Public Domain. |
michael@0 | 57 | * http://creativecommons.org/licenses/publicdomain/ |
michael@0 | 58 | */ |
michael@0 | 59 | """ |
michael@0 | 60 | |
michael@0 | 61 | def read_unicode_data(unicode_file): |
michael@0 | 62 | """ |
michael@0 | 63 | If you want to understand how this wonderful file format works checkout |
michael@0 | 64 | Unicode Standard Annex #44 - Unicode Character Database |
michael@0 | 65 | http://www.unicode.org/reports/tr44/ |
michael@0 | 66 | """ |
michael@0 | 67 | |
michael@0 | 68 | reader = csv.reader(unicode_data, delimiter=';') |
michael@0 | 69 | |
michael@0 | 70 | while True: |
michael@0 | 71 | row = reader.next() |
michael@0 | 72 | name = row[1] |
michael@0 | 73 | |
michael@0 | 74 | # We need to expand the UAX #44 4.2.3 Code Point Range |
michael@0 | 75 | if name.startswith('<') and name.endswith('First>'): |
michael@0 | 76 | next_row = reader.next() |
michael@0 | 77 | |
michael@0 | 78 | for i in range(int(row[0], 16), int(next_row[0], 16) + 1): |
michael@0 | 79 | row[0] = i |
michael@0 | 80 | row[1] = name[1:-8] |
michael@0 | 81 | |
michael@0 | 82 | yield row |
michael@0 | 83 | else: |
michael@0 | 84 | row[0] = int(row[0], 16) |
michael@0 | 85 | yield row |
michael@0 | 86 | |
michael@0 | 87 | def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space): |
michael@0 | 88 | dummy = (0, 0, 0) |
michael@0 | 89 | table = [dummy] |
michael@0 | 90 | cache = {dummy: 0} |
michael@0 | 91 | index = [0] * (MAX + 1) |
michael@0 | 92 | test_table = {} |
michael@0 | 93 | test_space_table = [] |
michael@0 | 94 | |
michael@0 | 95 | for row in read_unicode_data(unicode_data): |
michael@0 | 96 | code = row[0] |
michael@0 | 97 | name = row[1] |
michael@0 | 98 | category = row[2] |
michael@0 | 99 | alias = row[-5] |
michael@0 | 100 | uppercase = row[-3] |
michael@0 | 101 | lowercase = row[-2] |
michael@0 | 102 | flags = 0 |
michael@0 | 103 | |
michael@0 | 104 | if code > MAX: |
michael@0 | 105 | break |
michael@0 | 106 | |
michael@0 | 107 | # we combine whitespace and lineterminators because in pratice we don't need them separated |
michael@0 | 108 | if category == 'Zs' or code in whitespace or code in line_terminator: |
michael@0 | 109 | flags |= FLAG_SPACE |
michael@0 | 110 | test_space_table.append(code) |
michael@0 | 111 | if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter) |
michael@0 | 112 | flags |= FLAG_LETTER |
michael@0 | 113 | if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart) |
michael@0 | 114 | flags |= FLAG_IDENTIFIER_PART |
michael@0 | 115 | |
michael@0 | 116 | if uppercase: |
michael@0 | 117 | upper = int(uppercase, 16) |
michael@0 | 118 | else: |
michael@0 | 119 | upper = code |
michael@0 | 120 | |
michael@0 | 121 | if lowercase: |
michael@0 | 122 | lower = int(lowercase, 16) |
michael@0 | 123 | else: |
michael@0 | 124 | lower = code |
michael@0 | 125 | |
michael@0 | 126 | test_table[code] = (upper, lower, name, alias) |
michael@0 | 127 | |
michael@0 | 128 | up_d = upper - code |
michael@0 | 129 | low_d = lower - code |
michael@0 | 130 | |
michael@0 | 131 | assert up_d > -65535 and up_d < 65535 |
michael@0 | 132 | assert low_d > -65535 and low_d < 65535 |
michael@0 | 133 | |
michael@0 | 134 | upper = up_d & 0xffff |
michael@0 | 135 | lower = low_d & 0xffff |
michael@0 | 136 | |
michael@0 | 137 | item = (upper, lower, flags) |
michael@0 | 138 | |
michael@0 | 139 | i = cache.get(item) |
michael@0 | 140 | if i is None: |
michael@0 | 141 | assert item not in table |
michael@0 | 142 | cache[item] = i = len(table) |
michael@0 | 143 | table.append(item) |
michael@0 | 144 | index[code] = i |
michael@0 | 145 | |
michael@0 | 146 | test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') |
michael@0 | 147 | test_mapping.write(public_domain) |
michael@0 | 148 | test_mapping.write('var mapping = [\n') |
michael@0 | 149 | for code in range(0, MAX + 1): |
michael@0 | 150 | entry = test_table.get(code) |
michael@0 | 151 | |
michael@0 | 152 | if entry: |
michael@0 | 153 | upper, lower, name, alias = entry |
michael@0 | 154 | test_mapping.write(' [' + hex(upper) + ', ' + hex(lower) + '], /* ' + |
michael@0 | 155 | name + (' (' + alias + ')' if alias else '') + ' */\n') |
michael@0 | 156 | else: |
michael@0 | 157 | test_mapping.write(' [' + hex(code) + ', ' + hex(code) + '],\n') |
michael@0 | 158 | test_mapping.write('];') |
michael@0 | 159 | test_mapping.write(""" |
michael@0 | 160 | assertEq(mapping.length, 0x10000); |
michael@0 | 161 | for (var i = 0; i <= 0xffff; i++) { |
michael@0 | 162 | var char = String.fromCharCode(i); |
michael@0 | 163 | var info = mapping[i]; |
michael@0 | 164 | |
michael@0 | 165 | assertEq(char.toUpperCase().charCodeAt(0), info[0]); |
michael@0 | 166 | assertEq(char.toLowerCase().charCodeAt(0), info[1]); |
michael@0 | 167 | } |
michael@0 | 168 | |
michael@0 | 169 | if (typeof reportCompare === "function") |
michael@0 | 170 | reportCompare(true, true); |
michael@0 | 171 | """) |
michael@0 | 172 | |
michael@0 | 173 | test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') |
michael@0 | 174 | test_space.write(public_domain) |
michael@0 | 175 | test_space.write('var onlySpace = String.fromCharCode(' + |
michael@0 | 176 | ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n') |
michael@0 | 177 | test_space.write(""" |
michael@0 | 178 | assertEq(onlySpace.trim(), ""); |
michael@0 | 179 | assertEq((onlySpace + 'aaaa').trim(), 'aaaa'); |
michael@0 | 180 | assertEq(('aaaa' + onlySpace).trim(), 'aaaa'); |
michael@0 | 181 | assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa'); |
michael@0 | 182 | |
michael@0 | 183 | if (typeof reportCompare === "function") |
michael@0 | 184 | reportCompare(true, true); |
michael@0 | 185 | """) |
michael@0 | 186 | |
michael@0 | 187 | index1, index2, shift = splitbins(index) |
michael@0 | 188 | |
michael@0 | 189 | # Don't forget to update CharInfo in Unicode.cpp if you need to change this |
michael@0 | 190 | assert shift == 5 |
michael@0 | 191 | |
michael@0 | 192 | # verify correctness |
michael@0 | 193 | for char in index: |
michael@0 | 194 | test = table[index[char]] |
michael@0 | 195 | |
michael@0 | 196 | idx = index1[char >> shift] |
michael@0 | 197 | idx = index2[(idx << shift) + (char & ((1 << shift) - 1))] |
michael@0 | 198 | |
michael@0 | 199 | assert test == table[idx] |
michael@0 | 200 | |
michael@0 | 201 | |
michael@0 | 202 | comment = """ |
michael@0 | 203 | /* |
michael@0 | 204 | * So how does indexing work? |
michael@0 | 205 | * First let's have a look at a jschar, 16-bits: |
michael@0 | 206 | * [................] |
michael@0 | 207 | * Step 1: |
michael@0 | 208 | * Extracting the upper 11 bits from the jschar. |
michael@0 | 209 | * upper = char >> 5 ([***********.....]) |
michael@0 | 210 | * Step 2: |
michael@0 | 211 | * Using these bits to get an reduced index from index1. |
michael@0 | 212 | * index = index1[upper] |
michael@0 | 213 | * Step 3: |
michael@0 | 214 | * Combining the index and the bottom 5 bits of the original jschar. |
michael@0 | 215 | * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++]) |
michael@0 | 216 | * |
michael@0 | 217 | * The advantage here is that the biggest number in index1 doesn't need 10 bits, |
michael@0 | 218 | * but 7 and we save some memory. |
michael@0 | 219 | * |
michael@0 | 220 | * Step 4: |
michael@0 | 221 | * Get the character informations by looking up real_index in js_charinfo. |
michael@0 | 222 | * |
michael@0 | 223 | * Pseudocode of generation: |
michael@0 | 224 | * |
michael@0 | 225 | * let table be the mapping of jschar => js_charinfo_index |
michael@0 | 226 | * let index1 be an empty array |
michael@0 | 227 | * let index2 be an empty array |
michael@0 | 228 | * let cache be a hash map |
michael@0 | 229 | * |
michael@0 | 230 | * while shift is less then maximal amount you can shift 0xffff before it's 0 |
michael@0 | 231 | * let chunks be table split in chunks of size 2**shift |
michael@0 | 232 | * |
michael@0 | 233 | * for every chunk in chunks |
michael@0 | 234 | * if chunk is in cache |
michael@0 | 235 | * let index be cache[chunk] |
michael@0 | 236 | * else |
michael@0 | 237 | * let index be the max key of index2 + 1 |
michael@0 | 238 | * for element in chunk |
michael@0 | 239 | * push element to index2 |
michael@0 | 240 | * put index as chunk in cache |
michael@0 | 241 | * |
michael@0 | 242 | * push index >> shift to index1 |
michael@0 | 243 | * |
michael@0 | 244 | * increase shift |
michael@0 | 245 | * stop if you found the best shift |
michael@0 | 246 | */ |
michael@0 | 247 | """ |
michael@0 | 248 | data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') |
michael@0 | 249 | data_file.write(public_domain) |
michael@0 | 250 | data_file.write('#include "vm/Unicode.h"\n\n') |
michael@0 | 251 | data_file.write('using namespace js;\n') |
michael@0 | 252 | data_file.write('using namespace js::unicode;\n') |
michael@0 | 253 | data_file.write(comment) |
michael@0 | 254 | data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n') |
michael@0 | 255 | for d in table: |
michael@0 | 256 | data_file.write(' {') |
michael@0 | 257 | data_file.write(', '.join((str(e) for e in d))) |
michael@0 | 258 | data_file.write('},\n') |
michael@0 | 259 | data_file.write('};\n') |
michael@0 | 260 | data_file.write('\n') |
michael@0 | 261 | |
michael@0 | 262 | def dump(data, name, file): |
michael@0 | 263 | file.write('const uint8_t unicode::' + name + '[] = {\n') |
michael@0 | 264 | |
michael@0 | 265 | line = pad = ' ' * 4 |
michael@0 | 266 | lines = [] |
michael@0 | 267 | for entry in data: |
michael@0 | 268 | assert entry < 256 |
michael@0 | 269 | s = str(entry) |
michael@0 | 270 | s = s.rjust(3) |
michael@0 | 271 | |
michael@0 | 272 | if len(line + s) + 5 > 99: |
michael@0 | 273 | lines.append(line.rstrip()) |
michael@0 | 274 | line = pad + s + ', ' |
michael@0 | 275 | else: |
michael@0 | 276 | line = line + s + ', ' |
michael@0 | 277 | lines.append(line.rstrip()) |
michael@0 | 278 | |
michael@0 | 279 | file.write('\n'.join(lines)) |
michael@0 | 280 | file.write('\n};\n') |
michael@0 | 281 | |
michael@0 | 282 | dump(index1, 'index1', data_file) |
michael@0 | 283 | data_file.write('\n') |
michael@0 | 284 | dump(index2, 'index2', data_file) |
michael@0 | 285 | data_file.write('\n') |
michael@0 | 286 | |
michael@0 | 287 | data_file.write('\n') |
michael@0 | 288 | |
michael@0 | 289 | def getsize(data): |
michael@0 | 290 | """ return smallest possible integer size for the given array """ |
michael@0 | 291 | maxdata = max(data) |
michael@0 | 292 | assert maxdata < 2**32 |
michael@0 | 293 | |
michael@0 | 294 | if maxdata < 256: |
michael@0 | 295 | return 1 |
michael@0 | 296 | elif maxdata < 65536: |
michael@0 | 297 | return 2 |
michael@0 | 298 | else: |
michael@0 | 299 | return 4 |
michael@0 | 300 | |
michael@0 | 301 | def splitbins(t): |
michael@0 | 302 | """t -> (t1, t2, shift). Split a table to save space. |
michael@0 | 303 | |
michael@0 | 304 | t is a sequence of ints. This function can be useful to save space if |
michael@0 | 305 | many of the ints are the same. t1 and t2 are lists of ints, and shift |
michael@0 | 306 | is an int, chosen to minimize the combined size of t1 and t2 (in C |
michael@0 | 307 | code), and where for each i in range(len(t)), |
michael@0 | 308 | t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] |
michael@0 | 309 | where mask is a bitmask isolating the last "shift" bits. |
michael@0 | 310 | """ |
michael@0 | 311 | |
michael@0 | 312 | def dump(t1, t2, shift, bytes): |
michael@0 | 313 | print("%d+%d bins at shift %d; %d bytes" % ( |
michael@0 | 314 | len(t1), len(t2), shift, bytes), file=sys.stderr) |
michael@0 | 315 | print("Size of original table:", len(t)*getsize(t), \ |
michael@0 | 316 | "bytes", file=sys.stderr) |
michael@0 | 317 | n = len(t)-1 # last valid index |
michael@0 | 318 | maxshift = 0 # the most we can shift n and still have something left |
michael@0 | 319 | if n > 0: |
michael@0 | 320 | while n >> 1: |
michael@0 | 321 | n >>= 1 |
michael@0 | 322 | maxshift += 1 |
michael@0 | 323 | del n |
michael@0 | 324 | bytes = sys.maxsize # smallest total size so far |
michael@0 | 325 | t = tuple(t) # so slices can be dict keys |
michael@0 | 326 | for shift in range(maxshift + 1): |
michael@0 | 327 | t1 = [] |
michael@0 | 328 | t2 = [] |
michael@0 | 329 | size = 2**shift |
michael@0 | 330 | bincache = {} |
michael@0 | 331 | |
michael@0 | 332 | for i in range(0, len(t), size): |
michael@0 | 333 | bin = t[i:i + size] |
michael@0 | 334 | |
michael@0 | 335 | index = bincache.get(bin) |
michael@0 | 336 | if index is None: |
michael@0 | 337 | index = len(t2) |
michael@0 | 338 | bincache[bin] = index |
michael@0 | 339 | t2.extend(bin) |
michael@0 | 340 | t1.append(index >> shift) |
michael@0 | 341 | |
michael@0 | 342 | # determine memory size |
michael@0 | 343 | b = len(t1) * getsize(t1) + len(t2) * getsize(t2) |
michael@0 | 344 | if b < bytes: |
michael@0 | 345 | best = t1, t2, shift |
michael@0 | 346 | bytes = b |
michael@0 | 347 | t1, t2, shift = best |
michael@0 | 348 | |
michael@0 | 349 | print("Best:", end=' ', file=sys.stderr) |
michael@0 | 350 | dump(t1, t2, shift, bytes) |
michael@0 | 351 | |
michael@0 | 352 | # exhaustively verify that the decomposition is correct |
michael@0 | 353 | mask = 2**shift - 1 |
michael@0 | 354 | for i in range(len(t)): |
michael@0 | 355 | assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] |
michael@0 | 356 | return best |
michael@0 | 357 | |
michael@0 | 358 | if __name__ == '__main__': |
michael@0 | 359 | import urllib2 |
michael@0 | 360 | |
michael@0 | 361 | if len(sys.argv) > 1: |
michael@0 | 362 | print('Always make sure you have the newest UnicodeData.txt!') |
michael@0 | 363 | unicode_data = open(sys.argv[1], 'r') |
michael@0 | 364 | else: |
michael@0 | 365 | print('Downloading...') |
michael@0 | 366 | reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt') |
michael@0 | 367 | data = reader.read() |
michael@0 | 368 | reader.close() |
michael@0 | 369 | unicode_data = open('UnicodeData.txt', 'w+') |
michael@0 | 370 | unicode_data.write(data) |
michael@0 | 371 | unicode_data.seek(0) |
michael@0 | 372 | |
michael@0 | 373 | print('Generating...') |
michael@0 | 374 | generate_unicode_stuff(unicode_data, |
michael@0 | 375 | open('Unicode.cpp', 'w'), |
michael@0 | 376 | open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'), |
michael@0 | 377 | open('../tests/ecma_5/String/string-space-trim.js', 'w')) |