|
1 # -*- coding: utf-8 -*- |
|
2 # Based upon makeunicodedata.py |
|
3 # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py) |
|
4 # written by Fredrik Lundh (fredrik@pythonware.com) |
|
5 # |
|
6 # Copyright (C) 2011 Tom Schuster <evilpies@gmail.com> |
|
7 # |
|
8 # This program is free software: you can redistribute it and/or modify |
|
9 # it under the terms of the GNU General Public License as published by |
|
10 # the Free Software Foundation, either version 3 of the License, or |
|
11 # (at your option) any later version. |
|
12 # |
|
13 # This program is distributed in the hope that it will be useful, |
|
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
16 # GNU General Public License for more details. |
|
17 # |
|
18 # You should have received a copy of the GNU General Public License |
|
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
20 |
|
21 from __future__ import print_function |
|
22 import csv |
|
23 import sys |
|
24 |
|
25 # ECMAScript 5 $ 7.2 |
|
26 whitespace = [ |
|
27 # python doesn't support using control character names :( |
|
28 0x9, # CHARACTER TABULATION |
|
29 0xb, # LINE TABULATION |
|
30 0xc, # FORM FEED |
|
31 ord(u'\N{SPACE}'), |
|
32 ord(u'\N{NO-BREAK SPACE}'), |
|
33 ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM |
|
34 ] |
|
35 |
|
36 # $ 7.3 |
|
37 line_terminator = [ |
|
38 0xa, # LINE FEED |
|
39 0xd, # CARRIAGE RETURN |
|
40 ord(u'\N{LINE SEPARATOR}'), |
|
41 ord(u'\N{PARAGRAPH SEPARATOR}'), |
|
42 ] |
|
43 |
|
44 # These are also part of IdentifierPart $7.6 |
|
45 ZWNJ = ord(u'\N{ZERO WIDTH NON-JOINER}') |
|
46 ZWJ = ord(u'\N{ZERO WIDTH JOINER}') |
|
47 |
|
48 FLAG_SPACE = 1 << 0 |
|
49 FLAG_LETTER = 1 << 1 |
|
50 FLAG_IDENTIFIER_PART = 1 << 2 |
|
51 |
|
52 MAX = 0xffff |
|
53 |
|
54 public_domain = """ |
|
55 /* |
|
56 * Any copyright is dedicated to the Public Domain. |
|
57 * http://creativecommons.org/licenses/publicdomain/ |
|
58 */ |
|
59 """ |
|
60 |
|
61 def read_unicode_data(unicode_file): |
|
62 """ |
|
63 If you want to understand how this wonderful file format works checkout |
|
64 Unicode Standard Annex #44 - Unicode Character Database |
|
65 http://www.unicode.org/reports/tr44/ |
|
66 """ |
|
67 |
|
68 reader = csv.reader(unicode_data, delimiter=';') |
|
69 |
|
70 while True: |
|
71 row = reader.next() |
|
72 name = row[1] |
|
73 |
|
74 # We need to expand the UAX #44 4.2.3 Code Point Range |
|
75 if name.startswith('<') and name.endswith('First>'): |
|
76 next_row = reader.next() |
|
77 |
|
78 for i in range(int(row[0], 16), int(next_row[0], 16) + 1): |
|
79 row[0] = i |
|
80 row[1] = name[1:-8] |
|
81 |
|
82 yield row |
|
83 else: |
|
84 row[0] = int(row[0], 16) |
|
85 yield row |
|
86 |
|
87 def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space): |
|
88 dummy = (0, 0, 0) |
|
89 table = [dummy] |
|
90 cache = {dummy: 0} |
|
91 index = [0] * (MAX + 1) |
|
92 test_table = {} |
|
93 test_space_table = [] |
|
94 |
|
95 for row in read_unicode_data(unicode_data): |
|
96 code = row[0] |
|
97 name = row[1] |
|
98 category = row[2] |
|
99 alias = row[-5] |
|
100 uppercase = row[-3] |
|
101 lowercase = row[-2] |
|
102 flags = 0 |
|
103 |
|
104 if code > MAX: |
|
105 break |
|
106 |
|
107 # we combine whitespace and lineterminators because in pratice we don't need them separated |
|
108 if category == 'Zs' or code in whitespace or code in line_terminator: |
|
109 flags |= FLAG_SPACE |
|
110 test_space_table.append(code) |
|
111 if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter) |
|
112 flags |= FLAG_LETTER |
|
113 if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart) |
|
114 flags |= FLAG_IDENTIFIER_PART |
|
115 |
|
116 if uppercase: |
|
117 upper = int(uppercase, 16) |
|
118 else: |
|
119 upper = code |
|
120 |
|
121 if lowercase: |
|
122 lower = int(lowercase, 16) |
|
123 else: |
|
124 lower = code |
|
125 |
|
126 test_table[code] = (upper, lower, name, alias) |
|
127 |
|
128 up_d = upper - code |
|
129 low_d = lower - code |
|
130 |
|
131 assert up_d > -65535 and up_d < 65535 |
|
132 assert low_d > -65535 and low_d < 65535 |
|
133 |
|
134 upper = up_d & 0xffff |
|
135 lower = low_d & 0xffff |
|
136 |
|
137 item = (upper, lower, flags) |
|
138 |
|
139 i = cache.get(item) |
|
140 if i is None: |
|
141 assert item not in table |
|
142 cache[item] = i = len(table) |
|
143 table.append(item) |
|
144 index[code] = i |
|
145 |
|
146 test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') |
|
147 test_mapping.write(public_domain) |
|
148 test_mapping.write('var mapping = [\n') |
|
149 for code in range(0, MAX + 1): |
|
150 entry = test_table.get(code) |
|
151 |
|
152 if entry: |
|
153 upper, lower, name, alias = entry |
|
154 test_mapping.write(' [' + hex(upper) + ', ' + hex(lower) + '], /* ' + |
|
155 name + (' (' + alias + ')' if alias else '') + ' */\n') |
|
156 else: |
|
157 test_mapping.write(' [' + hex(code) + ', ' + hex(code) + '],\n') |
|
158 test_mapping.write('];') |
|
159 test_mapping.write(""" |
|
160 assertEq(mapping.length, 0x10000); |
|
161 for (var i = 0; i <= 0xffff; i++) { |
|
162 var char = String.fromCharCode(i); |
|
163 var info = mapping[i]; |
|
164 |
|
165 assertEq(char.toUpperCase().charCodeAt(0), info[0]); |
|
166 assertEq(char.toLowerCase().charCodeAt(0), info[1]); |
|
167 } |
|
168 |
|
169 if (typeof reportCompare === "function") |
|
170 reportCompare(true, true); |
|
171 """) |
|
172 |
|
173 test_space.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') |
|
174 test_space.write(public_domain) |
|
175 test_space.write('var onlySpace = String.fromCharCode(' + |
|
176 ', '.join(map(lambda c: hex(c), test_space_table)) + ');\n') |
|
177 test_space.write(""" |
|
178 assertEq(onlySpace.trim(), ""); |
|
179 assertEq((onlySpace + 'aaaa').trim(), 'aaaa'); |
|
180 assertEq(('aaaa' + onlySpace).trim(), 'aaaa'); |
|
181 assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa'); |
|
182 |
|
183 if (typeof reportCompare === "function") |
|
184 reportCompare(true, true); |
|
185 """) |
|
186 |
|
187 index1, index2, shift = splitbins(index) |
|
188 |
|
189 # Don't forget to update CharInfo in Unicode.cpp if you need to change this |
|
190 assert shift == 5 |
|
191 |
|
192 # verify correctness |
|
193 for char in index: |
|
194 test = table[index[char]] |
|
195 |
|
196 idx = index1[char >> shift] |
|
197 idx = index2[(idx << shift) + (char & ((1 << shift) - 1))] |
|
198 |
|
199 assert test == table[idx] |
|
200 |
|
201 |
|
202 comment = """ |
|
203 /* |
|
204 * So how does indexing work? |
|
205 * First let's have a look at a jschar, 16-bits: |
|
206 * [................] |
|
207 * Step 1: |
|
208 * Extracting the upper 11 bits from the jschar. |
|
209 * upper = char >> 5 ([***********.....]) |
|
210 * Step 2: |
|
211 * Using these bits to get an reduced index from index1. |
|
212 * index = index1[upper] |
|
213 * Step 3: |
|
214 * Combining the index and the bottom 5 bits of the original jschar. |
|
215 * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++]) |
|
216 * |
|
217 * The advantage here is that the biggest number in index1 doesn't need 10 bits, |
|
218 * but 7 and we save some memory. |
|
219 * |
|
220 * Step 4: |
|
221 * Get the character informations by looking up real_index in js_charinfo. |
|
222 * |
|
223 * Pseudocode of generation: |
|
224 * |
|
225 * let table be the mapping of jschar => js_charinfo_index |
|
226 * let index1 be an empty array |
|
227 * let index2 be an empty array |
|
228 * let cache be a hash map |
|
229 * |
|
230 * while shift is less then maximal amount you can shift 0xffff before it's 0 |
|
231 * let chunks be table split in chunks of size 2**shift |
|
232 * |
|
233 * for every chunk in chunks |
|
234 * if chunk is in cache |
|
235 * let index be cache[chunk] |
|
236 * else |
|
237 * let index be the max key of index2 + 1 |
|
238 * for element in chunk |
|
239 * push element to index2 |
|
240 * put index as chunk in cache |
|
241 * |
|
242 * push index >> shift to index1 |
|
243 * |
|
244 * increase shift |
|
245 * stop if you found the best shift |
|
246 */ |
|
247 """ |
|
248 data_file.write('/* Generated by make_unicode.py DO NOT MODIFY */\n') |
|
249 data_file.write(public_domain) |
|
250 data_file.write('#include "vm/Unicode.h"\n\n') |
|
251 data_file.write('using namespace js;\n') |
|
252 data_file.write('using namespace js::unicode;\n') |
|
253 data_file.write(comment) |
|
254 data_file.write('const CharacterInfo unicode::js_charinfo[] = {\n') |
|
255 for d in table: |
|
256 data_file.write(' {') |
|
257 data_file.write(', '.join((str(e) for e in d))) |
|
258 data_file.write('},\n') |
|
259 data_file.write('};\n') |
|
260 data_file.write('\n') |
|
261 |
|
262 def dump(data, name, file): |
|
263 file.write('const uint8_t unicode::' + name + '[] = {\n') |
|
264 |
|
265 line = pad = ' ' * 4 |
|
266 lines = [] |
|
267 for entry in data: |
|
268 assert entry < 256 |
|
269 s = str(entry) |
|
270 s = s.rjust(3) |
|
271 |
|
272 if len(line + s) + 5 > 99: |
|
273 lines.append(line.rstrip()) |
|
274 line = pad + s + ', ' |
|
275 else: |
|
276 line = line + s + ', ' |
|
277 lines.append(line.rstrip()) |
|
278 |
|
279 file.write('\n'.join(lines)) |
|
280 file.write('\n};\n') |
|
281 |
|
282 dump(index1, 'index1', data_file) |
|
283 data_file.write('\n') |
|
284 dump(index2, 'index2', data_file) |
|
285 data_file.write('\n') |
|
286 |
|
287 data_file.write('\n') |
|
288 |
|
289 def getsize(data): |
|
290 """ return smallest possible integer size for the given array """ |
|
291 maxdata = max(data) |
|
292 assert maxdata < 2**32 |
|
293 |
|
294 if maxdata < 256: |
|
295 return 1 |
|
296 elif maxdata < 65536: |
|
297 return 2 |
|
298 else: |
|
299 return 4 |
|
300 |
|
301 def splitbins(t): |
|
302 """t -> (t1, t2, shift). Split a table to save space. |
|
303 |
|
304 t is a sequence of ints. This function can be useful to save space if |
|
305 many of the ints are the same. t1 and t2 are lists of ints, and shift |
|
306 is an int, chosen to minimize the combined size of t1 and t2 (in C |
|
307 code), and where for each i in range(len(t)), |
|
308 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] |
|
309 where mask is a bitmask isolating the last "shift" bits. |
|
310 """ |
|
311 |
|
312 def dump(t1, t2, shift, bytes): |
|
313 print("%d+%d bins at shift %d; %d bytes" % ( |
|
314 len(t1), len(t2), shift, bytes), file=sys.stderr) |
|
315 print("Size of original table:", len(t)*getsize(t), \ |
|
316 "bytes", file=sys.stderr) |
|
317 n = len(t)-1 # last valid index |
|
318 maxshift = 0 # the most we can shift n and still have something left |
|
319 if n > 0: |
|
320 while n >> 1: |
|
321 n >>= 1 |
|
322 maxshift += 1 |
|
323 del n |
|
324 bytes = sys.maxsize # smallest total size so far |
|
325 t = tuple(t) # so slices can be dict keys |
|
326 for shift in range(maxshift + 1): |
|
327 t1 = [] |
|
328 t2 = [] |
|
329 size = 2**shift |
|
330 bincache = {} |
|
331 |
|
332 for i in range(0, len(t), size): |
|
333 bin = t[i:i + size] |
|
334 |
|
335 index = bincache.get(bin) |
|
336 if index is None: |
|
337 index = len(t2) |
|
338 bincache[bin] = index |
|
339 t2.extend(bin) |
|
340 t1.append(index >> shift) |
|
341 |
|
342 # determine memory size |
|
343 b = len(t1) * getsize(t1) + len(t2) * getsize(t2) |
|
344 if b < bytes: |
|
345 best = t1, t2, shift |
|
346 bytes = b |
|
347 t1, t2, shift = best |
|
348 |
|
349 print("Best:", end=' ', file=sys.stderr) |
|
350 dump(t1, t2, shift, bytes) |
|
351 |
|
352 # exhaustively verify that the decomposition is correct |
|
353 mask = 2**shift - 1 |
|
354 for i in range(len(t)): |
|
355 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] |
|
356 return best |
|
357 |
|
358 if __name__ == '__main__': |
|
359 import urllib2 |
|
360 |
|
361 if len(sys.argv) > 1: |
|
362 print('Always make sure you have the newest UnicodeData.txt!') |
|
363 unicode_data = open(sys.argv[1], 'r') |
|
364 else: |
|
365 print('Downloading...') |
|
366 reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt') |
|
367 data = reader.read() |
|
368 reader.close() |
|
369 unicode_data = open('UnicodeData.txt', 'w+') |
|
370 unicode_data.write(data) |
|
371 unicode_data.seek(0) |
|
372 |
|
373 print('Generating...') |
|
374 generate_unicode_stuff(unicode_data, |
|
375 open('Unicode.cpp', 'w'), |
|
376 open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'), |
|
377 open('../tests/ecma_5/String/string-space-trim.js', 'w')) |