|
1 #!/usr/bin/env perl |
|
2 |
|
3 # This Source Code Form is subject to the terms of the Mozilla Public |
|
4 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
6 |
|
7 # This tool is used to prepare lookup tables of Unicode character properties |
|
8 # needed by gfx code to support text shaping operations. The properties are |
|
9 # read from the Unicode Character Database and compiled into multi-level arrays |
|
10 # for efficient lookup. |
|
11 # |
|
12 # To regenerate the tables in nsUnicodePropertyData.cpp: |
|
13 # |
|
14 # (1) Download the current Unicode data files from |
|
15 # |
|
16 # http://www.unicode.org/Public/UNIDATA/ |
|
17 # |
|
18 # NB: not all the files are actually needed; currently, we require |
|
19 # - UnicodeData.txt |
|
20 # - Scripts.txt |
|
21 # - EastAsianWidth.txt |
|
22 # - BidiMirroring.txt |
|
23 # - HangulSyllableType.txt |
|
24 # - ReadMe.txt (to record version/date of the UCD) |
|
25 # - Unihan_Variants.txt (from Unihan.zip) |
|
26 # though this may change if we find a need for additional properties. |
|
27 # |
|
28 # The Unicode data files listed above should be together in one directory. |
|
29 # We also require the file |
|
30 # http://www.unicode.org/Public/security/latest/xidmodifications.txt |
|
31 # This file should be in a sub-directory "security" immediately below the |
|
32 # directory containing the other Unicode data files. |
|
33 # |
|
34 # (2) Run this tool using a command line of the form |
|
35 # |
|
36 # perl genUnicodePropertyData.pl \ |
|
37 # /path/to/harfbuzz/src \ |
|
38 # /path/to/UCD-directory |
|
39 # |
|
40 # This will generate (or overwrite!) the files |
|
41 # |
|
42 # nsUnicodePropertyData.cpp |
|
43 # nsUnicodeScriptCodes.h |
|
44 # |
|
45 # in the current directory. |
|
46 |
|
47 use strict; |
|
48 use List::Util qw(first); |
|
49 |
|
# Exactly two command-line arguments are expected (the HarfBuzz source
# directory and the UCD directory); with any other count, print the usage
# text and stop.
unless (scalar(@ARGV) == 2) {
    my $usage = <<__EOT;
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \
# /path/to/harfbuzz/src \
# /path/to/UCD-directory
#
# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/
#
# This will generate (or overwrite!) the files
#
# nsUnicodePropertyData.cpp
# nsUnicodeScriptCodes.h
#
# in the current directory.
__EOT
    print $usage;
    exit 0;
}
|
72 |
|
73 # load HB_Script and HB_Category constants |
|
74 |
|
75 # NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated |
|
76 # script codes as used by Glib/Pango/etc. |
|
77 # We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_ |
|
78 # compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed. |
|
79 |
|
80 # CHECK that this matches Pango source (as found for example at |
|
81 # http://git.gnome.org/browse/pango/tree/pango/pango-script.h) |
|
82 # for as many codes as that defines (currently up through Unicode 5.1) |
|
83 # and the GLib enumeration |
|
84 # http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript |
|
85 # (currently defined up through Unicode 6.0). |
|
86 # Constants beyond these may be regarded as unstable for now, but we don't actually |
|
87 # depend on the specific values. |
|
# Map from script name to MOZ_SCRIPT_* code.  The names are listed in code
# order, so each script's code is simply its position in the list; INVALID
# is the only entry outside that sequence.
my %scriptCode = do {
    my @namesInCodeOrder = (
        # codes 0..45
        qw(COMMON INHERITED ARABIC ARMENIAN BENGALI BOPOMOFO CHEROKEE COPTIC
           CYRILLIC DESERET DEVANAGARI ETHIOPIC GEORGIAN GOTHIC GREEK
           GUJARATI GURMUKHI HAN HANGUL HEBREW HIRAGANA KANNADA KATAKANA
           KHMER LAO LATIN MALAYALAM MONGOLIAN MYANMAR OGHAM OLD_ITALIC
           ORIYA RUNIC SINHALA SYRIAC TAMIL TELUGU THAANA THAI TIBETAN
           CANADIAN_ABORIGINAL YI TAGALOG HANUNOO BUHID TAGBANWA),
        # unicode 4.0 additions (46..53)
        qw(BRAILLE CYPRIOT LIMBU OSMANYA SHAVIAN LINEAR_B TAI_LE UGARITIC),
        # unicode 4.1 additions (54..60)
        qw(NEW_TAI_LUE BUGINESE GLAGOLITIC TIFINAGH SYLOTI_NAGRI OLD_PERSIAN
           KHAROSHTHI),
        # unicode 5.0 additions (61..66)
        qw(UNKNOWN BALINESE CUNEIFORM PHOENICIAN PHAGS_PA NKO),
        # unicode 5.1 additions (67..77)
        qw(KAYAH_LI LEPCHA REJANG SUNDANESE SAURASHTRA CHAM OL_CHIKI VAI
           CARIAN LYCIAN LYDIAN),
        # unicode 5.2 additions (78..92)
        qw(AVESTAN BAMUM EGYPTIAN_HIEROGLYPHS IMPERIAL_ARAMAIC
           INSCRIPTIONAL_PAHLAVI INSCRIPTIONAL_PARTHIAN JAVANESE KAITHI LISU
           MEETEI_MAYEK OLD_SOUTH_ARABIAN OLD_TURKIC SAMARITAN TAI_THAM
           TAI_VIET),
        # unicode 6.0 additions (93..95)
        qw(BATAK BRAHMI MANDAIC),
        # unicode 6.1 additions (96..102)
        qw(CHAKMA MEROITIC_CURSIVE MEROITIC_HIEROGLYPHS MIAO SHARADA
           SORA_SOMPENG TAKRI),
    );
    (INVALID => -1,
     map { ($namesInCodeOrder[$_], $_) } 0 .. $#namesInCodeOrder);
};
|
201 |
|
# State filled in by readHarfBuzzHeader: the most recent script code seen and
# the running general-category counter (both -1 until something is found),
# plus the lookup tables built from the headers.
my ($sc, $cc) = (-1, -1);
my (%catCode, @scriptCodeToTag, @scriptCodeToName);
|
207 |
|
# Scan one HarfBuzz header (named relative to the directory in $ARGV[0]) for
# script and general-category definitions.
#
#   $file - header file name, e.g. "hb-common.h"
#
# For every "HB_SCRIPT_<NAME> = HB_TAG('a','b','c','d')," line whose NAME is
# known to %scriptCode, records the tag text and name in @scriptCodeToTag and
# @scriptCodeToName (indexed by script code) and leaves that code in $sc.
# Unknown script names only produce a warning.  Every line mentioning
# HB_UNICODE_GENERAL_CATEGORY_<NAME> bumps $cc and stores it in %catCode.
# Dies if the header cannot be opened.
sub readHarfBuzzHeader
{
    my $file = shift;
    my $path = "$ARGV[0]/$file";
    # three-argument open with a lexical handle: the path is never parsed
    # for mode characters and the handle cannot leak into other code
    open(my $fh, '<', $path)
        or die "can't open harfbuzz header $path\n";
    while (my $line = <$fh>) {
        if ($line =~ m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
            my ($name, $tag) = ($1, $2);
            unless (exists $scriptCode{$name}) {
                warn "unknown script name $name found in $file\n";
                next;
            }
            $sc = $scriptCode{$name};
            $scriptCodeToTag[$sc] = $tag;
            $scriptCodeToName[$sc] = $name;
        }
        if ($line =~ m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
            $cc++;
            $catCode{$1} = $cc;
        }
    }
    close $fh;
}
|
229 |
|
# Pull the script tags and category codes out of the two HarfBuzz headers,
# then make sure both kinds of definition were actually found.
for my $header ("hb-common.h", "hb-unicode.h") {
    readHarfBuzzHeader($header);
}

if ($sc == -1) {
    die "didn't find HarfBuzz script codes\n";
}
if ($cc == -1) {
    die "didn't find HarfBuzz category codes\n";
}
|
235 |
|
# Identifier-modification type names, listed in code order: each name's code
# is its position in the list.
my %xidmodCode = do {
    my @typesInCodeOrder = (
        'inclusion',
        'recommended',
        'default-ignorable',
        'historic',
        'limited-use',
        'not-NFKC',
        'not-xid',
        'obsolete',
        'technical',
        'not-chars',
    );
    map { ($typesInCodeOrder[$_], $_) } 0 .. $#typesInCodeOrder;
};
|
248 |
|
# Bidi category abbreviations, listed in code order.  The codes are stored as
# decimal strings, one per position in the list.
my %bidicategoryCode = do {
    my @classesInCodeOrder = (
        "L",    # Left-to-Right
        "R",    # Right-to-Left
        "EN",   # European Number
        "ES",   # European Number Separator
        "ET",   # European Number Terminator
        "AN",   # Arabic Number
        "CS",   # Common Number Separator
        "B",    # Paragraph Separator
        "S",    # Segment Separator
        "WS",   # Whitespace
        "ON",   # Other Neutrals
        "LRE",  # Left-to-Right Embedding
        "LRO",  # Left-to-Right Override
        "AL",   # Right-to-Left Arabic
        "RLE",  # Right-to-Left Embedding
        "RLO",  # Right-to-Left Override
        "PDF",  # Pop Directional Format
        "NSM",  # Non-Spacing Mark
        "BN",   # Boundary Neutral
    );
    map { ($classesInCodeOrder[$_], "$_") } 0 .. $#classesInCodeOrder;
};
|
270 |
|
# initialize default properties
#
# Every table is indexed by code point.  @eaw, @mirror and @hangul start
# empty; the others are pre-filled for all 0x110000 code points.
my (@eaw, @mirror, @hangul);
my @script       = ($scriptCode{"UNKNOWN"})   x 0x110000;
my @category     = ($catCode{"UNASSIGNED"})   x 0x110000;
my @combining    = (0)                        x 0x110000;
my @casemap      = (0)                        x 0x110000;
my @xidmod       = ($xidmodCode{"not-chars"}) x 0x110000;
my @numericvalue = (-1)                       x 0x110000;
my @hanVariant   = (0)                        x 0x110000;
my @bidicategory = ($bidicategoryCode{"L"})   x 0x110000;
my @fullWidth    = (0)                        x 0x110000;

# blocks where the default for bidi category is not L
$bidicategory[$_] = $bidicategoryCode{"AL"}
    for (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF,
         0xFE70..0xFEFF, 0x1EE00..0x0001EEFF);
$bidicategory[$_] = $bidicategoryCode{"R"}
    for (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F,
         0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF,
         0x0001EF00..0x0001EFFF);
$bidicategory[$_] = $bidicategoryCode{"ET"}
    for (0x20A0..0x20CF);
|
306 |
|
# Map from the two-letter UCD General_Category abbreviation to the suffix of
# the matching HB_UNICODE_GENERAL_CATEGORY_* constant (the key used in
# %catCode).  Written as abbreviation/name pairs, grouped by major class.
my %ucd2hb = qw(
    Cc CONTROL
    Cf FORMAT
    Cn UNASSIGNED
    Co PRIVATE_USE
    Cs SURROGATE

    Ll LOWERCASE_LETTER
    Lm MODIFIER_LETTER
    Lo OTHER_LETTER
    Lt TITLECASE_LETTER
    Lu UPPERCASE_LETTER

    Mc SPACING_MARK
    Me ENCLOSING_MARK
    Mn NON_SPACING_MARK

    Nd DECIMAL_NUMBER
    Nl LETTER_NUMBER
    No OTHER_NUMBER

    Pc CONNECT_PUNCTUATION
    Pd DASH_PUNCTUATION
    Pe CLOSE_PUNCTUATION
    Pf FINAL_PUNCTUATION
    Pi INITIAL_PUNCTUATION
    Po OTHER_PUNCTUATION
    Ps OPEN_PUNCTUATION

    Sc CURRENCY_SYMBOL
    Sk MODIFIER_SYMBOL
    Sm MATH_SYMBOL
    So OTHER_SYMBOL

    Zl LINE_SEPARATOR
    Zp PARAGRAPH_SEPARATOR
    Zs SPACE_SEPARATOR
);
|
339 |
|
# read ReadMe.txt
#
# Every line of the UCD ReadMe (without its line ending) becomes the start of
# @versionInfo, which later blocks extend with the header lines of each data
# file.  Uses three-argument open with a lexical handle so the path taken
# from the command line is never interpreted as an open mode, and reports the
# system error when the file cannot be opened.
my @versionInfo;
{
    my $readmePath = "$ARGV[1]/ReadMe.txt";
    open(my $readmeFh, '<', $readmePath)
        or die "can't open Unicode ReadMe.txt file: $!\n";
    while (my $line = <$readmeFh>) {
        chomp $line;
        push @versionInfo, $line;
    }
    close $readmeFh;
}
|
348 |
|
# Flag bits stored in the top of each @casemap entry to say which kind of
# case mapping the entry holds, and the mask covering the low bits that hold
# the mapping itself (stored as code point XOR mapped code point).
my ($kTitleToUpper, $kUpperToLower, $kLowerToTitle, $kLowerToUpper) =
    (0x80000000, 0x40000000, 0x20000000, 0x10000000);
my $kCaseMapCharMask = 0x001fffff;
|
354 |
|
# read UnicodeData.txt
#
# Each record is one semicolon-separated line.  A large range is given as a
# pair of records whose names contain "First" and "Last"; every code point in
# such a range receives the category, combining class, bidi category and
# numeric value of the "Last" record.  Ordinary records additionally provide
# case mappings and the narrow/wide decomposition used for @fullWidth.
#
# Changes from the earlier version of this block: a lexical handle with
# three-argument open, an explicit end-of-file check after a "First" record,
# and scalar element assignment to $hanVariant[...] (the old code assigned
# through a one-element array slice).
open(my $unicodeDataFh, '<', "$ARGV[1]/UnicodeData.txt")
    or die "can't open UCD file UnicodeData.txt\n";
while (my $record = <$unicodeDataFh>) {
    chomp $record;
    my @fields = split /;/, $record;
    if ($fields[1] =~ /First/) {
        my $first = hex "0x$fields[0]";
        # the very next record must be the matching "Last" entry
        my $lastRecord = <$unicodeDataFh>;
        die "didn't find Last code for range!\n" unless defined $lastRecord;
        chomp $lastRecord;
        @fields = split /;/, $lastRecord;
        die "didn't find Last code for range!\n" unless $fields[1] =~ /Last/;
        my $last = hex "0x$fields[0]";
        for my $usv ($first .. $last) {
            $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
            $combining[$usv] = $fields[3];
            $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
            unless (length($fields[7]) == 0) {
                $numericvalue[$usv] = $fields[7];
            }
            if ($fields[1] =~ /CJK/) {
                $hanVariant[$usv] = 3;
            }
        }
    } else {
        my $usv = hex "0x$fields[0]";
        $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
        $combining[$usv] = $fields[3];
        # missing mapping fields come out as 0, i.e. "no mapping"
        my $upper = hex $fields[12];
        my $lower = hex $fields[13];
        my $title = hex $fields[14];
        # we only store one mapping for each character,
        # but also record what kind of mapping it is
        if ($upper && $lower) {
            $casemap[$usv] |= $kTitleToUpper;
            $casemap[$usv] |= ($usv ^ $upper);
        }
        elsif ($lower) {
            $casemap[$usv] |= $kUpperToLower;
            $casemap[$usv] |= ($usv ^ $lower);
        }
        elsif ($title && ($title != $upper)) {
            $casemap[$usv] |= $kLowerToTitle;
            $casemap[$usv] |= ($usv ^ $title);
        }
        elsif ($upper) {
            $casemap[$usv] |= $kLowerToUpper;
            $casemap[$usv] |= ($usv ^ $upper);
        }
        $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
        unless (length($fields[7]) == 0) {
            $numericvalue[$usv] = $fields[7];
        }
        if ($fields[1] =~ /CJK/) {
            $hanVariant[$usv] = 3;
        }
        if ($fields[5] =~ /^<narrow>/) {
            # "<narrow> XXXX": this character is the narrow form of XXXX
            my $wideChar = hex(substr($fields[5], 9));
            die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
            $fullWidth[$usv] = $wideChar;
        }
        elsif ($fields[5] =~ /^<wide>/) {
            # "<wide> XXXX": this character is the wide form of XXXX
            my $narrowChar = hex(substr($fields[5], 7));
            die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
            $fullWidth[$narrowChar] = $usv;
        }
    }
}
close $unicodeDataFh;
|
426 |
|
# read Scripts.txt
#
# The header lines up to and including the one containing "Date:" are copied
# into @versionInfo.  After that, every "XXXX ; Name" or "XXXX..YYYY ; Name"
# line assigns the script code for Name to each code point in the range.  A
# name missing from %scriptCode produces a warning.
# Opened with three-argument open on a lexical handle.
open(my $scriptsFh, '<', "$ARGV[1]/Scripts.txt")
    or die "can't open UCD file Scripts.txt\n";
push @versionInfo, "";
while (my $line = <$scriptsFh>) {
    chomp $line;
    push @versionInfo, $line;
    last if $line =~ /Date:/;
}
while (my $line = <$scriptsFh>) {
    if ($line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
        my ($startHex, $endHex, $scriptName) = ($1, $2, uc($3));
        warn "unknown script $scriptName" unless exists $scriptCode{$scriptName};
        my $code = $scriptCode{$scriptName};
        my $start = hex "0x$startHex";
        # a single code point is a range of length one
        my $end = (defined $endHex) ? hex "0x$endHex" : $start;
        for my $usv ($start .. $end) {
            $script[$usv] = $code;
        }
    }
}
close $scriptsFh;
|
448 |
|
# read EastAsianWidth.txt
#
# Header lines through the "Date:" line go into @versionInfo; after that each
# "XXXX;W" or "XXXX..YYYY;W" line (comments stripped) sets @eaw for the range
# to the numeric code of the width class.  An unrecognised class produces a
# warning.  Opened with three-argument open on a lexical handle.
my %eawCode = (
    'A'  => 0, # ; Ambiguous
    'F'  => 1, # ; Fullwidth
    'H'  => 2, # ; Halfwidth
    'N'  => 3, # ; Neutral
    'NA' => 4, # ; Narrow
    'W'  => 5  # ; Wide
);
open(my $eawFh, '<', "$ARGV[1]/EastAsianWidth.txt")
    or die "can't open UCD file EastAsianWidth.txt\n";
push @versionInfo, "";
while (my $line = <$eawFh>) {
    chomp $line;
    push @versionInfo, $line;
    last if $line =~ /Date:/;
}
while (my $line = <$eawFh>) {
    $line =~ s/#.*//;
    if ($line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
        my ($startHex, $endHex, $widthClass) = ($1, $2, uc($3));
        warn "unknown EAW code $widthClass" unless exists $eawCode{$widthClass};
        my $code = $eawCode{$widthClass};
        my $start = hex "0x$startHex";
        my $end = (defined $endHex) ? hex "0x$endHex" : $start;
        for my $usv ($start .. $end) {
            $eaw[$usv] = $code;
        }
    }
}
close $eawFh;
|
479 |
|
# read BidiMirroring.txt
#
# @offsets collects the distinct (mirror - character) code point differences;
# entry 0 is the offset 0.  @mirror stores, for each mirrored character, the
# index of its offset within @offsets.
#
# The "offset not yet known" test uses defined(): the previous numeric
# comparison against undef was really a comparison against 0, so it could
# not tell a missing entry from a legitimate index 0.
my @offsets = (0);

open(my $mirrorFh, '<', "$ARGV[1]/BidiMirroring.txt")
    or die "can't open UCD file BidiMirroring.txt\n";
push @versionInfo, "";
while (my $line = <$mirrorFh>) {
    chomp $line;
    push @versionInfo, $line;
    last if $line =~ /Date:/;
}
while (my $line = <$mirrorFh>) {
    $line =~ s/#.*//;
    if ($line =~ m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
        my $usv = hex("0x$1");
        my $mirrorOffset = hex("0x$2") - $usv;
        my $offsetIndex = first { $offsets[$_] == $mirrorOffset } 0 .. $#offsets;
        unless (defined $offsetIndex) {
            die "too many offset codes\n" if scalar @offsets == 31;
            push @offsets, $mirrorOffset;
            $offsetIndex = $#offsets;
        }
        $mirror[$usv] = $offsetIndex;
    }
}
close $mirrorFh;
|
505 |
|
# read HangulSyllableType.txt
#
# Header lines through the "Date:" line go into @versionInfo; each data line
# then sets @hangul for its code point or range to the bit pattern of the
# syllable type (LV and LVT are the unions of their component bits).  An
# unrecognised type produces a warning.
# Opened with three-argument open on a lexical handle.
my %hangulType = (
    'L'   => 0x01,
    'V'   => 0x02,
    'T'   => 0x04,
    'LV'  => 0x03,
    'LVT' => 0x07
);
open(my $hangulFh, '<', "$ARGV[1]/HangulSyllableType.txt")
    or die "can't open UCD file HangulSyllableType.txt\n";
push @versionInfo, "";
while (my $line = <$hangulFh>) {
    chomp $line;
    push @versionInfo, $line;
    last if $line =~ /Date:/;
}
while (my $line = <$hangulFh>) {
    $line =~ s/#.*//;
    if ($line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
        my ($startHex, $endHex, $typeName) = ($1, $2, uc($3));
        warn "unknown Hangul syllable type" unless exists $hangulType{$typeName};
        my $typeBits = $hangulType{$typeName};
        my $start = hex "0x$startHex";
        my $end = (defined $endHex) ? hex "0x$endHex" : $start;
        for my $usv ($start .. $end) {
            $hangul[$usv] = $typeBits;
        }
    }
}
close $hangulFh;
|
535 |
|
# read xidmodifications.txt
#
# Header lines through the "Generated:" line go into @versionInfo, except for
# a line carrying the UTF-8 byte-order mark.  Each data line of the form
# "XXXX[..YYYY] ; status ; type" then sets @xidmod for the range to the code
# of its type.  An unrecognised type produces a warning.
# Opened with three-argument open on a lexical handle.
open(my $xidmodFh, '<', "$ARGV[1]/security/xidmodifications.txt")
    or die "can't open UCD file xidmodifications.txt\n";
push @versionInfo, "";
while (my $line = <$xidmodFh>) {
    chomp $line;
    unless ($line =~ /\xef\xbb\xbf/) {
        push @versionInfo, $line;
    }
    last if $line =~ /Generated:/;
}
while (my $line = <$xidmodFh>) {
    if ($line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
        my ($startHex, $endHex, $typeName) = ($1, $2, $3);
        warn "unknown Identifier Modification $typeName" unless exists $xidmodCode{$typeName};
        my $code = $xidmodCode{$typeName};
        my $start = hex "0x$startHex";
        my $end = (defined $endHex) ? hex "0x$endHex" : $start;
        for my $usv ($start .. $end) {
            $xidmod[$usv] = $code;
        }
    }
}
close $xidmodFh;
# special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
$xidmod[0x30FB] = 1;
|
561 |
|
# read Unihan_Variants.txt
#
# Header lines through the "Date:" line go into @versionInfo.  The data lines
# for one character are consecutive; for each character we note whether it
# has a kTraditionalVariant and/or a kSimplifiedVariant entry, and once all
# of its lines have been seen we set $hanVariant[usv] to 1 (Traditional
# only) or 2 (Simplified only).  Characters with both or neither keep their
# existing value.
#
# The result for a character is recorded when the next character starts, so
# the final character in the file must be flushed explicitly after the loop;
# previously it was silently dropped.
open(my $unihanFh, '<', "$ARGV[1]/Unihan_Variants.txt")
    or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
push @versionInfo, "";
while (my $line = <$unihanFh>) {
    chomp $line;
    push @versionInfo, $line;
    last if $line =~ /Date:/;
}
my $savedusv = 0;   # character whose lines are being accumulated (0 = none yet)
my $hasTC = 0;
my $hasSC = 0;
# store the accumulated result for $savedusv, if there is one
my $recordHanVariant = sub {
    return if $savedusv == 0;
    if ($hasTC && !$hasSC) {
        $hanVariant[$savedusv] = 1;
    } elsif (!$hasTC && $hasSC) {
        $hanVariant[$savedusv] = 2;
    }
};
while (my $line = <$unihanFh>) {
    chomp $line;
    if ($line =~ m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
        my ($usv, $variantKind) = (hex("0x$1"), $2);
        if ($usv != $savedusv) {
            # a new character begins: finish the previous one
            $recordHanVariant->();
            $savedusv = $usv;
            $hasTC = 0;
            $hasSC = 0;
        }
        if ($variantKind eq "Traditional") {
            $hasTC = 1;
        }
        if ($variantKind eq "Simplified") {
            $hasSC = 1;
        }
    }
}
# finish the last character in the file
$recordHanVariant->();
close $unihanFh;
|
597 |
|
# Open the two output files and write their preambles.  Both start with the
# same licence/provenance banner; the data file then pulls in its includes
# and the header opens its include guard.
my $timestamp = gmtime();

open(DATA_TABLES, '>', "nsUnicodePropertyData.cpp")
    or die "unable to open nsUnicodePropertyData.cpp for output";

my $licenseBlock = <<'__LICENSE';

/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
* Derived from the Unicode Character Database by genUnicodePropertyData.pl
*
* For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
*/
__LICENSE

my $versionInfo = join("\n", @versionInfo);

# banner shared by both generated files
my $generatedBanner = <<__END;
$licenseBlock
/*
* Created on $timestamp from UCD data files with version info:
*

$versionInfo

*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
*/

__END

print DATA_TABLES $generatedBanner,
                  "#include <stdint.h>\n",
                  "#include \"harfbuzz/hb.h\"\n",
                  "\n";

open(HEADER, '>', "nsUnicodeScriptCodes.h")
    or die "unable to open nsUnicodeScriptCodes.h for output";

print HEADER $generatedBanner,
             "#ifndef NS_UNICODE_SCRIPT_CODES\n",
             "#define NS_UNICODE_SCRIPT_CODES\n",
             "\n";
|
652 |
|
# Emit the script-code -> HarfBuzz tag table: one HB_TAG(...) initializer per
# script code, comma-separated.
print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
if (@scriptCodeToTag) {
    my @tagLines = map { sprintf(" HB_TAG(%s)", $_) } @scriptCodeToTag;
    print DATA_TABLES join(",\n", @tagLines), "\n";
}
print DATA_TABLES "};\n\n";

# running total of generated table bytes, added to by genTables
our $totalData = 0;

# Emit the table of distinct mirroring offsets collected from
# BidiMirroring.txt.
print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
if (@offsets) {
    print DATA_TABLES join(",\n", map { " $_" } @offsets), "\n";
}
print DATA_TABLES "};\n\n";

print HEADER "#pragma pack(1)\n\n";
|
670 |
|
# Format the nsCharProps1 initializer for one code point: mirror-offset
# index, Hangul syllable type and combining class, in struct field order.
sub sprintCharProps1
{
    my ($usv) = @_;
    my @fieldValues = ($mirror[$usv], $hangul[$usv], $combining[$usv]);
    return sprintf("{%d,%d,%d}, ", @fieldValues);
}
genTables("CharProp1",
          join("\n",
               "struct nsCharProps1 {",
               " unsigned char mMirrorOffsetIndex:5;",
               " unsigned char mHangulType:3;",
               " unsigned char mCombiningClass:8;",
               "};"),
          "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
|
678 |
|
# Format the nsCharProps2 initializer for one code point: script code, East
# Asian width, general category, bidi category, identifier-modification
# type and numeric value, in struct field order.
sub sprintCharProps2
{
    my ($usv) = @_;
    my @fieldValues = (
        $script[$usv],
        $eaw[$usv],
        $category[$usv],
        $bidicategory[$usv],
        $xidmod[$usv],
        $numericvalue[$usv],
    );
    return sprintf("{%d,%d,%d,%d,%d,%d},", @fieldValues);
}
genTables("CharProp2",
          join("\n",
               "struct nsCharProps2 {",
               " unsigned char mScriptCode:8;",
               " unsigned char mEAW:3;",
               " unsigned char mCategory:5;",
               " unsigned char mBidiCategory:5;",
               " unsigned char mXidmod:4;",
               " signed char mNumericValue:5;",
               " unsigned char mHanVariant:2;",
               "};"),
          "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);

print HEADER "#pragma pack()\n\n";
|
690 |
|
# Pack the 2-bit Han variant values of four consecutive code points, starting
# at the given one, into a single byte (lowest code point in the lowest two
# bits) and format it as a hex literal.
sub sprintHanVariants
{
    my ($baseUsv) = @_;
    my $packed = 0;
    for my $slot (0 .. 3) {
        $packed |= $hanVariant[$baseUsv + $slot] << (2 * $slot);
    }
    return sprintf("0x%02x,", $packed);
}
genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
|
703 |
|
# Format the @fullWidth entry for one code point as a 16-bit hex literal.
sub sprintFullWidth
{
    my ($usv) = @_;
    my $wideForm = $fullWidth[$usv];
    return sprintf("0x%04x,", $wideForm);
}
genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
|
710 |
|
# Format the @casemap entry for one code point as a 32-bit hex literal.
sub sprintCasemap
{
    my ($usv) = @_;
    my $mapping = $casemap[$usv];
    return sprintf("0x%08x,", $mapping);
}
genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);

print STDERR "Total data = $totalData\n";

# Publish the case-map flag bits and mask so the C++ side agrees with the
# values used to build the table.
for my $flag ([kTitleToUpper => $kTitleToUpper],
              [kUpperToLower => $kUpperToLower],
              [kLowerToTitle => $kLowerToTitle],
              [kLowerToUpper => $kLowerToUpper]) {
    my ($flagName, $flagValue) = @$flag;
    printf DATA_TABLES "const uint32_t %s = 0x%08x;\n", $flagName, $flagValue;
}
printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
|
725 |
|
# Generate one multi-level lookup table and write it to DATA_TABLES (and its
# typedef, if any, to HEADER).
#
# The code space is split into planes of 0x10000 code points; each plane is
# split into (1 << $indexBits) pages of (1 << $charBits) code points.
# Identical pages are stored once in s<prefix>Values, identical per-plane
# page maps are stored once in s<prefix>Pages, and (when $maxPlane > 0)
# s<prefix>Planes maps planes 1..$maxPlane to their page map.
#
#   $prefix        - name fragment used in the generated identifiers
#   $typedef       - C declaration to write to HEADER ('' for none)
#   $type          - C element type of the values table
#   $indexBits     - log2 of the number of pages per plane
#   $charBits      - log2 of the number of code points per page
#   $func          - code ref: given a code point, returns the initializer
#                    text for one table entry (its last character is dropped
#                    from the end of each page)
#   $maxPlane      - highest plane to generate
#   $bytesPerEntry - size of one entry, used only for the size statistics
#   $charsPerEntry - number of consecutive code points covered by one entry
#
# Adds the table's size to $totalData and reports it on STDERR.
sub genTables
{
    my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;

    print DATA_TABLES "#define k${prefix}MaxPlane $maxPlane\n";
    print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
    print DATA_TABLES "#define k${prefix}CharBits $charBits\n";

    my $indexLen = 1 << $indexBits;
    my $charsPerPage = 1 << $charBits;
    my %charIndex = ();     # page text -> index in @char
    my %pageMapIndex = ();  # packed page map -> index in @pageMap
    my @pageMap = ();
    my @char = ();

    my $planeMap = "\x00" x $maxPlane;
    foreach my $plane (0 .. $maxPlane) {
        # Two bytes per page.  The parentheses matter: "x" and "*" share a
        # precedence level and associate to the left, so without them the
        # repeated string would be multiplied by 2 and collapse to 0.
        my $pageMap = "\x00" x ($indexLen * 2);
        foreach my $page (0 .. $indexLen - 1) {
            my $charValues = "";
            for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
                my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
                $charValues .= $func->($usv);
            }
            # drop the final separator character emitted by $func
            chop $charValues;

            unless (exists $charIndex{$charValues}) {
                $charIndex{$charValues} = scalar keys %charIndex;
                $char[$charIndex{$charValues}] = $charValues;
            }
            substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
        }

        unless (exists $pageMapIndex{$pageMap}) {
            $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
            $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
        }
        # plane 0 has no entry in the plane map
        if ($plane > 0) {
            substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
        }
    }

    if ($maxPlane) {
        print DATA_TABLES "static const uint8_t s${prefix}Planes[$maxPlane] = {";
        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
        print DATA_TABLES "};\n\n";
    }

    my $chCount = scalar @char;
    # page indices need 16 bits once there are more than 255 distinct pages
    my $pmBits = $chCount > 255 ? 16 : 8;
    my $pmCount = scalar @pageMap;
    if ($maxPlane == 0) {
        die "there should only be one pageMap entry!" if $pmCount > 1;
        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
    } else {
        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
    }
    for (my $i = 0; $i < scalar @pageMap; ++$i) {
        print DATA_TABLES $maxPlane > 0 ? " {" : " ";
        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
        print DATA_TABLES $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
    }
    print DATA_TABLES "};\n\n";

    print HEADER "$typedef\n\n" if $typedef ne '';

    my $pageLen = $charsPerPage / $charsPerEntry;
    print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
    for (my $i = 0; $i < scalar @char; ++$i) {
        print DATA_TABLES " {";
        print DATA_TABLES $char[$i];
        print DATA_TABLES $i < $#char ? "},\n" : "}\n";
    }
    print DATA_TABLES "};\n\n";

    my $dataSize = $pmCount * $indexLen * $pmBits/8 +
                   $chCount * $pageLen * $bytesPerEntry +
                   $maxPlane;
    $totalData += $dataSize;

    print STDERR "Data for $prefix = $dataSize\n";
}
|
808 |
|
# Finish the data file with the "do not edit" notice.
print DATA_TABLES "/*\n",
                  "* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *\n",
                  "*/\n";

close DATA_TABLES;

# Write the MOZ_SCRIPT_* enumeration: one enumerator per script code, then
# the count and the invalid sentinel.
print HEADER "enum {\n";
for my $code (0 .. $#scriptCodeToName) {
    printf HEADER " MOZ_SCRIPT_%s = %d,\n", $scriptCodeToName[$code], $code;
}
printf HEADER "\n MOZ_NUM_SCRIPT_CODES = %d,\n", scalar(@scriptCodeToName);
print HEADER "\n MOZ_SCRIPT_INVALID = -1\n";
print HEADER "};\n\n";

# Close the include guard and finish with the "do not edit" notice.
print HEADER "#endif\n",
             "/*\n",
             "* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *\n",
             "*/\n";

close HEADER;
|
833 |