intl/unicharutil/tools/genUnicodePropertyData.pl

branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
equal deleted inserted replaced
-1:000000000000 0:9e758d5e69e3
1 #!/usr/bin/env perl
2
3 # This Source Code Form is subject to the terms of the Mozilla Public
4 # License, v. 2.0. If a copy of the MPL was not distributed with this
5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
6
7 # This tool is used to prepare lookup tables of Unicode character properties
8 # needed by gfx code to support text shaping operations. The properties are
9 # read from the Unicode Character Database and compiled into multi-level arrays
10 # for efficient lookup.
11 #
12 # To regenerate the tables in nsUnicodePropertyData.cpp:
13 #
14 # (1) Download the current Unicode data files from
15 #
16 # http://www.unicode.org/Public/UNIDATA/
17 #
18 # NB: not all the files are actually needed; currently, we require
19 # - UnicodeData.txt
20 # - Scripts.txt
21 # - EastAsianWidth.txt
22 # - BidiMirroring.txt
23 # - HangulSyllableType.txt
24 # - ReadMe.txt (to record version/date of the UCD)
25 # - Unihan_Variants.txt (from Unihan.zip)
26 # though this may change if we find a need for additional properties.
27 #
28 # The Unicode data files listed above should be together in one directory.
29 # We also require the file
30 # http://www.unicode.org/Public/security/latest/xidmodifications.txt
31 # This file should be in a sub-directory "security" immediately below the
32 # directory containing the other Unicode data files.
33 #
34 # (2) Run this tool using a command line of the form
35 #
36 # perl genUnicodePropertyData.pl \
37 # /path/to/harfbuzz/src \
38 # /path/to/UCD-directory
39 #
40 # This will generate (or overwrite!) the files
41 #
42 # nsUnicodePropertyData.cpp
43 # nsUnicodeScriptCodes.h
44 #
45 # in the current directory.
46
47 use strict;
48 use List::Util qw(first);
49
# Exactly two command-line arguments are expected (the HarfBuzz source
# directory and the UCD directory); with any other count, print the usage
# summary and stop without generating anything.
unless (scalar(@ARGV) == 2) {
    print <<"__EOT";
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \
# /path/to/harfbuzz/src \
# /path/to/UCD-directory
#
# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/
#
# This will generate (or overwrite!) the files
#
# nsUnicodePropertyData.cpp
# nsUnicodeScriptCodes.h
#
# in the current directory.
__EOT
    exit 0;
}
72
73 # load HB_Script and HB_Category constants
74
75 # NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
76 # script codes as used by Glib/Pango/etc.
77 # We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
78 # compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.
79
80 # CHECK that this matches Pango source (as found for example at
81 # http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
82 # for as many codes as that defines (currently up through Unicode 5.1)
83 # and the GLib enumeration
84 # http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
85 # (currently defined up through Unicode 6.0).
86 # Constants beyond these may be regarded as unstable for now, but we don't actually
87 # depend on the specific values.
# Script name => MOZ_SCRIPT_* code. INVALID is -1; every other script gets
# its position in the ordered list below, so the list order IS the code
# assignment and new scripts must only ever be appended.
my %scriptCode = (INVALID => -1);
{
    my @orderedScriptNames = (
        # codes 0..45
        qw(COMMON INHERITED ARABIC ARMENIAN BENGALI BOPOMOFO CHEROKEE COPTIC
           CYRILLIC DESERET DEVANAGARI ETHIOPIC GEORGIAN GOTHIC GREEK GUJARATI
           GURMUKHI HAN HANGUL HEBREW HIRAGANA KANNADA KATAKANA KHMER LAO LATIN
           MALAYALAM MONGOLIAN MYANMAR OGHAM OLD_ITALIC ORIYA RUNIC SINHALA
           SYRIAC TAMIL TELUGU THAANA THAI TIBETAN CANADIAN_ABORIGINAL YI
           TAGALOG HANUNOO BUHID TAGBANWA),
        # unicode 4.0 additions (codes 46..53)
        qw(BRAILLE CYPRIOT LIMBU OSMANYA SHAVIAN LINEAR_B TAI_LE UGARITIC),
        # unicode 4.1 additions (codes 54..60)
        qw(NEW_TAI_LUE BUGINESE GLAGOLITIC TIFINAGH SYLOTI_NAGRI OLD_PERSIAN
           KHAROSHTHI),
        # unicode 5.0 additions (codes 61..66)
        qw(UNKNOWN BALINESE CUNEIFORM PHOENICIAN PHAGS_PA NKO),
        # unicode 5.1 additions (codes 67..77)
        qw(KAYAH_LI LEPCHA REJANG SUNDANESE SAURASHTRA CHAM OL_CHIKI VAI CARIAN
           LYCIAN LYDIAN),
        # unicode 5.2 additions (codes 78..92)
        qw(AVESTAN BAMUM EGYPTIAN_HIEROGLYPHS IMPERIAL_ARAMAIC
           INSCRIPTIONAL_PAHLAVI INSCRIPTIONAL_PARTHIAN JAVANESE KAITHI LISU
           MEETEI_MAYEK OLD_SOUTH_ARABIAN OLD_TURKIC SAMARITAN TAI_THAM
           TAI_VIET),
        # unicode 6.0 additions (codes 93..95)
        qw(BATAK BRAHMI MANDAIC),
        # unicode 6.1 additions (codes 96..102)
        qw(CHAKMA MEROITIC_CURSIVE MEROITIC_HIEROGLYPHS MIAO SHARADA
           SORA_SOMPENG TAKRI),
    );
    @scriptCode{@orderedScriptNames} = (0 .. $#orderedScriptNames);
}
201
# State filled in by readHarfBuzzHeader:
#   $sc - script code of the most recently recognised HB_SCRIPT_* line
#         (stays -1 if none was ever recognised)
#   $cc - running count of HB_UNICODE_GENERAL_CATEGORY_* matches, starting
#         from 0 for the first match (stays -1 if none was found)
my $sc = -1;
my $cc = -1;
my %catCode;             # category name => value of $cc when it was matched
my @scriptCodeToTag;     # script code => the "'a','b','c','d'" HB_TAG argument text
my @scriptCodeToName;    # script code => script name as spelled in HB_SCRIPT_*

# Scan one header file in the HarfBuzz source directory ($ARGV[0]).
#
# Every line of the form  HB_SCRIPT_<NAME> = HB_TAG('w','x','y','z'),  whose
# <NAME> is a key of %scriptCode records its tag and name under that script
# code; an unrecognised <NAME> produces a warning and the rest of that line is
# skipped. Every line mentioning HB_UNICODE_GENERAL_CATEGORY_<NAME> bumps $cc
# and maps <NAME> to the new count (a name that appears again is re-mapped).
#
# Parameter: the header's file name, relative to $ARGV[0].
# Dies if the file cannot be opened.
sub readHarfBuzzHeader
{
    my $file = shift;
    my $path = "$ARGV[0]/$file";
    # Three-argument open so the caller-supplied directory can never be
    # interpreted as an open mode or a pipe.
    open(my $header, '<', $path) or die "can't open harfbuzz header $path\n";
    while (<$header>) {
        if (m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
            my ($name, $tag) = ($1, $2);
            unless (exists $scriptCode{$name}) {
                warn "unknown script name $name found in $file\n";
                next;
            }
            $sc = $scriptCode{$name};
            $scriptCodeToTag[$sc] = $tag;
            $scriptCodeToName[$sc] = $name;
        }
        if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
            $cc++;
            $catCode{$1} = $cc;
        }
    }
    close $header;
}

readHarfBuzzHeader("hb-common.h");
readHarfBuzzHeader("hb-unicode.h");

die "didn't find HarfBuzz script codes\n" if $sc == -1;
die "didn't find HarfBuzz category codes\n" if $cc == -1;
235
# Identifier-modification type name => numeric code. Codes are the positions
# in this list (0..9), so new names must only be appended.
my %xidmodCode;
{
    my @xidmodNames = qw(
        inclusion
        recommended
        default-ignorable
        historic
        limited-use
        not-NFKC
        not-xid
        obsolete
        technical
        not-chars
    );
    @xidmodCode{@xidmodNames} = (0 .. $#xidmodNames);
}
248
# Bidi class abbreviation (as it appears in UnicodeData.txt field 4) => code.
my %bidicategoryCode = (
    'L'   => '0',     # Left-to-Right
    'R'   => '1',     # Right-to-Left
    'EN'  => '2',     # European Number
    'ES'  => '3',     # European Number Separator
    'ET'  => '4',     # European Number Terminator
    'AN'  => '5',     # Arabic Number
    'CS'  => '6',     # Common Number Separator
    'B'   => '7',     # Paragraph Separator
    'S'   => '8',     # Segment Separator
    'WS'  => '9',     # Whitespace
    'ON'  => '10',    # Other Neutrals
    'LRE' => '11',    # Left-to-Right Embedding
    'LRO' => '12',    # Left-to-Right Override
    'AL'  => '13',    # Right-to-Left Arabic
    'RLE' => '14',    # Right-to-Left Embedding
    'RLO' => '15',    # Right-to-Left Override
    'PDF' => '16',    # Pop Directional Format
    'NSM' => '17',    # Non-Spacing Mark
    'BN'  => '18',    # Boundary Neutral
);
270
271 # initialize default properties
# Per-code-point property arrays, indexed by Unicode scalar value.
# These ones get an explicit default for every code point 0..0x10FFFF ...
my @script       = ($scriptCode{"UNKNOWN"})     x 0x110000;
my @category     = ($catCode{"UNASSIGNED"})     x 0x110000;
my @combining    = (0)                          x 0x110000;
my @casemap      = (0)                          x 0x110000;
my @xidmod       = ($xidmodCode{"not-chars"})   x 0x110000;
my @numericvalue = (-1)                         x 0x110000;
my @hanVariant   = (0)                          x 0x110000;
my @bidicategory = ($bidicategoryCode{"L"})     x 0x110000;
my @fullWidth    = (0)                          x 0x110000;
# ... while these start empty and are only filled for the code points that
# the corresponding data file mentions.
my @eaw;
my @mirror;
my @hangul;

# blocks where the default for bidi category is not L
$bidicategory[$_] = $bidicategoryCode{"AL"}
    for (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF);
$bidicategory[$_] = $bidicategoryCode{"R"}
    for (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF);
$bidicategory[$_] = $bidicategoryCode{"ET"}
    for (0x20A0..0x20CF);
306
# UCD two-letter General_Category abbreviation => suffix of the matching
# HB_UNICODE_GENERAL_CATEGORY_* name (used as a key into %catCode).
my %ucd2hb = (
    # C: other
    Cc => 'CONTROL',
    Cf => 'FORMAT',
    Cn => 'UNASSIGNED',
    Co => 'PRIVATE_USE',
    Cs => 'SURROGATE',
    # L: letters
    Ll => 'LOWERCASE_LETTER',
    Lm => 'MODIFIER_LETTER',
    Lo => 'OTHER_LETTER',
    Lt => 'TITLECASE_LETTER',
    Lu => 'UPPERCASE_LETTER',
    # M: marks
    Mc => 'SPACING_MARK',
    Me => 'ENCLOSING_MARK',
    Mn => 'NON_SPACING_MARK',
    # N: numbers
    Nd => 'DECIMAL_NUMBER',
    Nl => 'LETTER_NUMBER',
    No => 'OTHER_NUMBER',
    # P: punctuation
    Pc => 'CONNECT_PUNCTUATION',
    Pd => 'DASH_PUNCTUATION',
    Pe => 'CLOSE_PUNCTUATION',
    Pf => 'FINAL_PUNCTUATION',
    Pi => 'INITIAL_PUNCTUATION',
    Po => 'OTHER_PUNCTUATION',
    Ps => 'OPEN_PUNCTUATION',
    # S: symbols
    Sc => 'CURRENCY_SYMBOL',
    Sk => 'MODIFIER_SYMBOL',
    Sm => 'MATH_SYMBOL',
    So => 'OTHER_SYMBOL',
    # Z: separators
    Zl => 'LINE_SEPARATOR',
    Zp => 'PARAGRAPH_SEPARATOR',
    Zs => 'SPACE_SEPARATOR',
);
339
340 # read ReadMe.txt
# Lines of version information that end up in the banner comment of the
# generated files. It starts with the complete text of the UCD ReadMe.txt;
# later sections append the header lines of each data file they read.
my @versionInfo;
{
    # Three-argument open with a lexical handle: the directory comes from the
    # command line and must never be parsed as part of an open mode.
    open(my $readme, '<', "$ARGV[1]/ReadMe.txt") or die "can't open Unicode ReadMe.txt file\n";
    while (my $line = <$readme>) {
        chomp $line;
        push @versionInfo, $line;
    }
    close $readme;
}
348
# Layout of a $casemap[] entry: one of the four flag bits below says which
# kind of mapping is stored, and the bits under $kCaseMapCharMask hold the
# mapping itself (stored as the XOR of the character and its mapped form).
my ($kTitleToUpper, $kUpperToLower, $kLowerToTitle, $kLowerToUpper) =
    map { 1 << $_ } (31, 30, 29, 28);
my $kCaseMapCharMask = (1 << 21) - 1;
354
355 # read UnicodeData.txt
# Read UnicodeData.txt and fill in general category, combining class, bidi
# category, numeric value, case mapping, Han-variant default and
# fullwidth/halfwidth pairing for every listed code point.
#
# The file has two kinds of record: ordinary single-character lines, and
# pairs of "<..., First>" / "<..., Last>" lines that describe a whole range
# sharing one set of properties.
{
    open(my $ucd, '<', "$ARGV[1]/UnicodeData.txt") or die "can't open UCD file UnicodeData.txt\n";
    while (my $line = <$ucd>) {
        chomp $line;
        my @fields = split /;/, $line;
        if ($fields[1] =~ /First/) {
            my $first = hex $fields[0];
            # The matching "Last" record must be the very next line; treat
            # end-of-file here the same as a missing "Last" record.
            my $lastLine = <$ucd>;
            die "didn't find Last code for range!\n" unless defined $lastLine;
            chomp $lastLine;
            @fields = split /;/, $lastLine;
            die "didn't find Last code for range!\n" unless $fields[1] =~ /Last/;
            my $last = hex $fields[0];
            my $isCJK = ($fields[1] =~ /CJK/);
            for my $usv ($first .. $last) {
                $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
                $combining[$usv] = $fields[3];
                $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
                unless (length($fields[7]) == 0) {
                    $numericvalue[$usv] = $fields[7];
                }
                # scalar element, not the one-element slice @hanVariant[...]
                $hanVariant[$usv] = 3 if $isCJK;
            }
        } else {
            my $usv = hex $fields[0];
            $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
            $combining[$usv] = $fields[3];
            my $upper = hex $fields[12];
            my $lower = hex $fields[13];
            my $title = hex $fields[14];
            # we only store one mapping for each character,
            # but also record what kind of mapping it is
            if ($upper && $lower) {
                $casemap[$usv] |= $kTitleToUpper;
                $casemap[$usv] |= ($usv ^ $upper);
            }
            elsif ($lower) {
                $casemap[$usv] |= $kUpperToLower;
                $casemap[$usv] |= ($usv ^ $lower);
            }
            elsif ($title && ($title != $upper)) {
                $casemap[$usv] |= $kLowerToTitle;
                $casemap[$usv] |= ($usv ^ $title);
            }
            elsif ($upper) {
                $casemap[$usv] |= $kLowerToUpper;
                $casemap[$usv] |= ($usv ^ $upper);
            }
            $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
            unless (length($fields[7]) == 0) {
                $numericvalue[$usv] = $fields[7];
            }
            if ($fields[1] =~ /CJK/) {
                $hanVariant[$usv] = 3;
            }
            # Decomposition field: "<narrow> XXXX" on a halfwidth character
            # names its wide counterpart; "<wide> XXXX" on a fullwidth
            # character names its narrow counterpart. Either way
            # $fullWidth[] is indexed by the narrow form and holds the wide one.
            if ($fields[5] =~ /^<narrow>/) {
                my $wideChar = hex(substr($fields[5], 9));
                die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
                $fullWidth[$usv] = $wideChar;
            }
            elsif ($fields[5] =~ /^<wide>/) {
                my $narrowChar = hex(substr($fields[5], 7));
                die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
                $fullWidth[$narrowChar] = $usv;
            }
        }
    }
    close $ucd;
}
426
427 # read Scripts.txt
# Read Scripts.txt: copy its header (up to and including the "Date:" line)
# into @versionInfo, then record the script code of every listed code point
# or code point range in @script.
{
    open(my $scripts, '<', "$ARGV[1]/Scripts.txt") or die "can't open UCD file Scripts.txt\n";
    push @versionInfo, "";
    while (<$scripts>) {
        chomp;
        push @versionInfo, $_;
        last if /Date:/;
    }
    while (<$scripts>) {
        if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
            my ($startHex, $endHex, $name) = ($1, $2, uc($3));
            # An unknown name is only warned about; its code points are then
            # assigned an undefined script code.
            warn "unknown script $name" unless exists $scriptCode{$name};
            my $code = $scriptCode{$name};
            my $start = hex $startHex;
            my $end = (defined $endHex) ? hex $endHex : $start;
            for my $i ($start .. $end) {
                $script[$i] = $code;
            }
        }
    }
    close $scripts;
}
448
449 # read EastAsianWidth.txt
# East Asian Width property value => numeric code.
my %eawCode = (
    'A' => 0, # ; Ambiguous
    'F' => 1, # ; Fullwidth
    'H' => 2, # ; Halfwidth
    'N' => 3, # ; Neutral
    'NA'=> 4, # ; Narrow
    'W' => 5  # ; Wide
);
# Read EastAsianWidth.txt: copy its header (through the "Date:" line) into
# @versionInfo, then record the width code of every listed code point or
# range in @eaw. Code points the file does not list stay undefined.
{
    open(my $eawFile, '<', "$ARGV[1]/EastAsianWidth.txt") or die "can't open UCD file EastAsianWidth.txt\n";
    push @versionInfo, "";
    while (<$eawFile>) {
        chomp;
        push @versionInfo, $_;
        last if /Date:/;
    }
    while (<$eawFile>) {
        s/#.*//;    # drop trailing comments before parsing
        if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
            my ($startHex, $endHex, $width) = ($1, $2, uc($3));
            warn "unknown EAW code $width" unless exists $eawCode{$width};
            my $code = $eawCode{$width};
            my $start = hex $startHex;
            my $end = (defined $endHex) ? hex $endHex : $start;
            for my $i ($start .. $end) {
                $eaw[$i] = $code;
            }
        }
    }
    close $eawFile;
}
479
480 # read BidiMirroring.txt
# Table of distinct mirroring offsets (mirrored code point minus original).
# Entry 0 is the offset 0; $mirror[] stores an index into this table.
my @offsets = (0);

# Read BidiMirroring.txt: copy its header (through the "Date:" line) into
# @versionInfo, then for each "XXXX; YYYY" pair store in $mirror[XXXX] the
# index of the offset YYYY - XXXX, adding the offset to @offsets on first use.
{
    open(my $mirroring, '<', "$ARGV[1]/BidiMirroring.txt") or die "can't open UCD file BidiMirroring.txt\n";
    push @versionInfo, "";
    while (<$mirroring>) {
        chomp;
        push @versionInfo, $_;
        last if /Date:/;
    }
    while (<$mirroring>) {
        s/#.*//;
        if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
            my $ch = hex $1;
            my $mirrorOffset = hex($2) - $ch;
            my $offsetIndex = first { $offsets[$_] == $mirrorOffset } 0..$#offsets;
            # Must test definedness: a numeric comparison against undef is
            # also true for a legitimately found index 0.
            unless (defined $offsetIndex) {
                die "too many offset codes\n" if scalar @offsets == 31;
                push @offsets, $mirrorOffset;
                $offsetIndex = $#offsets;
            }
            $mirror[$ch] = $offsetIndex;
        }
    }
    close $mirroring;
}
505
506 # read HangulSyllableType.txt
# Hangul syllable type => bit mask (L, V and T are single bits; LV and LVT
# are the corresponding combinations).
my %hangulType = (
    'L' => 0x01,
    'V' => 0x02,
    'T' => 0x04,
    'LV' => 0x03,
    'LVT' => 0x07
);
# Read HangulSyllableType.txt: copy its header (through the "Date:" line)
# into @versionInfo, then record the type mask of every listed code point or
# range in @hangul. Code points the file does not list stay undefined.
{
    open(my $hangulFile, '<', "$ARGV[1]/HangulSyllableType.txt") or die "can't open UCD file HangulSyllableType.txt\n";
    push @versionInfo, "";
    while (<$hangulFile>) {
        chomp;
        push @versionInfo, $_;
        last if /Date:/;
    }
    while (<$hangulFile>) {
        s/#.*//;
        if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
            my ($startHex, $endHex, $type) = ($1, $2, uc($3));
            warn "unknown Hangul syllable type" unless exists $hangulType{$type};
            my $mask = $hangulType{$type};
            my $start = hex $startHex;
            my $end = (defined $endHex) ? hex $endHex : $start;
            for my $i ($start .. $end) {
                $hangul[$i] = $mask;
            }
        }
    }
    close $hangulFile;
}
535
536 # read xidmodifications.txt
# Read security/xidmodifications.txt: copy its header (through the
# "Generated:" line, leaving out any line that contains a UTF-8 byte order
# mark) into @versionInfo, then record the identifier-modification code of
# every listed code point or range in @xidmod.
{
    open(my $xidFile, '<', "$ARGV[1]/security/xidmodifications.txt") or die "can't open UCD file xidmodifications.txt\n";
    push @versionInfo, "";
    while (<$xidFile>) {
        chomp;
        unless (/\xef\xbb\xbf/) {
            push @versionInfo, $_;
        }
        last if /Generated:/;
    }
    while (<$xidFile>) {
        # code point(s) ; <ignored field> ; modification type
        if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
            my ($startHex, $endHex, $type) = ($1, $2, $3);
            warn "unknown Identifier Modification $type" unless exists $xidmodCode{$type};
            my $code = $xidmodCode{$type};
            my $start = hex $startHex;
            my $end = (defined $endHex) ? hex $endHex : $start;
            for my $i ($start .. $end) {
                $xidmod[$i] = $code;
            }
        }
    }
    close $xidFile;
}
# special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
$xidmod[0x30FB] = 1;
561
# Read Unihan_Variants.txt: copy its header (through the "Date:" line) into
# @versionInfo, then classify each character by which k...Variant fields it
# has:
#   $hanVariant[usv] = 1  if it has a kTraditionalVariant but no kSimplifiedVariant
#   $hanVariant[usv] = 2  if it has a kSimplifiedVariant but no kTraditionalVariant
# Characters with both or neither keep whatever value they already had.
# All lines for one character are expected to be adjacent in the file.
{
    open(my $unihan, '<', "$ARGV[1]/Unihan_Variants.txt") or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
    push @versionInfo, "";
    while (<$unihan>) {
        chomp;
        push @versionInfo, $_;
        last if /Date:/;
    }
    my $savedusv = 0;
    my $hasTC = 0;
    my $hasSC = 0;
    # Commit the flags collected for the character currently being tracked.
    my $flushSaved = sub {
        return if $savedusv == 0;
        if ($hasTC && !$hasSC) {
            $hanVariant[$savedusv] = 1;
        } elsif (!$hasTC && $hasSC) {
            $hanVariant[$savedusv] = 2;
        }
    };
    while (<$unihan>) {
        chomp;
        if (m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
            my ($usv, $kind) = (hex $1, $2);
            if ($usv != $savedusv) {
                $flushSaved->();
                $savedusv = $usv;
                $hasTC = 0;
                $hasSC = 0;
            }
            if ($kind eq "Traditional") {
                $hasTC = 1;
            }
            if ($kind eq "Simplified") {
                $hasSC = 1;
            }
        }
    }
    # The final character in the file has no following line to trigger the
    # flush above, so commit it explicitly.
    $flushSaved->();
    close $unihan;
}
597
# Time of generation, recorded in the banner of both output files.
my $timestamp = gmtime();

open(DATA_TABLES, '>', 'nsUnicodePropertyData.cpp') or die "unable to open nsUnicodePropertyData.cpp for output";

my $licenseBlock = q[
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
* Derived from the Unicode Character Database by genUnicodePropertyData.pl
*
* For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
*/
];

my $versionInfo = join("\n", @versionInfo);

# Both generated files begin with the same banner: the license block followed
# by a comment recording the timestamp and the collected UCD version lines.
my $generatedBanner = <<"__END";
$licenseBlock
/*
* Created on $timestamp from UCD data files with version info:
*

$versionInfo

*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
*/

__END

print DATA_TABLES $generatedBanner,
    "#include <stdint.h>\n",
    "#include \"harfbuzz/hb.h\"\n",
    "\n";

open(HEADER, '>', 'nsUnicodeScriptCodes.h') or die "unable to open nsUnicodeScriptCodes.h for output";

# The include guard opened here is closed by the "#endif" written at the very
# end of the header.
print HEADER $generatedBanner,
    "#ifndef NS_UNICODE_SCRIPT_CODES\n",
    "#define NS_UNICODE_SCRIPT_CODES\n",
    "\n";
652
# sScriptCodeToTag[]: one HB_TAG(...) per script code, in code order.
print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
if (@scriptCodeToTag) {
    my @tagLines = map { sprintf(" HB_TAG(%s)", $_) } @scriptCodeToTag;
    print DATA_TABLES join(",\n", @tagLines), "\n";
}
print DATA_TABLES "};\n\n";

# Running total of the table sizes; each genTables call adds to it.
our $totalData = 0;

# sMirrorOffsets[]: the distinct mirroring offsets, in index order.
print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
if (@offsets) {
    print DATA_TABLES join(",\n", map { " $_" } @offsets), "\n";
}
print DATA_TABLES "};\n\n";

print HEADER "#pragma pack(1)\n\n";
670
# Return the nsCharProps1 initializer text for one code point:
# "{mirror-offset-index,hangul-type,combining-class}, ".
# Parameter: the Unicode scalar value.
sub sprintCharProps1
{
    my ($codepoint) = @_;
    my @members = ($mirror[$codepoint], $hangul[$codepoint], $combining[$codepoint]);
    return sprintf("{%d,%d,%d}, ", @members);
}
genTables(
    "CharProp1",
    join("\n",
         "struct nsCharProps1 {",
         " unsigned char mMirrorOffsetIndex:5;",
         " unsigned char mHangulType:3;",
         " unsigned char mCombiningClass:8;",
         "};"),
    "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
678
# Return the nsCharProps2 initializer text for one code point:
# "{script,eaw,category,bidi-category,xidmod,numeric-value},".
# Only these six members are written; the struct's remaining mHanVariant
# member gets no explicit initializer.
# Parameter: the Unicode scalar value.
sub sprintCharProps2
{
    my ($codepoint) = @_;
    my @members = (
        $script[$codepoint],
        $eaw[$codepoint],
        $category[$codepoint],
        $bidicategory[$codepoint],
        $xidmod[$codepoint],
        $numericvalue[$codepoint],
    );
    return sprintf("{%d,%d,%d,%d,%d,%d},", @members);
}
genTables(
    "CharProp2",
    join("\n",
         "struct nsCharProps2 {",
         " unsigned char mScriptCode:8;",
         " unsigned char mEAW:3;",
         " unsigned char mCategory:5;",
         " unsigned char mBidiCategory:5;",
         " unsigned char mXidmod:4;",
         " signed char mNumericValue:5;",
         " unsigned char mHanVariant:2;",
         "};"),
    "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);

print HEADER "#pragma pack()\n\n";
690
# Return one byte of the HanVariant table as "0xNN,". The byte packs the
# 2-bit $hanVariant values of four consecutive code points, the first one in
# the lowest two bits.
# Parameter: the first of the four Unicode scalar values.
sub sprintHanVariants
{
    my ($firstUsv) = @_;
    my $packed = 0;
    for my $slot (0 .. 3) {
        $packed |= $hanVariant[$firstUsv + $slot] << (2 * $slot);
    }
    return sprintf("0x%02x,", $packed);
}
genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
703
# Return the FullWidth table entry for one code point as "0xNNNN,".
# Parameter: the Unicode scalar value.
sub sprintFullWidth
{
    my ($codepoint) = @_;
    my $wideForm = $fullWidth[$codepoint];
    return sprintf("0x%04x,", $wideForm);
}
genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
710
# Return the CaseMap table entry for one code point as "0xNNNNNNNN,".
# Parameter: the Unicode scalar value.
sub sprintCasemap
{
    my ($codepoint) = @_;
    my $mapping = $casemap[$codepoint];
    return sprintf("0x%08x,", $mapping);
}
genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);
717
print STDERR "Total data = $totalData\n";

# Emit the case-mapping flag constants so the C++ side can decode the
# CaseMap table entries.
for my $flag ([kTitleToUpper => $kTitleToUpper],
              [kUpperToLower => $kUpperToLower],
              [kLowerToTitle => $kLowerToTitle],
              [kLowerToUpper => $kLowerToUpper]) {
    my ($flagName, $flagValue) = @$flag;
    printf DATA_TABLES "const uint32_t %s = 0x%08x;\n", $flagName, $flagValue;
}
# The mask is the last constant and is followed by a blank line.
printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
725
# Generate one multi-level lookup table and write it to DATA_TABLES.
#
# The code space of planes 0..$maxPlane is cut into pages of 2**$charBits
# code points, with 2**$indexBits pages per plane. Identical pages are stored
# once, and so are identical per-plane page maps. The output consists of:
#   k<prefix>MaxPlane / IndexBits / CharBits   #defines
#   s<prefix>Planes[]   page-map index for each plane above 0 (only if $maxPlane > 0)
#   s<prefix>Pages[]    for each distinct page map, the value-page index of every page
#   s<prefix>Values[][] the distinct value pages
#
# Parameters:
#   $prefix        - name fragment used in the generated identifiers
#   $typedef       - C declaration to print to HEADER ('' for none)
#   $type          - C element type of the Values table
#   $indexBits     - log2 of the number of pages per plane
#   $charBits      - log2 of the number of code points per page
#   $func          - code ref: given a code point, returns the text of one
#                    table entry; the final character of each page's
#                    concatenated text is chopped off
#   $maxPlane      - highest plane covered
#   $bytesPerEntry - size of one entry, used only for the size report
#   $charsPerEntry - number of consecutive code points covered by one entry
#
# Side effects: prints to DATA_TABLES and HEADER, reports the table size on
# STDERR and adds it to $totalData.
sub genTables
{
    my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;

    print DATA_TABLES "#define k${prefix}MaxPlane $maxPlane\n";
    print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
    print DATA_TABLES "#define k${prefix}CharBits $charBits\n";

    my $indexLen = 1 << $indexBits;
    my $charsPerPage = 1 << $charBits;
    my %charIndex = ();       # page text => index into @char
    my %pageMapIndex = ();    # packed page map => index into @pageMap
    my @pageMap = ();         # distinct packed page maps
    my @char = ();            # distinct page texts

    # One byte per plane above 0, holding that plane's page-map index.
    my $planeMap = "\x00" x $maxPlane;
    foreach my $plane (0 .. $maxPlane) {
        # Two bytes (one packed 'S') per page. The parentheses matter: "x"
        # and "*" have equal precedence and associate left, so without them
        # the expression is (string x count) * 2, i.e. the number 0.
        my $pageMap = "\x00" x ($indexLen * 2);
        foreach my $page (0 .. $indexLen - 1) {
            my $charValues = "";
            for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
                my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
                $charValues .= $func->($usv);
            }
            # drop the trailing separator character of the last entry
            chop $charValues;

            unless (exists $charIndex{$charValues}) {
                $charIndex{$charValues} = scalar keys %charIndex;
                $char[$charIndex{$charValues}] = $charValues;
            }
            substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
        }

        unless (exists $pageMapIndex{$pageMap}) {
            $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
            $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
        }
        # Plane 0 is not entered in the plane map; it was processed first and
        # therefore always owns page map 0.
        if ($plane > 0) {
            substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
        }
    }

    if ($maxPlane) {
        print DATA_TABLES "static const uint8_t s${prefix}Planes[$maxPlane] = {";
        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
        print DATA_TABLES "};\n\n";
    }

    my $chCount = scalar @char;
    # page indices fit in a byte unless there are more than 255 distinct pages
    my $pmBits = $chCount > 255 ? 16 : 8;
    my $pmCount = scalar @pageMap;
    if ($maxPlane == 0) {
        die "there should only be one pageMap entry!" if $pmCount > 1;
        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
    } else {
        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
    }
    for (my $i = 0; $i < scalar @pageMap; ++$i) {
        print DATA_TABLES $maxPlane > 0 ? " {" : " ";
        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
        print DATA_TABLES $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
    }
    print DATA_TABLES "};\n\n";

    print HEADER "$typedef\n\n" if $typedef ne '';

    my $pageLen = $charsPerPage / $charsPerEntry;
    print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
    for (my $i = 0; $i < scalar @char; ++$i) {
        print DATA_TABLES " {";
        print DATA_TABLES $char[$i];
        print DATA_TABLES $i < $#char ? "},\n" : "}\n";
    }
    print DATA_TABLES "};\n\n";

    # page maps + value pages + plane map, in bytes
    my $dataSize = $pmCount * $indexLen * $pmBits/8 +
                   $chCount * $pageLen * $bytesPerEntry +
                   $maxPlane;
    $totalData += $dataSize;

    print STDERR "Data for $prefix = $dataSize\n";
}
808
# Closing banner of nsUnicodePropertyData.cpp.
print DATA_TABLES <<"__END";
/*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
*/
__END

# Buffered write errors (for example a full disk) only surface at close, so
# check it rather than silently leaving a truncated file behind.
close(DATA_TABLES) or die "error closing nsUnicodePropertyData.cpp: $!\n";
816
# Script-code enumeration: one MOZ_SCRIPT_<NAME> per index of
# @scriptCodeToName, then the count and the INVALID sentinel.
print HEADER "enum {\n";
for my $code (0 .. $#scriptCodeToName) {
    print HEADER " MOZ_SCRIPT_", $scriptCodeToName[$code], " = ", $code, ",\n";
}
print HEADER "\n MOZ_NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
print HEADER "\n MOZ_SCRIPT_INVALID = -1\n";
print HEADER "};\n\n";

# Close the include guard and add the closing banner.
print HEADER <<"__END";
#endif
/*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
*/
__END

# Buffered write errors only surface at close, so check it.
close(HEADER) or die "error closing nsUnicodeScriptCodes.h: $!\n";
833

mercurial