intl/unicharutil/tools/genUnicodePropertyData.pl

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 #!/usr/bin/env perl
michael@0 2
michael@0 3 # This Source Code Form is subject to the terms of the Mozilla Public
michael@0 4 # License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0 6
michael@0 7 # This tool is used to prepare lookup tables of Unicode character properties
michael@0 8 # needed by gfx code to support text shaping operations. The properties are
michael@0 9 # read from the Unicode Character Database and compiled into multi-level arrays
michael@0 10 # for efficient lookup.
michael@0 11 #
michael@0 12 # To regenerate the tables in nsUnicodePropertyData.cpp:
michael@0 13 #
michael@0 14 # (1) Download the current Unicode data files from
michael@0 15 #
michael@0 16 # http://www.unicode.org/Public/UNIDATA/
michael@0 17 #
michael@0 18 # NB: not all the files are actually needed; currently, we require
michael@0 19 # - UnicodeData.txt
michael@0 20 # - Scripts.txt
michael@0 21 # - EastAsianWidth.txt
michael@0 22 # - BidiMirroring.txt
michael@0 23 # - HangulSyllableType.txt
michael@0 24 # - ReadMe.txt (to record version/date of the UCD)
michael@0 25 # - Unihan_Variants.txt (from Unihan.zip)
michael@0 26 # though this may change if we find a need for additional properties.
michael@0 27 #
michael@0 28 # The Unicode data files listed above should be together in one directory.
michael@0 29 # We also require the file
michael@0 30 # http://www.unicode.org/Public/security/latest/xidmodifications.txt
michael@0 31 # This file should be in a sub-directory "security" immediately below the
michael@0 32 # directory containing the other Unicode data files.
michael@0 33 #
michael@0 34 # (2) Run this tool using a command line of the form
michael@0 35 #
michael@0 36 # perl genUnicodePropertyData.pl \
michael@0 37 # /path/to/harfbuzz/src \
michael@0 38 # /path/to/UCD-directory
michael@0 39 #
michael@0 40 # This will generate (or overwrite!) the files
michael@0 41 #
michael@0 42 # nsUnicodePropertyData.cpp
michael@0 43 # nsUnicodeScriptCodes.h
michael@0 44 #
michael@0 45 # in the current directory.
michael@0 46
michael@0 47 use strict;
michael@0 48 use List::Util qw(first);
michael@0 49
# Exactly two arguments are required: the HarfBuzz source directory and the
# directory holding the Unicode Character Database files.
if (@ARGV != 2) {
    # A single-quoted heredoc keeps the trailing backslashes of the sample
    # command line literal; an interpolating heredoc would drop them.
    # Usage goes to STDERR and the exit status is non-zero because a wrong
    # invocation is an error, not a successful run.
    print STDERR <<'__EOT';
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \
# /path/to/harfbuzz/src \
# /path/to/UCD-directory
#
# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/
#
# This will generate (or overwrite!) the files
#
# nsUnicodePropertyData.cpp
# nsUnicodeScriptCodes.h
#
# in the current directory.
__EOT
    exit 1;
}
michael@0 72
michael@0 73 # load HB_Script and HB_Category constants
michael@0 74
michael@0 75 # NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
michael@0 76 # script codes as used by Glib/Pango/etc.
michael@0 77 # We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
michael@0 78 # compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.
michael@0 79
michael@0 80 # CHECK that this matches Pango source (as found for example at
michael@0 81 # http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
michael@0 82 # for as many codes as that defines (currently up through Unicode 5.1)
michael@0 83 # and the GLib enumeration
michael@0 84 # http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
michael@0 85 # (currently defined up through Unicode 6.0).
michael@0 86 # Constants beyond these may be regarded as unstable for now, but we don't actually
michael@0 87 # depend on the specific values.
# Script names listed in MOZ_SCRIPT_* code order: the position of a name in
# this list IS its numeric script code, so new names must only be appended.
my @scriptNamesInCodeOrder = (
    qw(COMMON INHERITED ARABIC ARMENIAN BENGALI BOPOMOFO CHEROKEE COPTIC
       CYRILLIC DESERET DEVANAGARI ETHIOPIC GEORGIAN GOTHIC GREEK GUJARATI
       GURMUKHI HAN HANGUL HEBREW HIRAGANA KANNADA KATAKANA KHMER LAO LATIN
       MALAYALAM MONGOLIAN MYANMAR OGHAM OLD_ITALIC ORIYA RUNIC SINHALA
       SYRIAC TAMIL TELUGU THAANA THAI TIBETAN CANADIAN_ABORIGINAL YI
       TAGALOG HANUNOO BUHID TAGBANWA),
    # unicode 4.0 additions
    qw(BRAILLE CYPRIOT LIMBU OSMANYA SHAVIAN LINEAR_B TAI_LE UGARITIC),
    # unicode 4.1 additions
    qw(NEW_TAI_LUE BUGINESE GLAGOLITIC TIFINAGH SYLOTI_NAGRI OLD_PERSIAN
       KHAROSHTHI),
    # unicode 5.0 additions
    qw(UNKNOWN BALINESE CUNEIFORM PHOENICIAN PHAGS_PA NKO),
    # unicode 5.1 additions
    qw(KAYAH_LI LEPCHA REJANG SUNDANESE SAURASHTRA CHAM OL_CHIKI VAI CARIAN
       LYCIAN LYDIAN),
    # unicode 5.2 additions
    qw(AVESTAN BAMUM EGYPTIAN_HIEROGLYPHS IMPERIAL_ARAMAIC
       INSCRIPTIONAL_PAHLAVI INSCRIPTIONAL_PARTHIAN JAVANESE KAITHI LISU
       MEETEI_MAYEK OLD_SOUTH_ARABIAN OLD_TURKIC SAMARITAN TAI_THAM TAI_VIET),
    # unicode 6.0 additions
    qw(BATAK BRAHMI MANDAIC),
    # unicode 6.1 additions
    qw(CHAKMA MEROITIC_CURSIVE MEROITIC_HIEROGLYPHS MIAO SHARADA
       SORA_SOMPENG TAKRI),
);

# Map script name => numeric script code; INVALID is the only negative code.
my %scriptCode = (INVALID => -1);
@scriptCode{@scriptNamesInCodeOrder} = (0 .. $#scriptNamesInCodeOrder);
michael@0 201
# State filled in by readHarfBuzzHeader(); both counters stay at -1 until
# the first matching line is seen, which is what the sanity checks below
# rely on.
my $sc = -1;                # script code of the most recent HB_SCRIPT_* match
my $cc = -1;                # last general-category code handed out
my %catCode;                # HB general category name => sequential code
my @scriptCodeToTag;        # script code => HB_TAG argument text
my @scriptCodeToName;       # script code => script name

# Scan one HarfBuzz header (a file name relative to $ARGV[0]) and record:
#  - for every "HB_SCRIPT_<NAME> = HB_TAG('a','b','c','d')," line whose
#    NAME is known to %scriptCode, the tag text and name under that code;
#  - for every line mentioning HB_UNICODE_GENERAL_CATEGORY_<NAME>, the next
#    sequential category code.
# Dies if the header cannot be opened; warns about unknown script names.
sub readHarfBuzzHeader
{
    my $file = shift;
    my $path = "$ARGV[0]/$file";
    # Three-argument open with a lexical handle: the path is never parsed
    # for a mode, and the handle cannot clash with other readers.
    open(my $header, '<', $path) or die "can't open harfbuzz header $path\n";
    while (my $line = <$header>) {
        if ($line =~ m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
            my ($name, $tag) = ($1, $2);
            unless (exists $scriptCode{$name}) {
                warn "unknown script name $name found in $file\n";
                next;
            }
            $sc = $scriptCode{$name};
            $scriptCodeToTag[$sc] = $tag;
            $scriptCodeToName[$sc] = $name;
        }
        if ($line =~ m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
            $cc++;
            $catCode{$1} = $cc;
        }
    }
    close $header;
}

readHarfBuzzHeader("hb-common.h");
readHarfBuzzHeader("hb-unicode.h");

die "didn't find HarfBuzz script codes\n" if $sc == -1;
die "didn't find HarfBuzz category codes\n" if $cc == -1;
michael@0 235
# Identifier-modification type names; a name's position in this list is the
# numeric code stored in the generated tables.
my @xidmodNamesInCodeOrder = qw(
    inclusion
    recommended
    default-ignorable
    historic
    limited-use
    not-NFKC
    not-xid
    obsolete
    technical
    not-chars
);
my %xidmodCode;
@xidmodCode{@xidmodNamesInCodeOrder} = (0 .. $#xidmodNamesInCodeOrder);
michael@0 248
# Bidi category abbreviations in code order (position == code):
#   L   Left-to-Right              R   Right-to-Left
#   EN  European Number            ES  European Number Separator
#   ET  European Number Terminator AN  Arabic Number
#   CS  Common Number Separator    B   Paragraph Separator
#   S   Segment Separator          WS  Whitespace
#   ON  Other Neutrals             LRE Left-to-Right Embedding
#   LRO Left-to-Right Override     AL  Right-to-Left Arabic
#   RLE Right-to-Left Embedding    RLO Right-to-Left Override
#   PDF Pop Directional Format     NSM Non-Spacing Mark
#   BN  Boundary Neutral
my @bidiNamesInCodeOrder = qw(
    L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN
);
# Codes are kept as decimal strings, as in the literal table this replaces.
my %bidicategoryCode = map { ($bidiNamesInCodeOrder[$_] => "$_") } 0 .. $#bidiNamesInCodeOrder;
michael@0 270
# Per-code-point property arrays, indexed by Unicode scalar value.  Every
# array that has a default is pre-filled for the whole code space; @eaw,
# @mirror and @hangul start empty and are only filled from their data files.
my $codeSpaceSize = 0x110000;

my @script       = ($scriptCode{"UNKNOWN"}) x $codeSpaceSize;
my @category     = ($catCode{"UNASSIGNED"}) x $codeSpaceSize;
my @combining    = (0) x $codeSpaceSize;
my @eaw;
my @mirror;
my @hangul;
my @casemap      = (0) x $codeSpaceSize;
my @xidmod       = ($xidmodCode{"not-chars"}) x $codeSpaceSize;
my @numericvalue = (-1) x $codeSpaceSize;
my @hanVariant   = (0) x $codeSpaceSize;
my @bidicategory = ($bidicategoryCode{"L"}) x $codeSpaceSize;
my @fullWidth    = (0) x $codeSpaceSize;

# blocks where the default for bidi category is not L
my @bidiDefaultOverrides = (
    [ "AL", 0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF,
            0xFE70..0xFEFF, 0x1EE00..0x0001EEFF ],
    [ "R",  0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F,
            0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF,
            0x0001EF00..0x0001EFFF ],
    [ "ET", 0x20A0..0x20CF ],
);
foreach my $override (@bidiDefaultOverrides) {
    my ($categoryName, @codepoints) = @$override;
    @bidicategory[@codepoints] = ($bidicategoryCode{$categoryName}) x @codepoints;
}
michael@0 306
# UCD two-letter general category => name suffix of the matching
# HB_UNICODE_GENERAL_CATEGORY_* constant (looked up in %catCode).
# Laid out as alternating key/value words.
my %ucd2hb = qw(
    Cc CONTROL
    Cf FORMAT
    Cn UNASSIGNED
    Co PRIVATE_USE
    Cs SURROGATE
    Ll LOWERCASE_LETTER
    Lm MODIFIER_LETTER
    Lo OTHER_LETTER
    Lt TITLECASE_LETTER
    Lu UPPERCASE_LETTER
    Mc SPACING_MARK
    Me ENCLOSING_MARK
    Mn NON_SPACING_MARK
    Nd DECIMAL_NUMBER
    Nl LETTER_NUMBER
    No OTHER_NUMBER
    Pc CONNECT_PUNCTUATION
    Pd DASH_PUNCTUATION
    Pe CLOSE_PUNCTUATION
    Pf FINAL_PUNCTUATION
    Pi INITIAL_PUNCTUATION
    Po OTHER_PUNCTUATION
    Ps OPEN_PUNCTUATION
    Sc CURRENCY_SYMBOL
    Sk MODIFIER_SYMBOL
    Sm MATH_SYMBOL
    So OTHER_SYMBOL
    Zl LINE_SEPARATOR
    Zp PARAGRAPH_SEPARATOR
    Zs SPACE_SEPARATOR
);
michael@0 339
# read ReadMe.txt
# Its lines (newlines stripped) start the version banner that is later
# written into both generated files.
my @versionInfo;
{
    # Three-argument open on a lexical handle instead of the global FH.
    open(my $readme, '<', "$ARGV[1]/ReadMe.txt") or die "can't open Unicode ReadMe.txt file\n";
    while (my $line = <$readme>) {
        chomp $line;
        push @versionInfo, $line;
    }
    close $readme;
}
michael@0 348
# Flag bits stored in the top of each @casemap entry to say which kind of
# mapping the entry holds; the low bits (kCaseMapCharMask) hold the XOR of
# the character with its mapped counterpart.
my $kTitleToUpper = 0x80000000;
my $kUpperToLower = 0x40000000;
my $kLowerToTitle = 0x20000000;
my $kLowerToUpper = 0x10000000;
my $kCaseMapCharMask = 0x001fffff;

# read UnicodeData.txt
# Semicolon-separated fields used here: 0 = code point, 1 = name,
# 2 = general category, 3 = combining class, 4 = bidi category,
# 5 = decomposition, 7 = value stored as the numeric value,
# 12/13/14 = upper/lower/title case mappings.
{
    open(my $ucd, '<', "$ARGV[1]/UnicodeData.txt") or die "can't open UCD file UnicodeData.txt\n";
    while (my $line = <$ucd>) {
        chomp $line;
        my @fields = split /;/, $line;
        if ($fields[1] =~ /First/) {
            # A "..., First>" line must be followed directly by the matching
            # "..., Last>" line; the properties on the Last line are applied
            # to every code point of the range.
            my $first = hex $fields[0];
            my $lastLine = <$ucd>;
            die "didn't find Last code for range!\n" unless defined $lastLine;
            chomp $lastLine;
            @fields = split /;/, $lastLine;
            if ($fields[1] =~ /Last/) {
                my $last = hex $fields[0];
                do {
                    $category[$first] = $catCode{$ucd2hb{$fields[2]}};
                    $combining[$first] = $fields[3];
                    $bidicategory[$first] = $bidicategoryCode{$fields[4]};
                    unless (length($fields[7]) == 0) {
                        $numericvalue[$first] = $fields[7];
                    }
                    if ($fields[1] =~ /CJK/) {
                        # scalar element assignment (was a one-element slice)
                        $hanVariant[$first] = 3;
                    }
                    $first++;
                } while ($first <= $last);
            } else {
                die "didn't find Last code for range!\n";
            }
        } else {
            my $usv = hex $fields[0];
            $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
            $combining[$usv] = $fields[3];
            my $upper = hex $fields[12];
            my $lower = hex $fields[13];
            my $title = hex $fields[14];
            # we only store one mapping for each character,
            # but also record what kind of mapping it is
            if ($upper && $lower) {
                $casemap[$usv] |= $kTitleToUpper;
                $casemap[$usv] |= ($usv ^ $upper);
            }
            elsif ($lower) {
                $casemap[$usv] |= $kUpperToLower;
                $casemap[$usv] |= ($usv ^ $lower);
            }
            elsif ($title && ($title != $upper)) {
                $casemap[$usv] |= $kLowerToTitle;
                $casemap[$usv] |= ($usv ^ $title);
            }
            elsif ($upper) {
                $casemap[$usv] |= $kLowerToUpper;
                $casemap[$usv] |= ($usv ^ $upper);
            }
            $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
            unless (length($fields[7]) == 0) {
                $numericvalue[$usv] = $fields[7];
            }
            if ($fields[1] =~ /CJK/) {
                # scalar element assignment (was a one-element slice)
                $hanVariant[$usv] = 3;
            }
            if ($fields[5] =~ /^<narrow>/) {
                # "<narrow> XXXX": this character is the narrow form of XXXX
                my $wideChar = hex(substr($fields[5], 9));
                die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
                $fullWidth[$usv] = $wideChar;
            }
            elsif ($fields[5] =~ /^<wide>/) {
                # "<wide> XXXX": this character is the wide form of XXXX
                my $narrowChar = hex(substr($fields[5], 7));
                die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
                $fullWidth[$narrowChar] = $usv;
            }
        }
    }
    close $ucd;
}
michael@0 426
# read Scripts.txt
# The header lines up to and including the "Date:" line are appended to the
# version banner; every following "XXXX[..YYYY] ; Name" line assigns the
# script code of Name (upper-cased) to that code point or range.
{
    open(my $scripts, '<', "$ARGV[1]/Scripts.txt") or die "can't open UCD file Scripts.txt\n";
    push @versionInfo, "";
    while (my $line = <$scripts>) {
        chomp $line;
        push @versionInfo, $line;
        last if $line =~ /Date:/;
    }
    while (my $line = <$scripts>) {
        next unless $line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/;
        my ($startHex, $endHex, $name) = ($1, $2, uc($3));
        warn "unknown script $name" unless exists $scriptCode{$name};
        my $code = $scriptCode{$name};
        my $start = hex($startHex);
        my $end = defined($endHex) ? hex($endHex) : $start;
        for (my $i = $start; $i <= $end; ++$i) {
            $script[$i] = $code;
        }
    }
    close $scripts;
}
michael@0 448
# read EastAsianWidth.txt
my %eawCode = (
    'A' => 0, # ; Ambiguous
    'F' => 1, # ; Fullwidth
    'H' => 2, # ; Halfwidth
    'N' => 3, # ; Neutral
    'NA'=> 4, # ; Narrow
    'W' => 5  # ; Wide
);
# Header lines through "Date:" go into the version banner; each data line
# "XXXX[..YYYY];CODE" (after stripping any # comment) sets @eaw for that
# code point or range.
{
    open(my $eawFile, '<', "$ARGV[1]/EastAsianWidth.txt") or die "can't open UCD file EastAsianWidth.txt\n";
    push @versionInfo, "";
    while (my $line = <$eawFile>) {
        chomp $line;
        push @versionInfo, $line;
        last if $line =~ /Date:/;
    }
    while (my $line = <$eawFile>) {
        $line =~ s/#.*//;
        next unless $line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/;
        my ($startHex, $endHex, $widthName) = ($1, $2, uc($3));
        warn "unknown EAW code $widthName" unless exists $eawCode{$widthName};
        my $code = $eawCode{$widthName};
        my $start = hex($startHex);
        my $end = defined($endHex) ? hex($endHex) : $start;
        for (my $i = $start; $i <= $end; ++$i) {
            $eaw[$i] = $code;
        }
    }
    close $eawFile;
}
michael@0 479
# read BidiMirroring.txt
# @offsets is the table of distinct (mirror - character) differences;
# $mirror[ch] stores an index into it.  Index 0 holds offset 0.
my @offsets = ();
push @offsets, 0;

{
    open(my $mirroring, '<', "$ARGV[1]/BidiMirroring.txt") or die "can't open UCD file BidiMirroring.txt\n";
    push @versionInfo, "";
    while (my $line = <$mirroring>) {
        chomp $line;
        push @versionInfo, $line;
        last if $line =~ /Date:/;
    }
    while (my $line = <$mirroring>) {
        $line =~ s/#.*//;
        next unless $line =~ m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/;
        my $char = hex($1);
        my $mirrorOffset = hex($2) - $char;
        # Offsets are numbers, so compare numerically.
        my $offsetIndex = first { $offsets[$_] == $mirrorOffset } 0..$#offsets;
        # first() returns undef when nothing matched.  Test with defined():
        # "== undef" is a numeric comparison with 0 and cannot tell
        # "not found" apart from a legitimate index 0.
        if (!defined $offsetIndex) {
            die "too many offset codes\n" if scalar @offsets == 31;
            push @offsets, $mirrorOffset;
            $offsetIndex = $#offsets;
        }
        $mirror[$char] = $offsetIndex;
    }
    close $mirroring;
}
michael@0 505
# read HangulSyllableType.txt
# Syllable types are bit flags: L, V and T are single bits, and LV / LVT
# are the ORs of their components.
my %hangulType = (
    'L' => 0x01,
    'V' => 0x02,
    'T' => 0x04,
    'LV' => 0x03,
    'LVT' => 0x07
);
# Header lines through "Date:" go into the version banner; each data line
# "XXXX[..YYYY];TYPE" sets @hangul for that code point or range.
{
    open(my $hangulFile, '<', "$ARGV[1]/HangulSyllableType.txt") or die "can't open UCD file HangulSyllableType.txt\n";
    push @versionInfo, "";
    while (my $line = <$hangulFile>) {
        chomp $line;
        push @versionInfo, $line;
        last if $line =~ /Date:/;
    }
    while (my $line = <$hangulFile>) {
        $line =~ s/#.*//;
        next unless $line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/;
        my ($startHex, $endHex, $typeName) = ($1, $2, uc($3));
        warn "unknown Hangul syllable type" unless exists $hangulType{$typeName};
        my $typeBits = $hangulType{$typeName};
        my $start = hex($startHex);
        my $end = defined($endHex) ? hex($endHex) : $start;
        for (my $i = $start; $i <= $end; ++$i) {
            $hangul[$i] = $typeBits;
        }
    }
    close $hangulFile;
}
michael@0 535
# read xidmodifications.txt
# Header lines through "Generated:" go into the version banner, except a
# line containing the UTF-8 byte-order mark.  Data lines have the form
# "XXXX[..YYYY] ; <ignored field> ; <type>"; <type> is looked up in
# %xidmodCode.
{
    open(my $xidFile, '<', "$ARGV[1]/security/xidmodifications.txt") or die "can't open UCD file xidmodifications.txt\n";
    push @versionInfo, "";
    while (my $line = <$xidFile>) {
        chomp $line;
        unless ($line =~ /\xef\xbb\xbf/) {
            push @versionInfo, $line;
        }
        last if $line =~ /Generated:/;
    }
    while (my $line = <$xidFile>) {
        next unless $line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/;
        my ($startHex, $endHex, $typeName) = ($1, $2, $3);
        warn "unknown Identifier Modification $typeName" unless exists $xidmodCode{$typeName};
        my $code = $xidmodCode{$typeName};
        my $start = hex($startHex);
        my $end = defined($endHex) ? hex($endHex) : $start;
        for (my $i = $start; $i <= $end; ++$i) {
            $xidmod[$i] = $code;
        }
    }
    close $xidFile;
}
# special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
$xidmod[0x30FB] = 1;
michael@0 561
# read Unihan_Variants.txt
# Lines for one code point are consecutive.  For each code point we note
# whether it has a kTraditionalVariant and/or a kSimplifiedVariant entry,
# then record $hanVariant = 1 (Traditional only) or 2 (Simplified only);
# otherwise the existing value is left alone.
{
    open(my $unihan, '<', "$ARGV[1]/Unihan_Variants.txt") or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
    push @versionInfo, "";
    while (my $line = <$unihan>) {
        chomp $line;
        push @versionInfo, $line;
        last if $line =~ /Date:/;
    }

    my $savedusv = 0;   # code point being accumulated; 0 means "none yet"
    my $hasTC = 0;
    my $hasSC = 0;

    # Commit the flags gathered for $savedusv.
    my $recordSaved = sub {
        return if $savedusv == 0;
        if ($hasTC && !$hasSC) {
            $hanVariant[$savedusv] = 1;
        } elsif (!$hasTC && $hasSC) {
            $hanVariant[$savedusv] = 2;
        }
    };

    while (my $line = <$unihan>) {
        chomp $line;
        next unless $line =~ m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/;
        my ($usv, $variantKind) = (hex($1), $2);
        if ($usv != $savedusv) {
            $recordSaved->();
            $savedusv = $usv;
            $hasTC = 0;
            $hasSC = 0;
        }
        if ($variantKind eq "Traditional") {
            $hasTC = 1;
        }
        if ($variantKind eq "Simplified") {
            $hasSC = 1;
        }
    }
    # The final code point in the file has no following line to trigger its
    # commit, so flush it here; otherwise its variant type is lost.
    $recordSaved->();
    close $unihan;
}
michael@0 597
# Start both output files.  Each begins with the license block followed by
# a banner carrying the generation time and the collected UCD version text.
my $timestamp = gmtime();

open DATA_TABLES, "> nsUnicodePropertyData.cpp" or die "unable to open nsUnicodePropertyData.cpp for output";

my $licenseBlock = q[
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
* Derived from the Unicode Character Database by genUnicodePropertyData.pl
*
* For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
*/
];

my $versionInfo = join("\n", @versionInfo);

# Text shared by the top of both generated files.
my $generatedBanner = <<__END;
$licenseBlock
/*
* Created on $timestamp from UCD data files with version info:
*

$versionInfo

*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
*/

__END

print DATA_TABLES $generatedBanner,
                  "#include <stdint.h>\n",
                  "#include \"harfbuzz/hb.h\"\n",
                  "\n";

open HEADER, "> nsUnicodeScriptCodes.h" or die "unable to open nsUnicodeScriptCodes.h for output";

print HEADER $generatedBanner,
             "#ifndef NS_UNICODE_SCRIPT_CODES\n",
             "#define NS_UNICODE_SCRIPT_CODES\n",
             "\n";

# Script code => HarfBuzz tag table; every entry but the last gets a comma.
print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
foreach my $code (0 .. $#scriptCodeToTag) {
    my $separator = $code < $#scriptCodeToTag ? ",\n" : "\n";
    printf DATA_TABLES " HB_TAG(%s)", $scriptCodeToTag[$code];
    print DATA_TABLES $separator;
}
print DATA_TABLES "};\n\n";

# Running total of table bytes, accumulated by genTables().
our $totalData = 0;

print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
foreach my $index (0 .. $#offsets) {
    my $separator = $index < $#offsets ? ",\n" : "\n";
    printf DATA_TABLES " $offsets[$index]";
    print DATA_TABLES $separator;
}
print DATA_TABLES "};\n\n";

print HEADER "#pragma pack(1)\n\n";
michael@0 670
# Format the nsCharProps1 initializer for one code point: mirror-offset
# index, Hangul syllable type and combining class, in struct field order.
sub sprintCharProps1
{
    my ($codepoint) = @_;
    my @memberValues = (
        $mirror[$codepoint],
        $hangul[$codepoint],
        $combining[$codepoint],
    );
    return sprintf("{%d,%d,%d}, ", @memberValues);
}
genTables("CharProp1", "struct nsCharProps1 {\n unsigned char mMirrorOffsetIndex:5;\n unsigned char mHangulType:3;\n unsigned char mCombiningClass:8;\n};",
          "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
michael@0 678
# Format the nsCharProps2 initializer for one code point: script, East
# Asian width, general category, bidi category, xidmod and numeric value.
# No value is emitted for the struct's trailing mHanVariant member.
sub sprintCharProps2
{
    my ($codepoint) = @_;
    my @memberValues = (
        $script[$codepoint],
        $eaw[$codepoint],
        $category[$codepoint],
        $bidicategory[$codepoint],
        $xidmod[$codepoint],
        $numericvalue[$codepoint],
    );
    return sprintf("{%d,%d,%d,%d,%d,%d},", @memberValues);
}
genTables("CharProp2", "struct nsCharProps2 {\n unsigned char mScriptCode:8;\n unsigned char mEAW:3;\n unsigned char mCategory:5;\n unsigned char mBidiCategory:5;\n unsigned char mXidmod:4;\n signed char mNumericValue:5;\n unsigned char mHanVariant:2;\n};",
          "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);

print HEADER "#pragma pack()\n\n";
michael@0 690
# Pack the 2-bit Han variant codes of four consecutive code points,
# starting at the given one, into a single byte (lowest code point in the
# lowest two bits) and format it as a hex literal.
sub sprintHanVariants
{
    my ($firstCodepoint) = @_;
    my $packed = 0;
    foreach my $slot (0 .. 3) {
        $packed |= $hanVariant[$firstCodepoint + $slot] << (2 * $slot);
    }
    return sprintf("0x%02x,", $packed);
}
genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
michael@0 703
# Format the @fullWidth entry of one code point as a 16-bit hex literal.
sub sprintFullWidth
{
    my ($codepoint) = @_;
    my $wideForm = $fullWidth[$codepoint];
    return sprintf("0x%04x,", $wideForm);
}
genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
michael@0 710
# Format the @casemap entry (flag bits plus XOR delta) of one code point
# as a 32-bit hex literal.
sub sprintCasemap
{
    my ($codepoint) = @_;
    my $entry = $casemap[$codepoint];
    return sprintf("0x%08x,", $entry);
}
genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);

print STDERR "Total data = $totalData\n";

# Export the case-map flag bits and mask so the C++ side decodes entries
# with the same constants that were used to build them.
printf DATA_TABLES "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
printf DATA_TABLES "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
printf DATA_TABLES "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
printf DATA_TABLES "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
michael@0 725
# Emit one multi-level lookup table for a per-character property.
#
# Arguments:
#   $prefix         name stem used in the generated identifiers
#                   (k<prefix>MaxPlane, s<prefix>Planes, s<prefix>Pages,
#                   s<prefix>Values)
#   $typedef        C declaration written to HEADER (skipped when '')
#   $type           C element type of the values array
#   $indexBits      log2 of the number of pages per plane
#   $charBits       log2 of the number of characters per page
#   $func           code ref: given a code point, returns the initializer
#                   text for one entry, ending in a separator character
#   $maxPlane       highest plane covered (planes 0 .. $maxPlane)
#   $bytesPerEntry  entry size, used only for the size report
#   $charsPerEntry  number of consecutive characters covered by one entry
#
# Identical pages of values, and identical per-plane page maps, are stored
# once and shared.  Writes to DATA_TABLES and HEADER, adds the table size
# to $totalData and reports it on STDERR.
sub genTables
{
    my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;

    print DATA_TABLES "#define k${prefix}MaxPlane $maxPlane\n";
    print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
    print DATA_TABLES "#define k${prefix}CharBits $charBits\n";

    my $indexLen = 1 << $indexBits;
    my $charsPerPage = 1 << $charBits;
    my %charIndex = ();     # page text => index into @char
    my %pageMapIndex = ();  # packed page map => index into @pageMap
    my @pageMap = ();
    my @char = ();

    # One byte per plane above plane 0, holding that plane's page-map index.
    my $planeMap = "\x00" x $maxPlane;
    foreach my $plane (0 .. $maxPlane) {
        # Two bytes per page.  The parentheses matter: "x" and "*" share a
        # precedence level and associate left, so without them the string
        # would be repeated first and then multiplied, yielding numeric 0.
        my $pageMap = "\x00" x ($indexLen * 2);
        foreach my $page (0 .. $indexLen - 1) {
            my $charValues = "";
            for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
                my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
                $charValues .= $func->($usv);
            }
            # drop the separator character after the last entry
            chop $charValues;

            unless (exists $charIndex{$charValues}) {
                $charIndex{$charValues} = scalar keys %charIndex;
                $char[$charIndex{$charValues}] = $charValues;
            }
            substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
        }

        unless (exists $pageMapIndex{$pageMap}) {
            $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
            $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
        }
        if ($plane > 0) {
            substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
        }
    }

    if ($maxPlane) {
        print DATA_TABLES "static const uint8_t s${prefix}Planes[$maxPlane] = {";
        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
        print DATA_TABLES "};\n\n";
    }

    my $chCount = scalar @char;
    # page indices are emitted as uint8_t unless there are more than 255 pages
    my $pmBits = $chCount > 255 ? 16 : 8;
    my $pmCount = scalar @pageMap;
    if ($maxPlane == 0) {
        die "there should only be one pageMap entry!" if $pmCount > 1;
        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
    } else {
        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
    }
    for (my $i = 0; $i < scalar @pageMap; ++$i) {
        print DATA_TABLES $maxPlane > 0 ? " {" : " ";
        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
        print DATA_TABLES $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
    }
    print DATA_TABLES "};\n\n";

    print HEADER "$typedef\n\n" if $typedef ne '';

    my $pageLen = $charsPerPage / $charsPerEntry;
    print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
    for (my $i = 0; $i < scalar @char; ++$i) {
        print DATA_TABLES " {";
        print DATA_TABLES $char[$i];
        print DATA_TABLES $i < $#char ? "},\n" : "}\n";
    }
    print DATA_TABLES "};\n\n";

    # size report: page maps + value pages + plane map
    my $dataSize = $pmCount * $indexLen * $pmBits/8 +
                   $chCount * $pageLen * $bytesPerEntry +
                   $maxPlane;
    $totalData += $dataSize;

    print STDERR "Data for $prefix = $dataSize\n";
}
michael@0 808
# Finish both output files.  Each ends with the same "machine-generated"
# reminder comment.
my $generatedTrailer = <<__END;
/*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
*/
__END

print DATA_TABLES $generatedTrailer;

close DATA_TABLES;

# MOZ_SCRIPT_* enumeration: one enumerator per script code, then the count
# and the INVALID sentinel.
print HEADER "enum {\n";
foreach my $code (0 .. $#scriptCodeToName) {
    print HEADER " MOZ_SCRIPT_", $scriptCodeToName[$code], " = ", $code, ",\n";
}
my $scriptCodeCount = scalar @scriptCodeToName;
print HEADER "\n MOZ_NUM_SCRIPT_CODES = ", $scriptCodeCount, ",\n";
print HEADER "\n MOZ_SCRIPT_INVALID = -1\n";
print HEADER "};\n\n";

print HEADER "#endif\n", $generatedTrailer;

close HEADER;
michael@0 833

mercurial