intl/unicharutil/tools/genUnicodePropertyData.pl

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/unicharutil/tools/genUnicodePropertyData.pl	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,833 @@
     1.4 +#!/usr/bin/env perl
     1.5 +
     1.6 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.7 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.9 +
    1.10 +# This tool is used to prepare lookup tables of Unicode character properties
    1.11 +# needed by gfx code to support text shaping operations. The properties are
    1.12 +# read from the Unicode Character Database and compiled into multi-level arrays
    1.13 +# for efficient lookup.
    1.14 +#
    1.15 +# To regenerate the tables in nsUnicodePropertyData.cpp:
    1.16 +#
    1.17 +# (1) Download the current Unicode data files from
    1.18 +#
    1.19 +#         http://www.unicode.org/Public/UNIDATA/
    1.20 +#
    1.21 +#     NB: not all the files are actually needed; currently, we require
    1.22 +#       - UnicodeData.txt
    1.23 +#       - Scripts.txt
    1.24 +#       - EastAsianWidth.txt
    1.25 +#       - BidiMirroring.txt
    1.26 +#       - HangulSyllableType.txt
    1.27 +#       - ReadMe.txt (to record version/date of the UCD)
    1.28 +#       - Unihan_Variants.txt (from Unihan.zip)
    1.29 +#     though this may change if we find a need for additional properties.
    1.30 +#
    1.31 +#     The Unicode data files listed above should be together in one directory.
    1.32 +#     We also require the file 
    1.33 +#        http://www.unicode.org/Public/security/latest/xidmodifications.txt
    1.34 +#     This file should be in a sub-directory "security" immediately below the
    1.35 +#        directory containing the other Unicode data files.
    1.36 +#
    1.37 +# (2) Run this tool using a command line of the form
    1.38 +#
    1.39 +#         perl genUnicodePropertyData.pl \
    1.40 +#                 /path/to/harfbuzz/src  \
    1.41 +#                 /path/to/UCD-directory
    1.42 +#
    1.43 +#     This will generate (or overwrite!) the files
    1.44 +#
    1.45 +#         nsUnicodePropertyData.cpp
    1.46 +#         nsUnicodeScriptCodes.h
    1.47 +#
    1.48 +#     in the current directory.
    1.49 +
    1.50 +use strict;
    1.51 +use List::Util qw(first);
    1.52 +
    1.53 +if ($#ARGV != 1) {
    1.54 +    print <<__EOT;
    1.55 +# Run this tool using a command line of the form
    1.56 +#
    1.57 +#     perl genUnicodePropertyData.pl \
    1.58 +#             /path/to/harfbuzz/src  \
    1.59 +#             /path/to/UCD-directory
    1.60 +#
    1.61 +# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
    1.62 +# and UCD-directory is a directory containing the current Unicode Character
    1.63 +# Database files (UnicodeData.txt, etc), available from
    1.64 +# http://www.unicode.org/Public/UNIDATA/
    1.65 +#
    1.66 +# This will generate (or overwrite!) the files
    1.67 +#
    1.68 +#     nsUnicodePropertyData.cpp
    1.69 +#     nsUnicodeScriptCodes.h
    1.70 +#
    1.71 +# in the current directory.
    1.72 +__EOT
    1.73 +    exit 0;
    1.74 +}
    1.75 +
    1.76 +# load HB_Script and HB_Category constants
    1.77 +
    1.78 +# NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
    1.79 +# script codes as used by Glib/Pango/etc.
    1.80 +# We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
    1.81 +# compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.
    1.82 +
    1.83 +# CHECK that this matches Pango source (as found for example at 
    1.84 +# http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
    1.85 +# for as many codes as that defines (currently up through Unicode 5.1)
    1.86 +# and the GLib enumeration
    1.87 +# http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
    1.88 +# (currently defined up through Unicode 6.0).
    1.89 +# Constants beyond these may be regarded as unstable for now, but we don't actually
    1.90 +# depend on the specific values.
    1.91 +my %scriptCode = (
    1.92 +  INVALID => -1,
    1.93 +  COMMON => 0,
    1.94 +  INHERITED => 1,
    1.95 +  ARABIC => 2,
    1.96 +  ARMENIAN => 3,
    1.97 +  BENGALI => 4,
    1.98 +  BOPOMOFO => 5,
    1.99 +  CHEROKEE => 6,
   1.100 +  COPTIC => 7,
   1.101 +  CYRILLIC => 8,
   1.102 +  DESERET => 9,
   1.103 +  DEVANAGARI => 10,
   1.104 +  ETHIOPIC => 11,
   1.105 +  GEORGIAN => 12,
   1.106 +  GOTHIC => 13,
   1.107 +  GREEK => 14,
   1.108 +  GUJARATI => 15,
   1.109 +  GURMUKHI => 16,
   1.110 +  HAN => 17,
   1.111 +  HANGUL => 18,
   1.112 +  HEBREW => 19,
   1.113 +  HIRAGANA => 20,
   1.114 +  KANNADA => 21,
   1.115 +  KATAKANA => 22,
   1.116 +  KHMER => 23,
   1.117 +  LAO => 24,
   1.118 +  LATIN => 25,
   1.119 +  MALAYALAM => 26,
   1.120 +  MONGOLIAN => 27,
   1.121 +  MYANMAR => 28,
   1.122 +  OGHAM => 29,
   1.123 +  OLD_ITALIC => 30,
   1.124 +  ORIYA => 31,
   1.125 +  RUNIC => 32,
   1.126 +  SINHALA => 33,
   1.127 +  SYRIAC => 34,
   1.128 +  TAMIL => 35,
   1.129 +  TELUGU => 36,
   1.130 +  THAANA => 37,
   1.131 +  THAI => 38,
   1.132 +  TIBETAN => 39,
   1.133 +  CANADIAN_ABORIGINAL => 40,
   1.134 +  YI => 41,
   1.135 +  TAGALOG => 42,
   1.136 +  HANUNOO => 43,
   1.137 +  BUHID => 44,
   1.138 +  TAGBANWA => 45,
   1.139 +# unicode 4.0 additions
   1.140 +  BRAILLE => 46,
   1.141 +  CYPRIOT => 47,
   1.142 +  LIMBU => 48,
   1.143 +  OSMANYA => 49,
   1.144 +  SHAVIAN => 50,
   1.145 +  LINEAR_B => 51,
   1.146 +  TAI_LE => 52,
   1.147 +  UGARITIC => 53,
   1.148 +# unicode 4.1 additions
   1.149 +  NEW_TAI_LUE => 54,
   1.150 +  BUGINESE => 55,
   1.151 +  GLAGOLITIC => 56,
   1.152 +  TIFINAGH => 57,
   1.153 +  SYLOTI_NAGRI => 58,
   1.154 +  OLD_PERSIAN => 59,
   1.155 +  KHAROSHTHI => 60,
   1.156 +# unicode 5.0 additions
   1.157 +  UNKNOWN => 61,
   1.158 +  BALINESE => 62,
   1.159 +  CUNEIFORM => 63,
   1.160 +  PHOENICIAN => 64,
   1.161 +  PHAGS_PA => 65,
   1.162 +  NKO => 66,
   1.163 +# unicode 5.1 additions
   1.164 +  KAYAH_LI => 67,
   1.165 +  LEPCHA => 68,
   1.166 +  REJANG => 69,
   1.167 +  SUNDANESE => 70,
   1.168 +  SAURASHTRA => 71,
   1.169 +  CHAM => 72,
   1.170 +  OL_CHIKI => 73,
   1.171 +  VAI => 74,
   1.172 +  CARIAN => 75,
   1.173 +  LYCIAN => 76,
   1.174 +  LYDIAN => 77,
   1.175 +# unicode 5.2 additions
   1.176 +  AVESTAN => 78,
   1.177 +  BAMUM => 79,
   1.178 +  EGYPTIAN_HIEROGLYPHS => 80,
   1.179 +  IMPERIAL_ARAMAIC => 81,
   1.180 +  INSCRIPTIONAL_PAHLAVI => 82,
   1.181 +  INSCRIPTIONAL_PARTHIAN => 83,
   1.182 +  JAVANESE => 84,
   1.183 +  KAITHI => 85,
   1.184 +  LISU => 86,
   1.185 +  MEETEI_MAYEK => 87,
   1.186 +  OLD_SOUTH_ARABIAN => 88,
   1.187 +  OLD_TURKIC => 89,
   1.188 +  SAMARITAN => 90,
   1.189 +  TAI_THAM => 91,
   1.190 +  TAI_VIET => 92,
   1.191 +# unicode 6.0 additions
   1.192 +  BATAK => 93,
   1.193 +  BRAHMI => 94,
   1.194 +  MANDAIC => 95,
   1.195 +# unicode 6.1 additions
   1.196 +  CHAKMA => 96,
   1.197 +  MEROITIC_CURSIVE => 97,
   1.198 +  MEROITIC_HIEROGLYPHS => 98,
   1.199 +  MIAO => 99,
   1.200 +  SHARADA => 100,
   1.201 +  SORA_SOMPENG => 101,
   1.202 +  TAKRI => 102
   1.203 +);
   1.204 +
   1.205 +my $sc = -1;
   1.206 +my $cc = -1;
   1.207 +my %catCode;
   1.208 +my @scriptCodeToTag;
   1.209 +my @scriptCodeToName;
   1.210 +
   1.211 +sub readHarfBuzzHeader
   1.212 +{
   1.213 +    my $file = shift;
   1.214 +    open FH, "< $ARGV[0]/$file" or die "can't open harfbuzz header $ARGV[0]/$file\n";
   1.215 +    while (<FH>) {
   1.216 +        if (m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
   1.217 +            unless (exists $scriptCode{$1}) {
   1.218 +                warn "unknown script name $1 found in $file\n";
   1.219 +                next;
   1.220 +            }
   1.221 +            $sc = $scriptCode{$1};
   1.222 +            $scriptCodeToTag[$sc] = $2;
   1.223 +            $scriptCodeToName[$sc] = $1;
   1.224 +        }
   1.225 +        if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
   1.226 +            $cc++;
   1.227 +            $catCode{$1} = $cc;
   1.228 +        }
   1.229 +    }
   1.230 +    close FH;
   1.231 +}
   1.232 +
   1.233 +&readHarfBuzzHeader("hb-common.h");
   1.234 +&readHarfBuzzHeader("hb-unicode.h");
   1.235 +
   1.236 +die "didn't find HarfBuzz script codes\n" if $sc == -1;
   1.237 +die "didn't find HarfBuzz category codes\n" if $cc == -1;
   1.238 +
   1.239 +my %xidmodCode = (
   1.240 +'inclusion'         => 0,
   1.241 +'recommended'       => 1,
   1.242 +'default-ignorable' => 2,
   1.243 +'historic'          => 3,
   1.244 +'limited-use'       => 4,
   1.245 +'not-NFKC'          => 5,
   1.246 +'not-xid'           => 6,
   1.247 +'obsolete'          => 7,
   1.248 +'technical'         => 8,
   1.249 +'not-chars'         => 9
   1.250 +);
   1.251 +
   1.252 +my %bidicategoryCode = (
   1.253 +  "L"   =>  "0", # Left-to-Right
   1.254 +  "R"   =>  "1", # Right-to-Left
   1.255 +  "EN"  =>  "2", # European Number
   1.256 +  "ES"  =>  "3", # European Number Separator
   1.257 +  "ET"  =>  "4", # European Number Terminator
   1.258 +  "AN"  =>  "5", # Arabic Number
   1.259 +  "CS"  =>  "6", # Common Number Separator
   1.260 +  "B"   =>  "7", # Paragraph Separator
   1.261 +  "S"   =>  "8", # Segment Separator
   1.262 +  "WS"  =>  "9", # Whitespace
   1.263 +  "ON"  => "10", # Other Neutrals
   1.264 +  "LRE" => "11", # Left-to-Right Embedding
   1.265 +  "LRO" => "12", # Left-to-Right Override
   1.266 +  "AL"  => "13", # Right-to-Left Arabic
   1.267 +  "RLE" => "14", # Right-to-Left Embedding
   1.268 +  "RLO" => "15", # Right-to-Left Override
   1.269 +  "PDF" => "16", # Pop Directional Format
   1.270 +  "NSM" => "17", # Non-Spacing Mark
   1.271 +  "BN"  => "18"  # Boundary Neutral
   1.272 +);
   1.273 +
   1.274 +# initialize default properties
   1.275 +my @script;
   1.276 +my @category;
   1.277 +my @combining;
   1.278 +my @eaw;
   1.279 +my @mirror;
   1.280 +my @hangul;
   1.281 +my @casemap;
   1.282 +my @xidmod;
   1.283 +my @numericvalue;
   1.284 +my @hanVariant;
   1.285 +my @bidicategory;
   1.286 +my @fullWidth;
   1.287 +for (my $i = 0; $i < 0x110000; ++$i) {
   1.288 +    $script[$i] = $scriptCode{"UNKNOWN"};
   1.289 +    $category[$i] = $catCode{"UNASSIGNED"};
   1.290 +    $combining[$i] = 0;
   1.291 +    $casemap[$i] = 0;
   1.292 +    $xidmod[$i] = $xidmodCode{"not-chars"};
   1.293 +    $numericvalue[$i] = -1;
   1.294 +    $hanVariant[$i] = 0;
   1.295 +    $bidicategory[$i] = $bidicategoryCode{"L"};
   1.296 +    $fullWidth[$i] = 0;
   1.297 +}
   1.298 +
   1.299 +# blocks where the default for bidi category is not L
   1.300 +for my $i (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF) {
   1.301 +  $bidicategory[$i] = $bidicategoryCode{"AL"};
   1.302 +}
   1.303 +for my $i (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF) {
   1.304 +  $bidicategory[$i] = $bidicategoryCode{"R"};
   1.305 +}
   1.306 +for my $i (0x20A0..0x20CF) {
   1.307 +  $bidicategory[$i] = $bidicategoryCode{"ET"};
   1.308 +}
   1.309 +
   1.310 +my %ucd2hb = (
   1.311 +'Cc' => 'CONTROL',
   1.312 +'Cf' => 'FORMAT',
   1.313 +'Cn' => 'UNASSIGNED',
   1.314 +'Co' => 'PRIVATE_USE',
   1.315 +'Cs' => 'SURROGATE',
   1.316 +'Ll' => 'LOWERCASE_LETTER',
   1.317 +'Lm' => 'MODIFIER_LETTER',
   1.318 +'Lo' => 'OTHER_LETTER',
   1.319 +'Lt' => 'TITLECASE_LETTER',
   1.320 +'Lu' => 'UPPERCASE_LETTER',
   1.321 +'Mc' => 'SPACING_MARK',
   1.322 +'Me' => 'ENCLOSING_MARK',
   1.323 +'Mn' => 'NON_SPACING_MARK',
   1.324 +'Nd' => 'DECIMAL_NUMBER',
   1.325 +'Nl' => 'LETTER_NUMBER',
   1.326 +'No' => 'OTHER_NUMBER',
   1.327 +'Pc' => 'CONNECT_PUNCTUATION',
   1.328 +'Pd' => 'DASH_PUNCTUATION',
   1.329 +'Pe' => 'CLOSE_PUNCTUATION',
   1.330 +'Pf' => 'FINAL_PUNCTUATION',
   1.331 +'Pi' => 'INITIAL_PUNCTUATION',
   1.332 +'Po' => 'OTHER_PUNCTUATION',
   1.333 +'Ps' => 'OPEN_PUNCTUATION',
   1.334 +'Sc' => 'CURRENCY_SYMBOL',
   1.335 +'Sk' => 'MODIFIER_SYMBOL',
   1.336 +'Sm' => 'MATH_SYMBOL',
   1.337 +'So' => 'OTHER_SYMBOL',
   1.338 +'Zl' => 'LINE_SEPARATOR',
   1.339 +'Zp' => 'PARAGRAPH_SEPARATOR',
   1.340 +'Zs' => 'SPACE_SEPARATOR'
   1.341 +);
   1.342 +
   1.343 +# read ReadMe.txt
   1.344 +my @versionInfo;
   1.345 +open FH, "< $ARGV[1]/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
   1.346 +while (<FH>) {
   1.347 +    chomp;
   1.348 +    push @versionInfo, $_;
   1.349 +}
   1.350 +close FH;
   1.351 +
   1.352 +my $kTitleToUpper = 0x80000000;
   1.353 +my $kUpperToLower = 0x40000000;
   1.354 +my $kLowerToTitle = 0x20000000;
   1.355 +my $kLowerToUpper = 0x10000000;
   1.356 +my $kCaseMapCharMask = 0x001fffff;
   1.357 +
   1.358 +# read UnicodeData.txt
   1.359 +open FH, "< $ARGV[1]/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
   1.360 +while (<FH>) {
   1.361 +    chomp;
   1.362 +    my @fields = split /;/;
   1.363 +    if ($fields[1] =~ /First/) {
   1.364 +        my $first = hex "0x$fields[0]";
   1.365 +        $_ = <FH>;
   1.366 +        @fields = split /;/;
   1.367 +        if ($fields[1] =~ /Last/) {
   1.368 +            my $last = hex "0x$fields[0]";
   1.369 +            do {
   1.370 +                $category[$first] = $catCode{$ucd2hb{$fields[2]}};
   1.371 +                $combining[$first] = $fields[3];
   1.372 +                $bidicategory[$first] = $bidicategoryCode{$fields[4]};
   1.373 +                unless (length($fields[7]) == 0) {
   1.374 +                  $numericvalue[$first] = $fields[7];
   1.375 +                }
   1.376 +                if ($fields[1] =~ /CJK/) {
   1.377 +                  @hanVariant[$first] = 3;
   1.378 +                }
   1.379 +                $first++;
   1.380 +            } while ($first <= $last);
   1.381 +        } else {
   1.382 +            die "didn't find Last code for range!\n";
   1.383 +        }
   1.384 +    } else {
   1.385 +        my $usv = hex "0x$fields[0]";
   1.386 +        $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
   1.387 +        $combining[$usv] = $fields[3];
   1.388 +        my $upper = hex $fields[12];
   1.389 +        my $lower = hex $fields[13];
   1.390 +        my $title = hex $fields[14];
   1.391 +        # we only store one mapping for each character,
   1.392 +        # but also record what kind of mapping it is
   1.393 +        if ($upper && $lower) {
   1.394 +            $casemap[$usv] |= $kTitleToUpper;
   1.395 +            $casemap[$usv] |= ($usv ^ $upper);
   1.396 +        }
   1.397 +        elsif ($lower) {
   1.398 +            $casemap[$usv] |= $kUpperToLower;
   1.399 +            $casemap[$usv] |= ($usv ^ $lower);
   1.400 +        }
   1.401 +        elsif ($title && ($title != $upper)) {
   1.402 +            $casemap[$usv] |= $kLowerToTitle;
   1.403 +            $casemap[$usv] |= ($usv ^ $title);
   1.404 +        }
   1.405 +        elsif ($upper) {
   1.406 +            $casemap[$usv] |= $kLowerToUpper;
   1.407 +            $casemap[$usv] |= ($usv ^ $upper);
   1.408 +        }
   1.409 +        $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
   1.410 +        unless (length($fields[7]) == 0) {
   1.411 +          $numericvalue[$usv] = $fields[7];
   1.412 +        }
   1.413 +        if ($fields[1] =~ /CJK/) {
   1.414 +          @hanVariant[$usv] = 3;
   1.415 +        }
   1.416 +        if ($fields[5] =~ /^<narrow>/) {
   1.417 +          my $wideChar = hex(substr($fields[5], 9));
   1.418 +          die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
   1.419 +          $fullWidth[$usv] = $wideChar;
   1.420 +        }
   1.421 +        elsif ($fields[5] =~ /^<wide>/) {
   1.422 +          my $narrowChar = hex(substr($fields[5], 7));
   1.423 +          die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
   1.424 +          $fullWidth[$narrowChar] = $usv;
   1.425 +        }
   1.426 +    }
   1.427 +}
   1.428 +close FH;
   1.429 +
   1.430 +# read Scripts.txt
   1.431 +open FH, "< $ARGV[1]/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
   1.432 +push @versionInfo, "";
   1.433 +while (<FH>) {
   1.434 +    chomp;
   1.435 +    push @versionInfo, $_;
   1.436 +    last if /Date:/;
   1.437 +}
   1.438 +while (<FH>) {
   1.439 +    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
   1.440 +        my $script = uc($3);
   1.441 +        warn "unknown script $script" unless exists $scriptCode{$script};
   1.442 +        $script = $scriptCode{$script};
   1.443 +        my $start = hex "0x$1";
   1.444 +        my $end = (defined $2) ? hex "0x$2" : $start;
   1.445 +        for (my $i = $start; $i <= $end; ++$i) {
   1.446 +            $script[$i] = $script;
   1.447 +        }
   1.448 +    }
   1.449 +}
   1.450 +close FH;
   1.451 +
   1.452 +# read EastAsianWidth.txt
   1.453 +my %eawCode = (
   1.454 +  'A' => 0, #         ; Ambiguous
   1.455 +  'F' => 1, #         ; Fullwidth
   1.456 +  'H' => 2, #         ; Halfwidth
   1.457 +  'N' => 3, #         ; Neutral
   1.458 +  'NA'=> 4, #         ; Narrow
   1.459 +  'W' => 5  #         ; Wide 
   1.460 +);
   1.461 +open FH, "< $ARGV[1]/EastAsianWidth.txt" or die "can't open UCD file EastAsianWidth.txt\n";
   1.462 +push @versionInfo, "";
   1.463 +while (<FH>) {
   1.464 +    chomp;
   1.465 +    push @versionInfo, $_;
   1.466 +    last if /Date:/;
   1.467 +}
   1.468 +while (<FH>) {
   1.469 +    s/#.*//;
   1.470 +    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
   1.471 +        my $eaw = uc($3);
   1.472 +        warn "unknown EAW code $eaw" unless exists $eawCode{$eaw};
   1.473 +        $eaw = $eawCode{$eaw};
   1.474 +        my $start = hex "0x$1";
   1.475 +        my $end = (defined $2) ? hex "0x$2" : $start;
   1.476 +        for (my $i = $start; $i <= $end; ++$i) {
   1.477 +            $eaw[$i] = $eaw;
   1.478 +        }
   1.479 +    }
   1.480 +}
   1.481 +close FH;
   1.482 +
   1.483 +# read BidiMirroring.txt
   1.484 +my @offsets = ();
   1.485 +push @offsets, 0;
   1.486 +
   1.487 +open FH, "< $ARGV[1]/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
   1.488 +push @versionInfo, "";
   1.489 +while (<FH>) {
   1.490 +    chomp;
   1.491 +    push @versionInfo, $_;
   1.492 +    last if /Date:/;
   1.493 +}
   1.494 +while (<FH>) {
   1.495 +    s/#.*//;
   1.496 +    if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
   1.497 +        my $mirrorOffset = hex("0x$2") - hex("0x$1");
   1.498 +	my $offsetIndex = first { $offsets[$_] eq $mirrorOffset } 0..$#offsets;
   1.499 +	if ($offsetIndex == undef) {
   1.500 +            die "too many offset codes\n" if scalar @offsets == 31;
   1.501 +            push @offsets, $mirrorOffset;
   1.502 +	    $offsetIndex = $#offsets;
   1.503 +        }
   1.504 +	$mirror[hex "0x$1"] = $offsetIndex;
   1.505 +    }
   1.506 +}
   1.507 +close FH;
   1.508 +
   1.509 +# read HangulSyllableType.txt
   1.510 +my %hangulType = (
   1.511 +  'L'   => 0x01,
   1.512 +  'V'   => 0x02,
   1.513 +  'T'   => 0x04,
   1.514 +  'LV'  => 0x03,
   1.515 +  'LVT' => 0x07
   1.516 +);
   1.517 +open FH, "< $ARGV[1]/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
   1.518 +push @versionInfo, "";
   1.519 +while (<FH>) {
   1.520 +    chomp;
   1.521 +    push @versionInfo, $_;
   1.522 +    last if /Date:/;
   1.523 +}
   1.524 +while (<FH>) {
   1.525 +    s/#.*//;
   1.526 +    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
   1.527 +        my $hangul = uc($3);
   1.528 +        warn "unknown Hangul syllable type" unless exists $hangulType{$hangul};
   1.529 +        $hangul = $hangulType{$hangul};
   1.530 +        my $start = hex "0x$1";
   1.531 +        my $end = (defined $2) ? hex "0x$2" : $start;
   1.532 +        for (my $i = $start; $i <= $end; ++$i) {
   1.533 +            $hangul[$i] = $hangul;
   1.534 +        }
   1.535 +    }
   1.536 +}
   1.537 +close FH;
   1.538 +
   1.539 +# read xidmodifications.txt
   1.540 +open FH, "< $ARGV[1]/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
   1.541 +push @versionInfo, "";
   1.542 +while (<FH>) {
   1.543 +  chomp;
   1.544 +  unless (/\xef\xbb\xbf/) {
   1.545 +    push @versionInfo, $_;
   1.546 +  }
   1.547 +  last if /Generated:/;
   1.548 +}
   1.549 +while (<FH>) {
   1.550 +  if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
   1.551 +    my $xidmod = $3;
   1.552 +    warn "unknown Identifier Modification $xidmod" unless exists $xidmodCode{$xidmod};
   1.553 +    $xidmod = $xidmodCode{$xidmod};
   1.554 +    my $start = hex "0x$1";
   1.555 +    my $end = (defined $2) ? hex "0x$2" : $start;
   1.556 +    for (my $i = $start; $i <= $end; ++$i) {
   1.557 +      $xidmod[$i] = $xidmod;
   1.558 +    }
   1.559 +  }
   1.560 +}
   1.561 +close FH;
   1.562 +# special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
   1.563 +$xidmod[0x30FB] = 1;
   1.564 +
   1.565 +open FH, "< $ARGV[1]/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
   1.566 +push @versionInfo, "";
   1.567 +while (<FH>) {
   1.568 +  chomp;
   1.569 +  push @versionInfo, $_;
   1.570 +  last if /Date:/;
   1.571 +}
   1.572 +my $savedusv = 0;
   1.573 +my $hasTC = 0;
   1.574 +my $hasSC = 0;
   1.575 +while (<FH>) {
   1.576 +  chomp;
   1.577 +  if (m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
   1.578 +    my $usv = hex "0x$1";
   1.579 +    if ($usv != $savedusv) {
   1.580 +      unless ($savedusv == 0) {
   1.581 +        if ($hasTC && !$hasSC) {
   1.582 +          $hanVariant[$savedusv] = 1;
   1.583 +        } elsif (!$hasTC && $hasSC) {
   1.584 +          $hanVariant[$savedusv] = 2;
   1.585 +        }
   1.586 +      }
   1.587 +      $savedusv = $usv;
   1.588 +      $hasTC = 0;
   1.589 +      $hasSC = 0;
   1.590 +    }
   1.591 +    if ($2 eq "Traditional") {
   1.592 +      $hasTC = 1;
   1.593 +    }
   1.594 +    if ($2 eq "Simplified") {
   1.595 +      $hasSC = 1;
   1.596 +    }
   1.597 +  } 
   1.598 +}
   1.599 +close FH;
   1.600 +
   1.601 +my $timestamp = gmtime();
   1.602 +
   1.603 +open DATA_TABLES, "> nsUnicodePropertyData.cpp" or die "unable to open nsUnicodePropertyData.cpp for output";
   1.604 +
   1.605 +my $licenseBlock = q[
   1.606 +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   1.607 +/* This Source Code Form is subject to the terms of the Mozilla Public
   1.608 + * License, v. 2.0. If a copy of the MPL was not distributed with this
   1.609 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   1.610 +
   1.611 +/*
   1.612 + * Derived from the Unicode Character Database by genUnicodePropertyData.pl
   1.613 + *
   1.614 + * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
   1.615 + */
   1.616 +];
   1.617 +
   1.618 +my $versionInfo = join("\n", @versionInfo);
   1.619 +
   1.620 +print DATA_TABLES <<__END;
   1.621 +$licenseBlock
   1.622 +/*
   1.623 + * Created on $timestamp from UCD data files with version info:
   1.624 + *
   1.625 +
   1.626 +$versionInfo
   1.627 +
   1.628 + *
   1.629 + * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
   1.630 + */
   1.631 +
   1.632 +#include <stdint.h>
   1.633 +#include "harfbuzz/hb.h"
   1.634 +
   1.635 +__END
   1.636 +
   1.637 +open HEADER, "> nsUnicodeScriptCodes.h" or die "unable to open nsUnicodeScriptCodes.h for output";
   1.638 +
   1.639 +print HEADER <<__END;
   1.640 +$licenseBlock
   1.641 +/*
   1.642 + * Created on $timestamp from UCD data files with version info:
   1.643 + *
   1.644 +
   1.645 +$versionInfo
   1.646 +
   1.647 + *
   1.648 + * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
   1.649 + */
   1.650 +
   1.651 +#ifndef NS_UNICODE_SCRIPT_CODES
   1.652 +#define NS_UNICODE_SCRIPT_CODES
   1.653 +
   1.654 +__END
   1.655 +
   1.656 +print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
   1.657 +for (my $i = 0; $i < scalar @scriptCodeToTag; ++$i) {
   1.658 +  printf DATA_TABLES "  HB_TAG(%s)", $scriptCodeToTag[$i];
   1.659 +  print DATA_TABLES $i < $#scriptCodeToTag ? ",\n" : "\n";
   1.660 +}
   1.661 +print DATA_TABLES "};\n\n";
   1.662 +
   1.663 +our $totalData = 0;
   1.664 +
   1.665 +print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
   1.666 +for (my $i = 0; $i < scalar @offsets; ++$i) {
   1.667 +    printf DATA_TABLES "  $offsets[$i]";
   1.668 +    print DATA_TABLES $i < $#offsets ? ",\n" : "\n";
   1.669 +}
   1.670 +print DATA_TABLES "};\n\n";
   1.671 +
   1.672 +print HEADER "#pragma pack(1)\n\n";
   1.673 +
   1.674 +sub sprintCharProps1
   1.675 +{
   1.676 +  my $usv = shift;
   1.677 +  return sprintf("{%d,%d,%d}, ", $mirror[$usv], $hangul[$usv], $combining[$usv]);
   1.678 +}
   1.679 +&genTables("CharProp1", "struct nsCharProps1 {\n  unsigned char mMirrorOffsetIndex:5;\n  unsigned char mHangulType:3;\n  unsigned char mCombiningClass:8;\n};",
   1.680 +           "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
   1.681 +
   1.682 +sub sprintCharProps2
   1.683 +{
   1.684 +  my $usv = shift;
   1.685 +  return sprintf("{%d,%d,%d,%d,%d,%d},",
   1.686 +                 $script[$usv], $eaw[$usv], $category[$usv],
   1.687 +                 $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv]);
   1.688 +}
   1.689 +&genTables("CharProp2", "struct nsCharProps2 {\n  unsigned char mScriptCode:8;\n  unsigned char mEAW:3;\n  unsigned char mCategory:5;\n  unsigned char mBidiCategory:5;\n  unsigned char mXidmod:4;\n  signed char mNumericValue:5;\n  unsigned char mHanVariant:2;\n};",
   1.690 +           "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);
   1.691 +
   1.692 +print HEADER "#pragma pack()\n\n";
   1.693 +
   1.694 +sub sprintHanVariants
   1.695 +{
   1.696 +  my $baseUsv = shift;
   1.697 +  my $varShift = 0;
   1.698 +  my $val = 0;
   1.699 +  while ($varShift < 8) {
   1.700 +    $val |= $hanVariant[$baseUsv++] << $varShift;
   1.701 +    $varShift += 2;
   1.702 +  }
   1.703 +  return sprintf("0x%02x,", $val);
   1.704 +}
   1.705 +&genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
   1.706 +
   1.707 +sub sprintFullWidth
   1.708 +{
   1.709 +  my $usv = shift;
   1.710 +  return sprintf("0x%04x,", $fullWidth[$usv]);
   1.711 +}
   1.712 +&genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
   1.713 +
   1.714 +sub sprintCasemap
   1.715 +{
   1.716 +  my $usv = shift;
   1.717 +  return sprintf("0x%08x,", $casemap[$usv]);
   1.718 +}
   1.719 +&genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);
   1.720 +
   1.721 +print STDERR "Total data = $totalData\n";
   1.722 +
   1.723 +printf DATA_TABLES "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
   1.724 +printf DATA_TABLES "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
   1.725 +printf DATA_TABLES "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
   1.726 +printf DATA_TABLES "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
   1.727 +printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
   1.728 +
   1.729 +sub genTables
   1.730 +{
   1.731 +  my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
   1.732 +
   1.733 +  print DATA_TABLES "#define k${prefix}MaxPlane  $maxPlane\n";
   1.734 +  print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
   1.735 +  print DATA_TABLES "#define k${prefix}CharBits  $charBits\n";
   1.736 +
   1.737 +  my $indexLen = 1 << $indexBits;
   1.738 +  my $charsPerPage = 1 << $charBits;
   1.739 +  my %charIndex = ();
   1.740 +  my %pageMapIndex = ();
   1.741 +  my @pageMap = ();
   1.742 +  my @char = ();
   1.743 +  
   1.744 +  my $planeMap = "\x00" x $maxPlane;
   1.745 +  foreach my $plane (0 .. $maxPlane) {
   1.746 +    my $pageMap = "\x00" x $indexLen * 2;
   1.747 +    foreach my $page (0 .. $indexLen - 1) {
   1.748 +        my $charValues = "";
   1.749 +        for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
   1.750 +            my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
   1.751 +            $charValues .= &$func($usv);
   1.752 +        }
   1.753 +        chop $charValues;
   1.754 +
   1.755 +        unless (exists $charIndex{$charValues}) {
   1.756 +            $charIndex{$charValues} = scalar keys %charIndex;
   1.757 +            $char[$charIndex{$charValues}] = $charValues;
   1.758 +        }
   1.759 +        substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
   1.760 +    }
   1.761 +    
   1.762 +    unless (exists $pageMapIndex{$pageMap}) {
   1.763 +        $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
   1.764 +        $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
   1.765 +    }
   1.766 +    if ($plane > 0) {
   1.767 +        substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
   1.768 +    }
   1.769 +  }
   1.770 +
   1.771 +  if ($maxPlane) {
   1.772 +    print DATA_TABLES "static const uint8_t s${prefix}Planes[$maxPlane] = {";
   1.773 +    print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
   1.774 +    print DATA_TABLES "};\n\n";
   1.775 +  }
   1.776 +
   1.777 +  my $chCount = scalar @char;
   1.778 +  my $pmBits = $chCount > 255 ? 16 : 8;
   1.779 +  my $pmCount = scalar @pageMap;
   1.780 +  if ($maxPlane == 0) {
   1.781 +    die "there should only be one pageMap entry!" if $pmCount > 1;
   1.782 +    print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
   1.783 +  } else {
   1.784 +    print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
   1.785 +  }
   1.786 +  for (my $i = 0; $i < scalar @pageMap; ++$i) {
   1.787 +    print DATA_TABLES $maxPlane > 0 ? "  {" : "  ";
   1.788 +    print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
   1.789 +    print DATA_TABLES $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
   1.790 +  }
   1.791 +  print DATA_TABLES "};\n\n";
   1.792 +
   1.793 +  print HEADER "$typedef\n\n" if $typedef ne '';
   1.794 +
   1.795 +  my $pageLen = $charsPerPage / $charsPerEntry;
   1.796 +  print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
   1.797 +  for (my $i = 0; $i < scalar @char; ++$i) {
   1.798 +    print DATA_TABLES "  {";
   1.799 +    print DATA_TABLES $char[$i];
   1.800 +    print DATA_TABLES $i < $#char ? "},\n" : "}\n";
   1.801 +  }
   1.802 +  print DATA_TABLES "};\n\n";
   1.803 +
   1.804 +  my $dataSize = $pmCount * $indexLen * $pmBits/8 +
   1.805 +                 $chCount * $pageLen * $bytesPerEntry + 
   1.806 +                 $maxPlane;
   1.807 +  $totalData += $dataSize;
   1.808 +
   1.809 +  print STDERR "Data for $prefix = $dataSize\n";
   1.810 +}
   1.811 +
   1.812 +print DATA_TABLES <<__END;
   1.813 +/*
   1.814 + * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
   1.815 + */
   1.816 +__END
   1.817 +
   1.818 +close DATA_TABLES;
   1.819 +
   1.820 +print HEADER "enum {\n";
   1.821 +for (my $i = 0; $i < scalar @scriptCodeToName; ++$i) {
   1.822 +  print HEADER "  MOZ_SCRIPT_", $scriptCodeToName[$i], " = ", $i, ",\n";
   1.823 +}
   1.824 +print HEADER "\n  MOZ_NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
   1.825 +print HEADER "\n  MOZ_SCRIPT_INVALID = -1\n";
   1.826 +print HEADER "};\n\n";
   1.827 +
   1.828 +print HEADER <<__END;
   1.829 +#endif
   1.830 +/*
   1.831 + * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
   1.832 + */
   1.833 +__END
   1.834 +
   1.835 +close HEADER;
   1.836 +

mercurial