#!/usr/bin/env perl

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# This tool is used to prepare lookup tables of Unicode character properties
# needed by gfx code to support text shaping operations. The properties are
# read from the Unicode Character Database and compiled into multi-level arrays
# for efficient lookup.
#
# To regenerate the tables in nsUnicodePropertyData.cpp:
#
# (1) Download the current Unicode data files from
#
#         http://www.unicode.org/Public/UNIDATA/
#
#     NB: not all the files are actually needed; currently, we require
#       - UnicodeData.txt
#       - Scripts.txt
#       - EastAsianWidth.txt
#       - BidiMirroring.txt
#       - HangulSyllableType.txt
#       - ReadMe.txt (to record version/date of the UCD)
#       - Unihan_Variants.txt (from Unihan.zip)
#     though this may change if we find a need for additional properties.
#
#     The Unicode data files listed above should be together in one directory.
#     We also require the file
#         http://www.unicode.org/Public/security/latest/xidmodifications.txt
#     This file should be in a sub-directory "security" immediately below the
#     directory containing the other Unicode data files.
#
# (2) Run this tool using a command line of the form
#
#         perl genUnicodePropertyData.pl \
#                 /path/to/harfbuzz/src  \
#                 /path/to/UCD-directory
#
#     This will generate (or overwrite!) the files
#
#         nsUnicodePropertyData.cpp
#         nsUnicodeScriptCodes.h
#
#     in the current directory.

use strict;
use warnings;
use List::Util qw(first);

# The usage text is a non-interpolating heredoc so that the trailing
# backslashes of the sample command line are printed literally.
if ($#ARGV != 1) {
    print <<'__EOT';
# Run this tool using a command line of the form
#
#     perl genUnicodePropertyData.pl \
#             /path/to/harfbuzz/src  \
#             /path/to/UCD-directory
#
# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/
#
# This will generate (or overwrite!) the files
#
#     nsUnicodePropertyData.cpp
#     nsUnicodeScriptCodes.h
#
# in the current directory.
__EOT
    exit 0;
}

# load HB_Script and HB_Category constants

# NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
# script codes as used by Glib/Pango/etc.
# We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
# compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.

# CHECK that this matches Pango source (as found for example at
# http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
# for as many codes as that defines (currently up through Unicode 5.1)
# and the GLib enumeration
# http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
# (currently defined up through Unicode 6.0).
# Constants beyond these may be regarded as unstable for now, but we don't actually
# depend on the specific values.
my %scriptCode = (
    INVALID => -1,
    COMMON => 0,
    INHERITED => 1,
    ARABIC => 2,
    ARMENIAN => 3,
    BENGALI => 4,
    BOPOMOFO => 5,
    CHEROKEE => 6,
    COPTIC => 7,
    CYRILLIC => 8,
    DESERET => 9,
    DEVANAGARI => 10,
    ETHIOPIC => 11,
    GEORGIAN => 12,
    GOTHIC => 13,
    GREEK => 14,
    GUJARATI => 15,
    GURMUKHI => 16,
    HAN => 17,
    HANGUL => 18,
    HEBREW => 19,
    HIRAGANA => 20,
    KANNADA => 21,
    KATAKANA => 22,
    KHMER => 23,
    LAO => 24,
    LATIN => 25,
    MALAYALAM => 26,
    MONGOLIAN => 27,
    MYANMAR => 28,
    OGHAM => 29,
    OLD_ITALIC => 30,
    ORIYA => 31,
    RUNIC => 32,
    SINHALA => 33,
    SYRIAC => 34,
    TAMIL => 35,
    TELUGU => 36,
    THAANA => 37,
    THAI => 38,
    TIBETAN => 39,
    CANADIAN_ABORIGINAL => 40,
    YI => 41,
    TAGALOG => 42,
    HANUNOO => 43,
    BUHID => 44,
    TAGBANWA => 45,
    # unicode 4.0 additions
    BRAILLE => 46,
    CYPRIOT => 47,
    LIMBU => 48,
    OSMANYA => 49,
    SHAVIAN => 50,
    LINEAR_B => 51,
    TAI_LE => 52,
    UGARITIC => 53,
    # unicode 4.1 additions
    NEW_TAI_LUE => 54,
    BUGINESE => 55,
    GLAGOLITIC => 56,
    TIFINAGH => 57,
    SYLOTI_NAGRI => 58,
    OLD_PERSIAN => 59,
    KHAROSHTHI => 60,
    # unicode 5.0 additions
    UNKNOWN => 61,
    BALINESE => 62,
    CUNEIFORM => 63,
    PHOENICIAN => 64,
    PHAGS_PA => 65,
    NKO => 66,
    # unicode 5.1 additions
    KAYAH_LI => 67,
    LEPCHA => 68,
    REJANG => 69,
    SUNDANESE => 70,
    SAURASHTRA => 71,
    CHAM => 72,
    OL_CHIKI => 73,
    VAI => 74,
    CARIAN => 75,
    LYCIAN => 76,
    LYDIAN => 77,
    # unicode 5.2 additions
    AVESTAN => 78,
    BAMUM => 79,
    EGYPTIAN_HIEROGLYPHS => 80,
    IMPERIAL_ARAMAIC => 81,
    INSCRIPTIONAL_PAHLAVI => 82,
    INSCRIPTIONAL_PARTHIAN => 83,
    JAVANESE => 84,
    KAITHI => 85,
    LISU => 86,
    MEETEI_MAYEK => 87,
    OLD_SOUTH_ARABIAN => 88,
    OLD_TURKIC => 89,
    SAMARITAN => 90,
    TAI_THAM => 91,
    TAI_VIET => 92,
    # unicode 6.0 additions
    BATAK => 93,
    BRAHMI => 94,
    MANDAIC => 95,
    # unicode 6.1 additions
    CHAKMA => 96,
    MEROITIC_CURSIVE => 97,
    MEROITIC_HIEROGLYPHS => 98,
    MIAO => 99,
    SHARADA => 100,
    SORA_SOMPENG => 101,
    TAKRI => 102
);

# $sc holds the most recently seen script code and $cc the most recently
# allocated category code; both stay at -1 if nothing was found.
my $sc = -1;
my $cc = -1;
my %catCode;
my @scriptCodeToTag;
my @scriptCodeToName;

# Open $path for reading and return a lexical filehandle; die with $message
# (used verbatim) if the file cannot be opened.
sub openInput
{
    my ($path, $message) = @_;
    open my $fh, '<', $path or die $message;
    return $fh;
}

# Scan one HarfBuzz header (relative to the directory in $ARGV[0]).
# Each HB_SCRIPT_xxx = HB_TAG(...) line fills @scriptCodeToTag and
# @scriptCodeToName at the index given by %scriptCode; each line mentioning
# HB_UNICODE_GENERAL_CATEGORY_xxx allocates the next sequential category code.
sub readHarfBuzzHeader
{
    my $file = shift;
    my $fh = openInput("$ARGV[0]/$file",
                       "can't open harfbuzz header $ARGV[0]/$file\n");
    while (my $line = <$fh>) {
        if ($line =~ m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
            my ($name, $tag) = ($1, $2);
            unless (exists $scriptCode{$name}) {
                warn "unknown script name $name found in $file\n";
                next;
            }
            $sc = $scriptCode{$name};
            $scriptCodeToTag[$sc] = $tag;
            $scriptCodeToName[$sc] = $name;
        }
        if ($line =~ m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
            $cc++;
            $catCode{$1} = $cc;
        }
    }
    close $fh;
    return;
}

readHarfBuzzHeader("hb-common.h");
readHarfBuzzHeader("hb-unicode.h");

die "didn't find HarfBuzz script codes\n" if $sc == -1;
die "didn't find HarfBuzz category codes\n" if $cc == -1;

my %xidmodCode = (
    'inclusion'         => 0,
    'recommended'       => 1,
    'default-ignorable' => 2,
    'historic'          => 3,
    'limited-use'       => 4,
    'not-NFKC'          => 5,
    'not-xid'           => 6,
    'obsolete'          => 7,
    'technical'         => 8,
    'not-chars'         => 9
);

my %bidicategoryCode = (
    "L"   =>  "0", # Left-to-Right
    "R"   =>  "1", # Right-to-Left
    "EN"  =>  "2", # European Number
    "ES"  =>  "3", # European Number Separator
    "ET"  =>  "4", # European Number Terminator
    "AN"  =>  "5", # Arabic Number
    "CS"  =>  "6", # Common Number Separator
    "B"   =>  "7", # Paragraph Separator
    "S"   =>  "8", # Segment Separator
    "WS"  =>  "9", # Whitespace
    "ON"  => "10", # Other Neutrals
    "LRE" => "11", # Left-to-Right Embedding
    "LRO" => "12", # Left-to-Right Override
    "AL"  => "13", # Right-to-Left Arabic
    "RLE" => "14", # Right-to-Left Embedding
    "RLO" => "15", # Right-to-Left Override
    "PDF" => "16", # Pop Directional Format
    "NSM" => "17", # Non-Spacing Mark
    "BN"  => "18"  # Boundary Neutral
);

# initialize default properties
my @script;
my @category;
my @combining;
my @eaw;
my @mirror;
my @hangul;
my @casemap;
my @xidmod;
my @numericvalue;
my @hanVariant;
my @bidicategory;
my @fullWidth;
for my $i (0 .. 0x10FFFF) {
    $script[$i] = $scriptCode{"UNKNOWN"};
    $category[$i] = $catCode{"UNASSIGNED"};
    $combining[$i] = 0;
    # @eaw, @mirror and @hangul were previously left undefined and relied on
    # sprintf("%d", undef) producing 0; make that default explicit.
    $eaw[$i] = 0;
    $mirror[$i] = 0;
    $hangul[$i] = 0;
    $casemap[$i] = 0;
    $xidmod[$i] = $xidmodCode{"not-chars"};
    $numericvalue[$i] = -1;
    $hanVariant[$i] = 0;
    $bidicategory[$i] = $bidicategoryCode{"L"};
    $fullWidth[$i] = 0;
}

# blocks where the default for bidi category is not L
for my $i (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF) {
    $bidicategory[$i] = $bidicategoryCode{"AL"};
}
for my $i (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF) {
    $bidicategory[$i] = $bidicategoryCode{"R"};
}
for my $i (0x20A0..0x20CF) {
    $bidicategory[$i] = $bidicategoryCode{"ET"};
}

# map UCD General_Category abbreviations to HB_UNICODE_GENERAL_CATEGORY_* names
my %ucd2hb = (
    'Cc' => 'CONTROL',
    'Cf' => 'FORMAT',
    'Cn' => 'UNASSIGNED',
    'Co' => 'PRIVATE_USE',
    'Cs' => 'SURROGATE',
    'Ll' => 'LOWERCASE_LETTER',
    'Lm' => 'MODIFIER_LETTER',
    'Lo' => 'OTHER_LETTER',
    'Lt' => 'TITLECASE_LETTER',
    'Lu' => 'UPPERCASE_LETTER',
    'Mc' => 'SPACING_MARK',
    'Me' => 'ENCLOSING_MARK',
    'Mn' => 'NON_SPACING_MARK',
    'Nd' => 'DECIMAL_NUMBER',
    'Nl' => 'LETTER_NUMBER',
    'No' => 'OTHER_NUMBER',
    'Pc' => 'CONNECT_PUNCTUATION',
    'Pd' => 'DASH_PUNCTUATION',
    'Pe' => 'CLOSE_PUNCTUATION',
    'Pf' => 'FINAL_PUNCTUATION',
    'Pi' => 'INITIAL_PUNCTUATION',
    'Po' => 'OTHER_PUNCTUATION',
    'Ps' => 'OPEN_PUNCTUATION',
    'Sc' => 'CURRENCY_SYMBOL',
    'Sk' => 'MODIFIER_SYMBOL',
    'Sm' => 'MATH_SYMBOL',
    'So' => 'OTHER_SYMBOL',
    'Zl' => 'LINE_SEPARATOR',
    'Zp' => 'PARAGRAPH_SEPARATOR',
    'Zs' => 'SPACE_SEPARATOR'
);

# Lines collected from the data files' headers; they are copied into the
# banner comment of both generated files.
my @versionInfo;

# Append a blank separator to @versionInfo, then copy header lines from $fh
# up to and including the first line matching $stopPattern. Lines matching
# the optional $skipPattern are not recorded (but still tested for the stop).
sub readVersionHeader
{
    my ($fh, $stopPattern, $skipPattern) = @_;
    push @versionInfo, "";
    while (my $line = <$fh>) {
        chomp $line;
        push @versionInfo, $line
            unless defined $skipPattern && $line =~ $skipPattern;
        last if $line =~ $stopPattern;
    }
    return;
}

# Read the remaining "XXXX[..YYYY] ; value" lines of $fh. $pattern must
# capture the first code point, the optional last code point and the value
# name as $1, $2, $3. $toCode converts the value name to the number stored
# in @$values for every code point in the range. Trailing '#' comments are
# removed first when $stripComments is true.
sub readRangeFile
{
    my ($fh, $pattern, $stripComments, $values, $toCode) = @_;
    while (my $line = <$fh>) {
        $line =~ s/#.*// if $stripComments;
        next unless $line =~ $pattern;
        my ($firstHex, $lastHex, $name) = ($1, $2, $3);
        my $code = $toCode->($name);
        my $start = hex $firstHex;
        my $end = defined $lastHex ? hex $lastHex : $start;
        for my $i ($start .. $end) {
            $values->[$i] = $code;
        }
    }
    return;
}

# read ReadMe.txt
{
    my $fh = openInput("$ARGV[1]/ReadMe.txt",
                       "can't open Unicode ReadMe.txt file\n");
    while (my $line = <$fh>) {
        chomp $line;
        push @versionInfo, $line;
    }
    close $fh;
}

# Each casemap entry stores one flag bit saying what kind of mapping it is,
# plus (in the low bits) the XOR of the character with its mapped character.
my $kTitleToUpper = 0x80000000;
my $kUpperToLower = 0x40000000;
my $kLowerToTitle = 0x20000000;
my $kLowerToUpper = 0x10000000;
my $kCaseMapCharMask = 0x001fffff;

# Record the properties shared by single-character and range entries of
# UnicodeData.txt: general category, combining class, bidi category, the
# value of field 7 when it is non-empty, and the "CJK" Han-variant marker.
sub setBasicProps
{
    my ($usv, $fields) = @_;
    $category[$usv] = $catCode{$ucd2hb{$fields->[2]}};
    $combining[$usv] = $fields->[3];
    $bidicategory[$usv] = $bidicategoryCode{$fields->[4]};
    unless (length($fields->[7]) == 0) {
        $numericvalue[$usv] = $fields->[7];
    }
    if ($fields->[1] =~ /CJK/) {
        $hanVariant[$usv] = 3;
    }
    return;
}

# read UnicodeData.txt
{
    my $fh = openInput("$ARGV[1]/UnicodeData.txt",
                       "can't open UCD file UnicodeData.txt\n");
    while (my $line = <$fh>) {
        chomp $line;
        # limit -1 keeps trailing empty fields, so the case-mapping fields
        # (12..14) are empty strings rather than undef
        my @fields = split /;/, $line, -1;
        if ($fields[1] =~ /First/) {
            # a "<..., First>" line must be followed by its "<..., Last>" line
            my $first = hex $fields[0];
            my $lastLine = <$fh>;
            die "didn't find Last code for range!\n" unless defined $lastLine;
            chomp $lastLine;
            @fields = split /;/, $lastLine, -1;
            die "didn't find Last code for range!\n"
                unless defined $fields[1] && $fields[1] =~ /Last/;
            my $last = hex $fields[0];
            for my $usv ($first .. $last) {
                setBasicProps($usv, \@fields);
            }
            next;
        }

        my $usv = hex $fields[0];
        setBasicProps($usv, \@fields);

        my $upper = hex $fields[12];
        my $lower = hex $fields[13];
        my $title = hex $fields[14];
        # we only store one mapping for each character,
        # but also record what kind of mapping it is
        if ($upper && $lower) {
            $casemap[$usv] |= $kTitleToUpper;
            $casemap[$usv] |= ($usv ^ $upper);
        }
        elsif ($lower) {
            $casemap[$usv] |= $kUpperToLower;
            $casemap[$usv] |= ($usv ^ $lower);
        }
        elsif ($title && ($title != $upper)) {
            $casemap[$usv] |= $kLowerToTitle;
            $casemap[$usv] |= ($usv ^ $title);
        }
        elsif ($upper) {
            $casemap[$usv] |= $kLowerToUpper;
            $casemap[$usv] |= ($usv ^ $upper);
        }

        # The decomposition field links narrow/wide pairs; @fullWidth maps
        # the narrower character of each pair to the wider one.
        if ($fields[5] =~ /^<narrow>/) {
            # "<narrow> " is 9 characters long
            my $wideChar = hex(substr($fields[5], 9));
            die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
            $fullWidth[$usv] = $wideChar;
        }
        elsif ($fields[5] =~ /^<wide>/) {
            # "<wide> " is 7 characters long
            my $narrowChar = hex(substr($fields[5], 7));
            die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
            $fullWidth[$narrowChar] = $usv;
        }
    }
    close $fh;
}

# read Scripts.txt
{
    my $fh = openInput("$ARGV[1]/Scripts.txt",
                       "can't open UCD file Scripts.txt\n");
    readVersionHeader($fh, qr/Date:/);
    readRangeFile($fh,
                  qr/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/,
                  0, \@script,
                  sub {
                      my $name = uc(shift);
                      warn "unknown script $name" unless exists $scriptCode{$name};
                      return $scriptCode{$name};
                  });
    close $fh;
}

# read EastAsianWidth.txt
my %eawCode = (
    'A' => 0, #         ; Ambiguous
    'F' => 1, #         ; Fullwidth
    'H' => 2, #         ; Halfwidth
    'N' => 3, #         ; Neutral
    'NA'=> 4, #         ; Narrow
    'W' => 5  #         ; Wide
);
{
    my $fh = openInput("$ARGV[1]/EastAsianWidth.txt",
                       "can't open UCD file EastAsianWidth.txt\n");
    readVersionHeader($fh, qr/Date:/);
    readRangeFile($fh,
                  qr/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/,
                  1, \@eaw,
                  sub {
                      my $name = uc(shift);
                      warn "unknown EAW code $name" unless exists $eawCode{$name};
                      return $eawCode{$name};
                  });
    close $fh;
}

# read BidiMirroring.txt
# @offsets is the table of distinct (mirror - char) deltas; @mirror stores,
# per character, an index into it. Index 0 (delta 0) means "no mirror".
my @offsets = ();
push @offsets, 0;

{
    my $fh = openInput("$ARGV[1]/BidiMirroring.txt",
                       "can't open UCD file BidiMirroring.txt\n");
    readVersionHeader($fh, qr/Date:/);
    while (my $line = <$fh>) {
        $line =~ s/#.*//;
        next unless $line =~ m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/;
        my $usv = hex $1;
        my $mirrorOffset = hex($2) - $usv;
        my $offsetIndex = first { $offsets[$_] == $mirrorOffset } 0..$#offsets;
        # first() returns undef when no entry matches; test definedness so
        # that a legitimate index of 0 is not mistaken for "not found"
        unless (defined $offsetIndex) {
            die "too many offset codes\n" if scalar @offsets == 31;
            push @offsets, $mirrorOffset;
            $offsetIndex = $#offsets;
        }
        $mirror[$usv] = $offsetIndex;
    }
    close $fh;
}

# read HangulSyllableType.txt
my %hangulType = (
    'L'   => 0x01,
    'V'   => 0x02,
    'T'   => 0x04,
    'LV'  => 0x03,
    'LVT' => 0x07
);
{
    my $fh = openInput("$ARGV[1]/HangulSyllableType.txt",
                       "can't open UCD file HangulSyllableType.txt\n");
    readVersionHeader($fh, qr/Date:/);
    readRangeFile($fh,
                  qr/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/,
                  1, \@hangul,
                  sub {
                      my $name = uc(shift);
                      warn "unknown Hangul syllable type" unless exists $hangulType{$name};
                      return $hangulType{$name};
                  });
    close $fh;
}

# read xidmodifications.txt
{
    my $fh = openInput("$ARGV[1]/security/xidmodifications.txt",
                       "can't open UCD file xidmodifications.txt\n");
    # the line carrying the UTF-8 byte-order mark is not copied to the banner
    readVersionHeader($fh, qr/Generated:/, qr/\xef\xbb\xbf/);
    readRangeFile($fh,
                  qr/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/,
                  0, \@xidmod,
                  sub {
                      my $name = shift;
                      warn "unknown Identifier Modification $name" unless exists $xidmodCode{$name};
                      return $xidmodCode{$name};
                  });
    close $fh;
}
# special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
$xidmod[0x30FB] = 1;

# read Unihan_Variants.txt
# All lines for one character are consecutive. A character that has only a
# kTraditionalVariant entry gets hanVariant 1, one that has only a
# kSimplifiedVariant entry gets 2; characters with both or neither keep the
# value assigned earlier.
{
    my $fh = openInput("$ARGV[1]/Unihan_Variants.txt",
                       "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n");
    readVersionHeader($fh, qr/Date:/);
    my $savedusv = 0;
    my $hasTC = 0;
    my $hasSC = 0;
    # commit the flags gathered for the character in $savedusv
    my $flush = sub {
        return if $savedusv == 0;
        if ($hasTC && !$hasSC) {
            $hanVariant[$savedusv] = 1;
        } elsif (!$hasTC && $hasSC) {
            $hanVariant[$savedusv] = 2;
        }
        return;
    };
    while (my $line = <$fh>) {
        chomp $line;
        next unless $line =~ m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/;
        my ($usv, $kind) = (hex $1, $2);
        if ($usv != $savedusv) {
            $flush->();
            $savedusv = $usv;
            $hasTC = 0;
            $hasSC = 0;
        }
        $hasTC = 1 if $kind eq "Traditional";
        $hasSC = 1 if $kind eq "Simplified";
    }
    # the last character in the file has no following line to trigger the
    # flush above, so commit it explicitly
    $flush->();
    close $fh;
}

my $timestamp = gmtime();

open my $dataTables, '>', "nsUnicodePropertyData.cpp" or die "unable to open nsUnicodePropertyData.cpp for output";

my $licenseBlock = q[
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * Derived from the Unicode Character Database by genUnicodePropertyData.pl
 *
 * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
 */
];

my $versionInfo = join("\n", @versionInfo);

print {$dataTables} <<"__END";
$licenseBlock
/*
 * Created on $timestamp from UCD data files with version info:
 *

$versionInfo

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */

#include <stdint.h>
#include "harfbuzz/hb.h"

__END

open my $header, '>', "nsUnicodeScriptCodes.h" or die "unable to open nsUnicodeScriptCodes.h for output";

print {$header} <<"__END";
$licenseBlock
/*
 * Created on $timestamp from UCD data files with version info:
 *

$versionInfo

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */

#ifndef NS_UNICODE_SCRIPT_CODES
#define NS_UNICODE_SCRIPT_CODES

__END

print {$dataTables} "static const uint32_t sScriptCodeToTag[] = {\n";
for my $i (0 .. $#scriptCodeToTag) {
    printf {$dataTables} " HB_TAG(%s)", $scriptCodeToTag[$i];
    print {$dataTables} $i < $#scriptCodeToTag ? ",\n" : "\n";
}
print {$dataTables} "};\n\n";

# running total of the table sizes reported by genTables
my $totalData = 0;

print {$dataTables} "static const int16_t sMirrorOffsets[] = {\n";
for my $i (0 .. $#offsets) {
    print {$dataTables} " $offsets[$i]";
    print {$dataTables} $i < $#offsets ? ",\n" : "\n";
}
print {$dataTables} "};\n\n";

print {$header} "#pragma pack(1)\n\n";

# Each sprintXXX function formats the table entry that starts at the given
# code point, including a trailing separator that genTables later trims.

sub sprintCharProps1
{
    my $usv = shift;
    return sprintf("{%d,%d,%d}, ", $mirror[$usv], $hangul[$usv], $combining[$usv]);
}
genTables("CharProp1", "struct nsCharProps1 {\n unsigned char mMirrorOffsetIndex:5;\n unsigned char mHangulType:3;\n unsigned char mCombiningClass:8;\n};",
          "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);

sub sprintCharProps2
{
    my $usv = shift;
    return sprintf("{%d,%d,%d,%d,%d,%d},",
                   $script[$usv], $eaw[$usv], $category[$usv],
                   $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv]);
}
genTables("CharProp2", "struct nsCharProps2 {\n unsigned char mScriptCode:8;\n unsigned char mEAW:3;\n unsigned char mCategory:5;\n unsigned char mBidiCategory:5;\n unsigned char mXidmod:4;\n signed char mNumericValue:5;\n unsigned char mHanVariant:2;\n};",
          "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);

print {$header} "#pragma pack()\n\n";

# packs the 2-bit variant codes of four consecutive characters into one byte
sub sprintHanVariants
{
    my $baseUsv = shift;
    my $varShift = 0;
    my $val = 0;
    while ($varShift < 8) {
        $val |= $hanVariant[$baseUsv++] << $varShift;
        $varShift += 2;
    }
    return sprintf("0x%02x,", $val);
}
genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);

sub sprintFullWidth
{
    my $usv = shift;
    return sprintf("0x%04x,", $fullWidth[$usv]);
}
genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);

sub sprintCasemap
{
    my $usv = shift;
    return sprintf("0x%08x,", $casemap[$usv]);
}
genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);

print STDERR "Total data = $totalData\n";

printf {$dataTables} "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
printf {$dataTables} "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
printf {$dataTables} "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
printf {$dataTables} "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
printf {$dataTables} "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;

# Emit one multi-level lookup table to the .cpp file (and its typedef, if
# any, to the header).
#
#   $prefix         name fragment used for the generated identifiers
#   $typedef        C declaration written to the header ('' for none)
#   $type           C element type of the values table
#   $indexBits      log2 of the number of pages per plane
#   $charBits       log2 of the number of characters per page
#   $func           code ref formatting the entry that starts at a code point
#   $maxPlane       highest plane covered (0 = BMP only)
#   $bytesPerEntry  entry size, used only for the size report
#   $charsPerEntry  number of characters covered by one entry
#
# Identical pages and identical per-plane page maps are stored only once.
# The computed size is printed to STDERR and added to $totalData.
sub genTables
{
    my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;

    print {$dataTables} "#define k${prefix}MaxPlane $maxPlane\n";
    print {$dataTables} "#define k${prefix}IndexBits $indexBits\n";
    print {$dataTables} "#define k${prefix}CharBits $charBits\n";

    my $indexLen = 1 << $indexBits;
    my $charsPerPage = 1 << $charBits;
    my %charIndex = ();
    my %pageMapIndex = ();
    my @pageMap = ();
    my @char = ();

    # one byte per plane above the BMP, giving that plane's page-map index
    my $planeMap = "\x00" x $maxPlane;
    foreach my $plane (0 .. $maxPlane) {
        # two bytes (one packed 'S') per page; parenthesised because 'x'
        # and '*' have equal precedence and associate to the left
        my $pageMap = "\x00" x ($indexLen * 2);
        foreach my $page (0 .. $indexLen - 1) {
            my $charValues = "";
            for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
                my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
                $charValues .= $func->($usv);
            }
            # drop the final character of the last entry's separator
            chop $charValues;

            unless (exists $charIndex{$charValues}) {
                $charIndex{$charValues} = scalar keys %charIndex;
                $char[$charIndex{$charValues}] = $charValues;
            }
            substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
        }

        unless (exists $pageMapIndex{$pageMap}) {
            $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
            $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
        }
        if ($plane > 0) {
            substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
        }
    }

    if ($maxPlane) {
        print {$dataTables} "static const uint8_t s${prefix}Planes[$maxPlane] = {";
        print {$dataTables} join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
        print {$dataTables} "};\n\n";
    }

    my $chCount = scalar @char;
    my $pmBits = $chCount > 255 ? 16 : 8;
    my $pmCount = scalar @pageMap;
    if ($maxPlane == 0) {
        die "there should only be one pageMap entry!" if $pmCount > 1;
        print {$dataTables} "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
    } else {
        print {$dataTables} "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
    }
    for my $i (0 .. $#pageMap) {
        print {$dataTables} $maxPlane > 0 ? " {" : " ";
        print {$dataTables} join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
        print {$dataTables} $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
    }
    print {$dataTables} "};\n\n";

    print {$header} "$typedef\n\n" if $typedef ne '';

    my $pageLen = $charsPerPage / $charsPerEntry;
    print {$dataTables} "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
    for my $i (0 .. $#char) {
        print {$dataTables} " {";
        print {$dataTables} $char[$i];
        print {$dataTables} $i < $#char ? "},\n" : "}\n";
    }
    print {$dataTables} "};\n\n";

    my $dataSize = $pmCount * $indexLen * $pmBits/8 +
                   $chCount * $pageLen * $bytesPerEntry +
                   $maxPlane;
    $totalData += $dataSize;

    print STDERR "Data for $prefix = $dataSize\n";
    return;
}

print {$dataTables} <<"__END";
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */
__END

# buffered write errors only surface at close, so check it
close $dataTables or die "error writing nsUnicodePropertyData.cpp: $!";

print {$header} "enum {\n";
for my $i (0 .. $#scriptCodeToName) {
    print {$header} " MOZ_SCRIPT_", $scriptCodeToName[$i], " = ", $i, ",\n";
}
print {$header} "\n MOZ_NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
print {$header} "\n MOZ_SCRIPT_INVALID = -1\n";
print {$header} "};\n\n";

print {$header} <<"__END";
#endif
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */
__END

close $header or die "error writing nsUnicodeScriptCodes.h: $!";