The Tor Browser: comparison intl/unicharutil/tools/genUnicodePropertyData.pl

--1:000000000000
+:9e758d5e69e3
+#!/usr/bin/env perl
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+# This tool is used to prepare lookup tables of Unicode character properties
+# needed by gfx code to support text shaping operations. The properties are
+# read from the Unicode Character Database and compiled into multi-level arrays
+# for efficient lookup.
+#
+# To regenerate the tables in nsUnicodePropertyData.cpp:
+#
+# (1) Download the current Unicode data files from
+#
+#         http://www.unicode.org/Public/UNIDATA/
+#
+#     NB: not all the files are actually needed; currently, we require
+#       - UnicodeData.txt
+#       - Scripts.txt
+#       - EastAsianWidth.txt
+#       - BidiMirroring.txt
+#       - HangulSyllableType.txt
+#       - ReadMe.txt (to record version/date of the UCD)
+#       - Unihan_Variants.txt (from Unihan.zip)
+#     though this may change if we find a need for additional properties.
+#
+#     The Unicode data files listed above should be together in one directory.
+#     We also require the file
+#        http://www.unicode.org/Public/security/latest/xidmodifications.txt
+#     This file should be in a sub-directory "security" immediately below the
+#        directory containing the other Unicode data files.
+#
+# (2) Run this tool using a command line of the form
+#
+#         perl genUnicodePropertyData.pl \
+#                 /path/to/harfbuzz/src  \
+#                 /path/to/UCD-directory
+#
+#     This will generate (or overwrite!) the files
+#
+#         nsUnicodePropertyData.cpp
+#         nsUnicodeScriptCodes.h
+#
+#     in the current directory.
+use strict;
+use List::Util qw(first);
+# Exactly two arguments are required (the harfbuzz source directory and the
+# UCD directory). Anything else is a usage error: the help text goes to STDERR
+# and the exit status is non-zero so that calling scripts notice the failure.
+# The heredoc is single-quoted so that the trailing backslashes of the sample
+# command line are printed literally instead of being consumed as escapes.
+if (@ARGV != 2) {
+    print STDERR <<'__EOT';
+# Run this tool using a command line of the form
+#
+#     perl genUnicodePropertyData.pl \
+#             /path/to/harfbuzz/src  \
+#             /path/to/UCD-directory
+#
+# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
+# and UCD-directory is a directory containing the current Unicode Character
+# Database files (UnicodeData.txt, etc), available from
+# http://www.unicode.org/Public/UNIDATA/
+#
+# This will generate (or overwrite!) the files
+#
+#     nsUnicodePropertyData.cpp
+#     nsUnicodeScriptCodes.h
+#
+# in the current directory.
+__EOT
+    exit 1;
+}
+# load HB_Script and HB_Category constants
+# NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
+# script codes as used by Glib/Pango/etc.
+# We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
+# compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.
+# CHECK that this matches Pango source (as found for example at
+# http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
+# for as many codes as that defines (currently up through Unicode 5.1)
+# and the GLib enumeration
+# http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
+# (currently defined up through Unicode 6.0).
+# Constants beyond these may be regarded as unstable for now, but we don't actually
+# depend on the specific values.
+# Script name => MOZ_SCRIPT_* code. INVALID is -1; every other name is
+# numbered consecutively from 0 in the order it is listed below, so a name's
+# position in the list *is* its code. New scripts must only ever be appended.
+my %scriptCode = do {
+    my @names = (
+        # codes 0..45
+        qw(COMMON INHERITED ARABIC ARMENIAN BENGALI BOPOMOFO CHEROKEE COPTIC
+           CYRILLIC DESERET DEVANAGARI ETHIOPIC GEORGIAN GOTHIC GREEK GUJARATI
+           GURMUKHI HAN HANGUL HEBREW HIRAGANA KANNADA KATAKANA KHMER LAO LATIN
+           MALAYALAM MONGOLIAN MYANMAR OGHAM OLD_ITALIC ORIYA RUNIC SINHALA
+           SYRIAC TAMIL TELUGU THAANA THAI TIBETAN CANADIAN_ABORIGINAL YI
+           TAGALOG HANUNOO BUHID TAGBANWA),
+        # unicode 4.0 additions (46..53)
+        qw(BRAILLE CYPRIOT LIMBU OSMANYA SHAVIAN LINEAR_B TAI_LE UGARITIC),
+        # unicode 4.1 additions (54..60)
+        qw(NEW_TAI_LUE BUGINESE GLAGOLITIC TIFINAGH SYLOTI_NAGRI OLD_PERSIAN
+           KHAROSHTHI),
+        # unicode 5.0 additions (61..66)
+        qw(UNKNOWN BALINESE CUNEIFORM PHOENICIAN PHAGS_PA NKO),
+        # unicode 5.1 additions (67..77)
+        qw(KAYAH_LI LEPCHA REJANG SUNDANESE SAURASHTRA CHAM OL_CHIKI VAI
+           CARIAN LYCIAN LYDIAN),
+        # unicode 5.2 additions (78..92)
+        qw(AVESTAN BAMUM EGYPTIAN_HIEROGLYPHS IMPERIAL_ARAMAIC
+           INSCRIPTIONAL_PAHLAVI INSCRIPTIONAL_PARTHIAN JAVANESE KAITHI LISU
+           MEETEI_MAYEK OLD_SOUTH_ARABIAN OLD_TURKIC SAMARITAN TAI_THAM
+           TAI_VIET),
+        # unicode 6.0 additions (93..95)
+        qw(BATAK BRAHMI MANDAIC),
+        # unicode 6.1 additions (96..102)
+        qw(CHAKMA MEROITIC_CURSIVE MEROITIC_HIEROGLYPHS MIAO SHARADA
+           SORA_SOMPENG TAKRI),
+    );
+    (INVALID => -1, map { $names[$_] => $_ } 0 .. $#names);
+};
+# State filled in by readHarfBuzzHeader():
+#   $sc  script code of the most recent recognised HB_SCRIPT_* line (-1: none yet)
+#   $cc  index of the most recent HB_UNICODE_GENERAL_CATEGORY_* line (-1: none yet)
+my $sc = -1;
+my $cc = -1;
+my %catCode;             # general category name => sequential code
+my @scriptCodeToTag;     # script code => argument text of the HB_TAG(...) macro
+my @scriptCodeToName;    # script code => script name
+# Scan one HarfBuzz header, given by file name relative to the directory in
+# $ARGV[0]. For every "HB_SCRIPT_<NAME> = HB_TAG(...)," line whose <NAME> is a
+# key of %scriptCode, the tag text and name are recorded under that script's
+# code; unknown names only produce a warning. Every line that mentions
+# HB_UNICODE_GENERAL_CATEGORY_<NAME> assigns the next sequential category code
+# to <NAME>. Dies if the header cannot be opened.
+sub readHarfBuzzHeader
+{
+    my $file = shift;
+    my $path = "$ARGV[0]/$file";
+    # three-argument open: the path is never interpreted as part of the mode
+    open my $hbHeader, '<', $path
+        or die "can't open harfbuzz header $path: $!\n";
+    while (<$hbHeader>) {
+        if (m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
+            unless (exists $scriptCode{$1}) {
+                warn "unknown script name $1 found in $file\n";
+                next;
+            }
+            $sc = $scriptCode{$1};
+            $scriptCodeToTag[$sc] = $2;
+            $scriptCodeToName[$sc] = $1;
+        }
+        if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
+            $cc++;
+            $catCode{$1} = $cc;
+        }
+    }
+    close $hbHeader;
+}
+readHarfBuzzHeader("hb-common.h");
+readHarfBuzzHeader("hb-unicode.h");
+die "didn't find HarfBuzz script codes\n" if $sc == -1;
+die "didn't find HarfBuzz category codes\n" if $cc == -1;
+# Identifier-modification name => numeric code. The names are matched against
+# the third ';'-separated field of security/xidmodifications.txt; the codes are
+# written into the mXidmod bit-field (4 bits) of the generated nsCharProps2.
+my %xidmodCode = (
+'inclusion'         => 0,
+'recommended'       => 1,
+'default-ignorable' => 2,
+'historic'          => 3,
+'limited-use'       => 4,
+'not-NFKC'          => 5,
+'not-xid'           => 6,
+'obsolete'          => 7,
+'technical'         => 8,
+'not-chars'         => 9
+);
+# Bidi class abbreviation (field 4 of UnicodeData.txt) => numeric code. The
+# codes are written into the mBidiCategory bit-field (5 bits) of the generated
+# nsCharProps2. Note that the values are stored as strings here.
+my %bidicategoryCode = (
+"L"   =>  "0", # Left-to-Right
+"R"   =>  "1", # Right-to-Left
+"EN"  =>  "2", # European Number
+"ES"  =>  "3", # European Number Separator
+"ET"  =>  "4", # European Number Terminator
+"AN"  =>  "5", # Arabic Number
+"CS"  =>  "6", # Common Number Separator
+"B"   =>  "7", # Paragraph Separator
+"S"   =>  "8", # Segment Separator
+"WS"  =>  "9", # Whitespace
+"ON"  => "10", # Other Neutrals
+"LRE" => "11", # Left-to-Right Embedding
+"LRO" => "12", # Left-to-Right Override
+"AL"  => "13", # Right-to-Left Arabic
+"RLE" => "14", # Right-to-Left Embedding
+"RLO" => "15", # Right-to-Left Override
+"PDF" => "16", # Pop Directional Format
+"NSM" => "17", # Non-Spacing Mark
+"BN"  => "18"  # Boundary Neutral
+);
+# Per-code-point property arrays, indexed by Unicode scalar value and
+# pre-filled with a default for all 0x110000 code points. @eaw, @mirror and
+# @hangul are not pre-filled; they only receive entries while the
+# corresponding data files are read.
+my @script       = ($scriptCode{"UNKNOWN"})    x 0x110000;
+my @category     = ($catCode{"UNASSIGNED"})    x 0x110000;
+my @combining    = (0)                         x 0x110000;
+my @eaw;
+my @mirror;
+my @hangul;
+my @casemap      = (0)                         x 0x110000;
+my @xidmod       = ($xidmodCode{"not-chars"})  x 0x110000;
+my @numericvalue = (-1)                        x 0x110000;
+my @hanVariant   = (0)                         x 0x110000;
+my @bidicategory = ($bidicategoryCode{"L"})    x 0x110000;
+my @fullWidth    = (0)                         x 0x110000;
+# blocks where the default for bidi category is not L
+$bidicategory[$_] = $bidicategoryCode{"AL"}
+    foreach (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF);
+$bidicategory[$_] = $bidicategoryCode{"R"}
+    foreach (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF);
+$bidicategory[$_] = $bidicategoryCode{"ET"}
+    foreach (0x20A0..0x20CF);
+# Two-letter General_Category abbreviation (field 2 of UnicodeData.txt) =>
+# the <NAME> part of the matching HB_UNICODE_GENERAL_CATEGORY_<NAME> constant.
+# The result is used as a key into %catCode to obtain the numeric code.
+my %ucd2hb = (
+'Cc' => 'CONTROL',
+'Cf' => 'FORMAT',
+'Cn' => 'UNASSIGNED',
+'Co' => 'PRIVATE_USE',
+'Cs' => 'SURROGATE',
+'Ll' => 'LOWERCASE_LETTER',
+'Lm' => 'MODIFIER_LETTER',
+'Lo' => 'OTHER_LETTER',
+'Lt' => 'TITLECASE_LETTER',
+'Lu' => 'UPPERCASE_LETTER',
+'Mc' => 'SPACING_MARK',
+'Me' => 'ENCLOSING_MARK',
+'Mn' => 'NON_SPACING_MARK',
+'Nd' => 'DECIMAL_NUMBER',
+'Nl' => 'LETTER_NUMBER',
+'No' => 'OTHER_NUMBER',
+'Pc' => 'CONNECT_PUNCTUATION',
+'Pd' => 'DASH_PUNCTUATION',
+'Pe' => 'CLOSE_PUNCTUATION',
+'Pf' => 'FINAL_PUNCTUATION',
+'Pi' => 'INITIAL_PUNCTUATION',
+'Po' => 'OTHER_PUNCTUATION',
+'Ps' => 'OPEN_PUNCTUATION',
+'Sc' => 'CURRENCY_SYMBOL',
+'Sk' => 'MODIFIER_SYMBOL',
+'Sm' => 'MATH_SYMBOL',
+'So' => 'OTHER_SYMBOL',
+'Zl' => 'LINE_SEPARATOR',
+'Zp' => 'PARAGRAPH_SEPARATOR',
+'Zs' => 'SPACE_SEPARATOR'
+);
+# read ReadMe.txt: every line is kept (without its line ending) so that it can
+# be echoed into the "version info" comment of the generated files
+my @versionInfo;
+open my $readmeFh, '<', "$ARGV[1]/ReadMe.txt"
+    or die "can't open Unicode ReadMe.txt file: $!\n";
+chomp(@versionInfo = <$readmeFh>);
+close $readmeFh;
+# Flag bits or-ed into a @casemap entry to record which kind of mapping it
+# holds. The value or-ed in alongside the flag is (code point XOR mapped code
+# point); $kCaseMapCharMask is emitted later as the mask for that part.
+my $kTitleToUpper = 0x80000000;
+my $kUpperToLower = 0x40000000;
+my $kLowerToTitle = 0x20000000;
+my $kLowerToUpper = 0x10000000;
+my $kCaseMapCharMask = 0x001fffff;
+# read UnicodeData.txt
+open my $unicodeDataFh, '<', "$ARGV[1]/UnicodeData.txt"
+    or die "can't open UCD file UnicodeData.txt: $!\n";
+while (<$unicodeDataFh>) {
+    chomp;
+    my @fields = split /;/;
+    my $usv = hex "0x$fields[0]";
+    my $last = $usv;
+    # A name containing "First" opens a range; the next line must be the
+    # matching "Last" entry, and its fields describe every character in between.
+    my $isRange = ($fields[1] =~ /First/);
+    if ($isRange) {
+        my $lastLine = <$unicodeDataFh>;
+        die "didn't find Last code for range!\n" unless defined $lastLine;
+        chomp $lastLine;
+        @fields = split /;/, $lastLine;
+        die "didn't find Last code for range!\n" unless $fields[1] =~ /Last/;
+        $last = hex "0x$fields[0]";
+    }
+    # properties recorded for single characters and range members alike
+    foreach my $ch ($usv .. $last) {
+        $category[$ch] = $catCode{$ucd2hb{$fields[2]}};
+        $combining[$ch] = $fields[3];
+        $bidicategory[$ch] = $bidicategoryCode{$fields[4]};
+        if (defined $fields[7] && length $fields[7]) {
+            $numericvalue[$ch] = $fields[7];
+        }
+        if ($fields[1] =~ /CJK/) {
+            # scalar element assignment (the original used a one-element slice)
+            $hanVariant[$ch] = 3;
+        }
+    }
+    next if $isRange;
+    my $upper = hex $fields[12];
+    my $lower = hex $fields[13];
+    my $title = hex $fields[14];
+    # we only store one mapping for each character,
+    # but also record what kind of mapping it is
+    if ($upper && $lower) {
+        $casemap[$usv] |= $kTitleToUpper;
+        $casemap[$usv] |= ($usv ^ $upper);
+    }
+    elsif ($lower) {
+        $casemap[$usv] |= $kUpperToLower;
+        $casemap[$usv] |= ($usv ^ $lower);
+    }
+    elsif ($title && ($title != $upper)) {
+        $casemap[$usv] |= $kLowerToTitle;
+        $casemap[$usv] |= ($usv ^ $title);
+    }
+    elsif ($upper) {
+        $casemap[$usv] |= $kLowerToUpper;
+        $casemap[$usv] |= ($usv ^ $upper);
+    }
+    # "<narrow> XXXX" / "<wide> XXXX" decompositions: in both cases the entry
+    # of the narrow character is set to the code of its wide counterpart
+    if ($fields[5] =~ /^<narrow>/) {
+        my $wideChar = hex(substr($fields[5], 9));
+        die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
+        $fullWidth[$usv] = $wideChar;
+    }
+    elsif ($fields[5] =~ /^<wide>/) {
+        my $narrowChar = hex(substr($fields[5], 7));
+        die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
+        $fullWidth[$narrowChar] = $usv;
+    }
+}
+close $unicodeDataFh;
+# read Scripts.txt
+open my $scriptsFh, '<', "$ARGV[1]/Scripts.txt"
+    or die "can't open UCD file Scripts.txt: $!\n";
+push @versionInfo, "";
+# copy the file header, up to and including its "Date:" line, into the version info
+while (<$scriptsFh>) {
+    chomp;
+    push @versionInfo, $_;
+    last if /Date:/;
+}
+# data lines look like "XXXX ; Name" or "XXXX..YYYY ; Name"
+while (<$scriptsFh>) {
+    next unless m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s+;\s+([^ ]+)/;
+    my ($startHex, $endHex, $scriptName) = ($1, $2, uc($3));
+    unless (exists $scriptCode{$scriptName}) {
+        # leave the affected characters at their UNKNOWN default rather than
+        # storing undef, which would be printed as 0 (the code of COMMON)
+        warn "unknown script $scriptName";
+        next;
+    }
+    my $start = hex "0x$startHex";
+    my $end = defined $endHex ? hex "0x$endHex" : $start;
+    foreach my $usv ($start .. $end) {
+        $script[$usv] = $scriptCode{$scriptName};
+    }
+}
+close $scriptsFh;
+# read EastAsianWidth.txt
+# East_Asian_Width abbreviation => numeric code, written into the mEAW
+# bit-field of the generated nsCharProps2
+my %eawCode = (
+    'A' => 0, #         ; Ambiguous
+    'F' => 1, #         ; Fullwidth
+    'H' => 2, #         ; Halfwidth
+    'N' => 3, #         ; Neutral
+    'NA'=> 4, #         ; Narrow
+    'W' => 5  #         ; Wide
+);
+open my $eawFh, '<', "$ARGV[1]/EastAsianWidth.txt"
+    or die "can't open UCD file EastAsianWidth.txt: $!\n";
+push @versionInfo, "";
+# copy the file header, up to and including its "Date:" line, into the version info
+while (<$eawFh>) {
+    chomp;
+    push @versionInfo, $_;
+    last if /Date:/;
+}
+while (<$eawFh>) {
+    s/#.*//;    # drop the trailing comment
+    next unless m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s*;\s*([^ ]+)/;
+    my ($startHex, $endHex, $width) = ($1, $2, uc($3));
+    unless (exists $eawCode{$width}) {
+        # nothing sensible to store; the entries stay unset as before
+        warn "unknown EAW code $width";
+        next;
+    }
+    my $start = hex "0x$startHex";
+    my $end = defined $endHex ? hex "0x$endHex" : $start;
+    foreach my $usv ($start .. $end) {
+        $eaw[$usv] = $eawCode{$width};
+    }
+}
+close $eawFh;
+# read BidiMirroring.txt
+# @offsets collects the distinct (mirror - character) differences; each
+# character's @mirror entry is an index into it. Index 0 is reserved for the
+# offset 0. The index is emitted into the 5-bit mMirrorOffsetIndex field.
+my @offsets = (0);
+open my $mirrorFh, '<', "$ARGV[1]/BidiMirroring.txt"
+    or die "can't open UCD file BidiMirroring.txt: $!\n";
+push @versionInfo, "";
+# copy the file header, up to and including its "Date:" line, into the version info
+while (<$mirrorFh>) {
+    chomp;
+    push @versionInfo, $_;
+    last if /Date:/;
+}
+while (<$mirrorFh>) {
+    s/#.*//;    # drop the trailing comment
+    if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
+        my $mirrorOffset = hex("0x$2") - hex("0x$1");
+        my $offsetIndex = first { $offsets[$_] == $mirrorOffset } 0..$#offsets;
+        # must test definedness: index 0 is a legitimate hit, and a numeric
+        # comparison against undef cannot tell it apart from "not found"
+        unless (defined $offsetIndex) {
+            die "too many offset codes\n" if scalar @offsets == 31;
+            push @offsets, $mirrorOffset;
+            $offsetIndex = $#offsets;
+        }
+        $mirror[hex "0x$1"] = $offsetIndex;
+    }
+}
+close $mirrorFh;
+# read HangulSyllableType.txt
+# Hangul syllable type => bit pattern (L = 0x01, V = 0x02, T = 0x04; LV and
+# LVT are the or-ed combinations), written into the 3-bit mHangulType field
+my %hangulType = (
+    'L'   => 0x01,
+    'V'   => 0x02,
+    'T'   => 0x04,
+    'LV'  => 0x03,
+    'LVT' => 0x07
+);
+open my $hangulFh, '<', "$ARGV[1]/HangulSyllableType.txt"
+    or die "can't open UCD file HangulSyllableType.txt: $!\n";
+push @versionInfo, "";
+# copy the file header, up to and including its "Date:" line, into the version info
+while (<$hangulFh>) {
+    chomp;
+    push @versionInfo, $_;
+    last if /Date:/;
+}
+while (<$hangulFh>) {
+    s/#.*//;    # drop the trailing comment
+    next unless m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s*;\s*([^ ]+)/;
+    my ($startHex, $endHex, $syllableType) = ($1, $2, uc($3));
+    unless (exists $hangulType{$syllableType}) {
+        # nothing sensible to store; the entries stay unset as before
+        warn "unknown Hangul syllable type";
+        next;
+    }
+    my $start = hex "0x$startHex";
+    my $end = defined $endHex ? hex "0x$endHex" : $start;
+    foreach my $usv ($start .. $end) {
+        $hangul[$usv] = $hangulType{$syllableType};
+    }
+}
+close $hangulFh;
+# read xidmodifications.txt
+open my $xidmodFh, '<', "$ARGV[1]/security/xidmodifications.txt"
+    or die "can't open UCD file xidmodifications.txt: $!\n";
+push @versionInfo, "";
+# copy the file header, up to and including its "Generated:" line, into the
+# version info; a line containing the UTF-8 byte-order mark is left out
+while (<$xidmodFh>) {
+    chomp;
+    push @versionInfo, $_ unless /\xef\xbb\xbf/;
+    last if /Generated:/;
+}
+# data lines have three ';'-separated fields; the third names the modification
+while (<$xidmodFh>) {
+    next unless m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s+;\s+[^ ]+\s+;\s+([^ ]+)/;
+    my ($startHex, $endHex, $modification) = ($1, $2, $3);
+    unless (exists $xidmodCode{$modification}) {
+        # keep the 'not-chars' default instead of storing undef, which would
+        # be printed as 0 (the code of 'inclusion')
+        warn "unknown Identifier Modification $modification";
+        next;
+    }
+    my $start = hex "0x$startHex";
+    my $end = defined $endHex ? hex "0x$endHex" : $start;
+    foreach my $usv ($start .. $end) {
+        $xidmod[$usv] = $xidmodCode{$modification};
+    }
+}
+close $xidmodFh;
+# special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
+$xidmod[0x30FB] = $xidmodCode{'recommended'};
+# read Unihan_Variants.txt
+# Lines for one character are consecutive. For each character we note whether
+# it has kTraditionalVariant and/or kSimplifiedVariant entries; once its lines
+# have ended, @hanVariant is set to 1 (only Traditional entries) or
+# 2 (only Simplified entries), and is left untouched otherwise.
+open my $unihanFh, '<', "$ARGV[1]/Unihan_Variants.txt"
+    or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip): $!\n";
+push @versionInfo, "";
+# copy the file header, up to and including its "Date:" line, into the version info
+while (<$unihanFh>) {
+    chomp;
+    push @versionInfo, $_;
+    last if /Date:/;
+}
+my $savedusv = 0;    # character currently being collected (0: none yet)
+my $hasTC = 0;
+my $hasSC = 0;
+# store the verdict for $savedusv based on the flags gathered so far
+my $flushHanVariant = sub {
+    return if $savedusv == 0;
+    if ($hasTC && !$hasSC) {
+        $hanVariant[$savedusv] = 1;
+    } elsif (!$hasTC && $hasSC) {
+        $hanVariant[$savedusv] = 2;
+    }
+    return;
+};
+while (<$unihanFh>) {
+    chomp;
+    if (m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
+        my $usv = hex "0x$1";
+        if ($usv != $savedusv) {
+            $flushHanVariant->();
+            $savedusv = $usv;
+            $hasTC = 0;
+            $hasSC = 0;
+        }
+        if ($2 eq "Traditional") {
+            $hasTC = 1;
+        }
+        if ($2 eq "Simplified") {
+            $hasSC = 1;
+        }
+    }
+}
+# the last character in the file has no following line to trigger its flush
+$flushHanVariant->();
+close $unihanFh;
+# gmtime() in scalar context: a printable UTC date/time string, embedded in
+# the header comment of both generated files
+my $timestamp = gmtime();
+# DATA_TABLES is the output handle for nsUnicodePropertyData.cpp; it is also
+# written to by genTables() and by the table-emitting code further down
+open DATA_TABLES, "> nsUnicodePropertyData.cpp" or die "unable to open nsUnicodePropertyData.cpp for output";
+# license/provenance comment placed at the top of both generated files
+my $licenseBlock = q[
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+* License, v. 2.0. If a copy of the MPL was not distributed with this
+* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+/*
+* Derived from the Unicode Character Database by genUnicodePropertyData.pl
+*
+* For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
+*/
+];
+# all header lines collected from the data files while reading them, as one string
+my $versionInfo = join("\n", @versionInfo);
+# preamble of nsUnicodePropertyData.cpp
+print DATA_TABLES <<__END;
+$licenseBlock
+/*
+* Created on $timestamp from UCD data files with version info:
+*
+$versionInfo
+*
+* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
+*/
+#include <stdint.h>
+#include "harfbuzz/hb.h"
+__END
+# HEADER is the output handle for nsUnicodeScriptCodes.h; the preamble opens
+# an include guard that is closed by the "#endif" printed at the very end
+open HEADER, "> nsUnicodeScriptCodes.h" or die "unable to open nsUnicodeScriptCodes.h for output";
+print HEADER <<__END;
+$licenseBlock
+/*
+* Created on $timestamp from UCD data files with version info:
+*
+$versionInfo
+*
+* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
+*/
+#ifndef NS_UNICODE_SCRIPT_CODES
+#define NS_UNICODE_SCRIPT_CODES
+__END
+# Emit the script code => HarfBuzz tag table, one HB_TAG(...) per script code;
+# every entry but the last is followed by a comma.
+print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
+foreach my $code (0 .. $#scriptCodeToTag) {
+    printf DATA_TABLES "  HB_TAG(%s)", $scriptCodeToTag[$code];
+    my $lineEnd = $code < $#scriptCodeToTag ? ",\n" : "\n";
+    print DATA_TABLES $lineEnd;
+}
+print DATA_TABLES "};\n\n";
+# running total of the table sizes reported by genTables()
+our $totalData = 0;
+# Emit the distinct mirroring offsets gathered while reading BidiMirroring.txt.
+print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
+foreach my $i (0 .. $#offsets) {
+    my $lineEnd = $i < $#offsets ? ",\n" : "\n";
+    print DATA_TABLES "  $offsets[$i]";
+    print DATA_TABLES $lineEnd;
+}
+print DATA_TABLES "};\n\n";
+# The two property structs are declared in the header between
+# "#pragma pack(1)" and "#pragma pack()".
+print HEADER "#pragma pack(1)\n\n";
+# Returns the nsCharProps1 initializer text for one code point:
+# mirror-offset index, Hangul type and combining class.
+sub sprintCharProps1
+{
+    my ($codePoint) = @_;
+    my @props = ($mirror[$codePoint], $hangul[$codePoint], $combining[$codePoint]);
+    return sprintf("{%d,%d,%d}, ", @props);
+}
+genTables(
+    "CharProp1",
+    "struct nsCharProps1 {\n  unsigned char mMirrorOffsetIndex:5;\n  unsigned char mHangulType:3;\n  unsigned char mCombiningClass:8;\n};",
+    "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
+# Returns the nsCharProps2 initializer text for one code point: script, EAW,
+# category, bidi category, xidmod and numeric value. Only six values are
+# written although the struct declares a seventh member (mHanVariant).
+sub sprintCharProps2
+{
+    my ($codePoint) = @_;
+    my @props = (
+        $script[$codePoint],       $eaw[$codePoint],    $category[$codePoint],
+        $bidicategory[$codePoint], $xidmod[$codePoint], $numericvalue[$codePoint],
+    );
+    return sprintf("{%d,%d,%d,%d,%d,%d},", @props);
+}
+genTables(
+    "CharProp2",
+    "struct nsCharProps2 {\n  unsigned char mScriptCode:8;\n  unsigned char mEAW:3;\n  unsigned char mCategory:5;\n  unsigned char mBidiCategory:5;\n  unsigned char mXidmod:4;\n  signed char mNumericValue:5;\n  unsigned char mHanVariant:2;\n};",
+    "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);
+print HEADER "#pragma pack()\n\n";
+# Returns one byte, as C hex literal text, packing the 2-bit @hanVariant
+# values of four consecutive code points starting at the given one; the first
+# code point occupies the lowest two bits.
+sub sprintHanVariants
+{
+    my ($firstUsv) = @_;
+    my $packed = 0;
+    foreach my $slot (0 .. 3) {
+        $packed |= $hanVariant[$firstUsv + $slot] << (2 * $slot);
+    }
+    return sprintf("0x%02x,", $packed);
+}
+genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
+# Returns the @fullWidth entry of a code point as 16-bit hex literal text.
+sub sprintFullWidth
+{
+    my ($codePoint) = @_;
+    return sprintf("0x%04x,", $fullWidth[$codePoint]);
+}
+genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
+# Returns the @casemap entry of a code point as 32-bit hex literal text.
+sub sprintCasemap
+{
+    my ($codePoint) = @_;
+    return sprintf("0x%08x,", $casemap[$codePoint]);
+}
+genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);
+# report the accumulated size of all tables generated above
+print STDERR "Total data = $totalData\n";
+# publish the case-mapping flag bits and mask used when building @casemap
+printf DATA_TABLES ("const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper);
+printf DATA_TABLES ("const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower);
+printf DATA_TABLES ("const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle);
+printf DATA_TABLES ("const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper);
+printf DATA_TABLES ("const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask);
+# Emit one multi-level lookup table to DATA_TABLES (plus an optional type
+# declaration to HEADER), and add its size to $totalData.
+#
+# A code point is looked up as plane -> page map -> page of values. Identical
+# pages of values, and identical page maps, are stored only once.
+#
+# Arguments:
+#   $prefix        name stem of the emitted identifiers (k<prefix>MaxPlane,
+#                  s<prefix>Planes, s<prefix>Pages, s<prefix>Values, ...)
+#   $typedef       C declaration text printed to HEADER; skipped when ''
+#   $type          C element type of the s<prefix>Values array
+#   $indexBits     log2 of the number of pages per plane
+#   $charBits      log2 of the number of code points per page
+#   $func          code ref; called with a code point, returns the initializer
+#                  text for one entry, ending in a separator
+#   $maxPlane      highest plane covered (planes 0 .. $maxPlane)
+#   $bytesPerEntry size of one entry; used only for the size report
+#   $charsPerEntry number of consecutive code points covered by one entry
+#
+# NOTE(review): all callers in this file pass $indexBits + $charBits == 16, so
+# that the pages of a plane cover exactly 0x10000 code points -- confirm
+# before using other combinations.
+sub genTables
+{
+    my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
+    print DATA_TABLES "#define k${prefix}MaxPlane  $maxPlane\n";
+    print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
+    print DATA_TABLES "#define k${prefix}CharBits  $charBits\n";
+    my $indexLen = 1 << $indexBits;
+    my $charsPerPage = 1 << $charBits;
+    my %charIndex = ();       # page text => index into @char
+    my %pageMapIndex = ();    # packed page map => index into @pageMap
+    my @pageMap = ();
+    my @char = ();
+    # one byte per plane above plane 0: index of that plane's page map
+    my $planeMap = "\x00" x $maxPlane;
+    foreach my $plane (0 .. $maxPlane) {
+        # two bytes per page. The parentheses matter: 'x' and '*' have equal
+        # precedence, so without them the NUL string would be multiplied by 2.
+        my $pageMap = "\x00" x ($indexLen * 2);
+        foreach my $page (0 .. $indexLen - 1) {
+            my $charValues = "";
+            for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
+                my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
+                $charValues .= $func->($usv);
+            }
+            # drop the final character of the text returned for the last entry
+            chop $charValues;
+            unless (exists $charIndex{$charValues}) {
+                $charIndex{$charValues} = scalar keys %charIndex;
+                $char[$charIndex{$charValues}] = $charValues;
+            }
+            substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
+        }
+        unless (exists $pageMapIndex{$pageMap}) {
+            $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
+            $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
+        }
+        # plane 0 has no entry in the plane map
+        if ($plane > 0) {
+            substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
+        }
+    }
+    if ($maxPlane) {
+        print DATA_TABLES "static const uint8_t s${prefix}Planes[$maxPlane] = {";
+        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
+        print DATA_TABLES "};\n\n";
+    }
+    my $chCount = scalar @char;
+    # page-map entries index @char, so 8 bits suffice for up to 256 pages
+    my $pmBits = $chCount > 255 ? 16 : 8;
+    my $pmCount = scalar @pageMap;
+    if ($maxPlane == 0) {
+        die "there should only be one pageMap entry!" if $pmCount > 1;
+        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
+    } else {
+        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
+    }
+    for (my $i = 0; $i < scalar @pageMap; ++$i) {
+        print DATA_TABLES $maxPlane > 0 ? "  {" : "  ";
+        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
+        print DATA_TABLES $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
+    }
+    print DATA_TABLES "};\n\n";
+    print HEADER "$typedef\n\n" if $typedef ne '';
+    my $pageLen = $charsPerPage / $charsPerEntry;
+    print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
+    for (my $i = 0; $i < scalar @char; ++$i) {
+        print DATA_TABLES "  {";
+        print DATA_TABLES $char[$i];
+        print DATA_TABLES $i < $#char ? "},\n" : "}\n";
+    }
+    print DATA_TABLES "};\n\n";
+    # page maps + value pages + plane map
+    my $dataSize = $pmCount * $indexLen * $pmBits/8 +
+                   $chCount * $pageLen * $bytesPerEntry +
+                   $maxPlane;
+    $totalData += $dataSize;
+    print STDERR "Data for $prefix = $dataSize\n";
+}
+# Finish nsUnicodePropertyData.cpp. The close is checked because errors from
+# buffered writes (e.g. a full disk) are only reported at that point.
+print DATA_TABLES <<__END;
+/*
+* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
+*/
+__END
+close DATA_TABLES or die "error writing nsUnicodePropertyData.cpp: $!\n";
+# Emit the MOZ_SCRIPT_* enumeration: one constant per entry of
+# @scriptCodeToName, followed by the entry count and the INVALID marker.
+print HEADER "enum {\n";
+for (my $i = 0; $i < scalar @scriptCodeToName; ++$i) {
+    print HEADER "  MOZ_SCRIPT_", $scriptCodeToName[$i], " = ", $i, ",\n";
+}
+print HEADER "\n  MOZ_NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
+print HEADER "\n  MOZ_SCRIPT_INVALID = -1\n";
+print HEADER "};\n\n";
+# close the include guard opened in the header preamble
+print HEADER <<__END;
+#endif
+/*
+* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
+*/
+__END
+close HEADER or die "error writing nsUnicodeScriptCodes.h: $!\n";

The Tor Browser / file comparison

comparison: intl/unicharutil/tools/genUnicodePropertyData.pl

intl/unicharutil/tools/genUnicodePropertyData.pl