1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/unicharutil/tools/genUnicodePropertyData.pl Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,833 @@ 1.4 +#!/usr/bin/env perl 1.5 + 1.6 +# This Source Code Form is subject to the terms of the Mozilla Public 1.7 +# License, v. 2.0. If a copy of the MPL was not distributed with this 1.8 +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 1.9 + 1.10 +# This tool is used to prepare lookup tables of Unicode character properties 1.11 +# needed by gfx code to support text shaping operations. The properties are 1.12 +# read from the Unicode Character Database and compiled into multi-level arrays 1.13 +# for efficient lookup. 1.14 +# 1.15 +# To regenerate the tables in nsUnicodePropertyData.cpp: 1.16 +# 1.17 +# (1) Download the current Unicode data files from 1.18 +# 1.19 +# http://www.unicode.org/Public/UNIDATA/ 1.20 +# 1.21 +# NB: not all the files are actually needed; currently, we require 1.22 +# - UnicodeData.txt 1.23 +# - Scripts.txt 1.24 +# - EastAsianWidth.txt 1.25 +# - BidiMirroring.txt 1.26 +# - HangulSyllableType.txt 1.27 +# - ReadMe.txt (to record version/date of the UCD) 1.28 +# - Unihan_Variants.txt (from Unihan.zip) 1.29 +# though this may change if we find a need for additional properties. 1.30 +# 1.31 +# The Unicode data files listed above should be together in one directory. 1.32 +# We also require the file 1.33 +# http://www.unicode.org/Public/security/latest/xidmodifications.txt 1.34 +# This file should be in a sub-directory "security" immediately below the 1.35 +# directory containing the other Unicode data files. 1.36 +# 1.37 +# (2) Run this tool using a command line of the form 1.38 +# 1.39 +# perl genUnicodePropertyData.pl \ 1.40 +# /path/to/harfbuzz/src \ 1.41 +# /path/to/UCD-directory 1.42 +# 1.43 +# This will generate (or overwrite!) the files 1.44 +# 1.45 +# nsUnicodePropertyData.cpp 1.46 +# nsUnicodeScriptCodes.h 1.47 +# 1.48 +# in the current directory. 

use strict;
use List::Util qw(first);

# Exactly two arguments are required: the HarfBuzz source directory and the
# directory holding the Unicode Character Database files. Anything else is a
# usage error, reported on STDERR with a non-zero exit status so that a
# mis-invoked build step does not look successful.
if (@ARGV != 2) {
    # The here-document is single-quoted on purpose: in an interpolating
    # here-document the backslash before each newline would be swallowed and
    # the sample command line would be printed without its continuation
    # backslashes.
    print STDERR <<'__EOT';
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \
# /path/to/harfbuzz/src \
# /path/to/UCD-directory
#
# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/
#
# This will generate (or overwrite!) the files
#
# nsUnicodePropertyData.cpp
# nsUnicodeScriptCodes.h
#
# in the current directory.
__EOT
    exit 1;
}

# load HB_Script and HB_Category constants

# NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
# script codes as used by Glib/Pango/etc.
# We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
# compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.

# CHECK that this matches Pango source (as found for example at
# http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
# for as many codes as that defines (currently up through Unicode 5.1)
# and the GLib enumeration
# http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
# (currently defined up through Unicode 6.0).
# Constants beyond these may be regarded as unstable for now, but we don't actually
# depend on the specific values.
# Script codes (emitted as MOZ_SCRIPT_* constants), keyed by the upper-cased
# script name used in Scripts.txt and in the HB_SCRIPT_* identifiers.
my %scriptCode = (
    INVALID => -1,
    COMMON => 0,
    INHERITED => 1,
    ARABIC => 2,
    ARMENIAN => 3,
    BENGALI => 4,
    BOPOMOFO => 5,
    CHEROKEE => 6,
    COPTIC => 7,
    CYRILLIC => 8,
    DESERET => 9,
    DEVANAGARI => 10,
    ETHIOPIC => 11,
    GEORGIAN => 12,
    GOTHIC => 13,
    GREEK => 14,
    GUJARATI => 15,
    GURMUKHI => 16,
    HAN => 17,
    HANGUL => 18,
    HEBREW => 19,
    HIRAGANA => 20,
    KANNADA => 21,
    KATAKANA => 22,
    KHMER => 23,
    LAO => 24,
    LATIN => 25,
    MALAYALAM => 26,
    MONGOLIAN => 27,
    MYANMAR => 28,
    OGHAM => 29,
    OLD_ITALIC => 30,
    ORIYA => 31,
    RUNIC => 32,
    SINHALA => 33,
    SYRIAC => 34,
    TAMIL => 35,
    TELUGU => 36,
    THAANA => 37,
    THAI => 38,
    TIBETAN => 39,
    CANADIAN_ABORIGINAL => 40,
    YI => 41,
    TAGALOG => 42,
    HANUNOO => 43,
    BUHID => 44,
    TAGBANWA => 45,
# unicode 4.0 additions
    BRAILLE => 46,
    CYPRIOT => 47,
    LIMBU => 48,
    OSMANYA => 49,
    SHAVIAN => 50,
    LINEAR_B => 51,
    TAI_LE => 52,
    UGARITIC => 53,
# unicode 4.1 additions
    NEW_TAI_LUE => 54,
    BUGINESE => 55,
    GLAGOLITIC => 56,
    TIFINAGH => 57,
    SYLOTI_NAGRI => 58,
    OLD_PERSIAN => 59,
    KHAROSHTHI => 60,
# unicode 5.0 additions
    UNKNOWN => 61,
    BALINESE => 62,
    CUNEIFORM => 63,
    PHOENICIAN => 64,
    PHAGS_PA => 65,
    NKO => 66,
# unicode 5.1 additions
    KAYAH_LI => 67,
    LEPCHA => 68,
    REJANG => 69,
    SUNDANESE => 70,
    SAURASHTRA => 71,
    CHAM => 72,
    OL_CHIKI => 73,
    VAI => 74,
    CARIAN => 75,
    LYCIAN => 76,
    LYDIAN => 77,
# unicode 5.2 additions
    AVESTAN => 78,
    BAMUM => 79,
    EGYPTIAN_HIEROGLYPHS => 80,
    IMPERIAL_ARAMAIC => 81,
    INSCRIPTIONAL_PAHLAVI => 82,
    INSCRIPTIONAL_PARTHIAN => 83,
    JAVANESE => 84,
    KAITHI => 85,
    LISU => 86,
    MEETEI_MAYEK => 87,
    OLD_SOUTH_ARABIAN => 88,
    OLD_TURKIC => 89,
    SAMARITAN => 90,
    TAI_THAM => 91,
    TAI_VIET => 92,
# unicode 6.0 additions
    BATAK => 93,
    BRAHMI => 94,
    MANDAIC => 95,
# unicode 6.1 additions
    CHAKMA => 96,
    MEROITIC_CURSIVE => 97,
    MEROITIC_HIEROGLYPHS => 98,
    MIAO => 99,
    SHARADA => 100,
    SORA_SOMPENG => 101,
    TAKRI => 102
);

my $sc = -1;            # most recent script code taken from a HarfBuzz header (-1: none found)
my $cc = -1;            # most recent general-category code allocated (-1: none found)
my %catCode;            # HB_UNICODE_GENERAL_CATEGORY_* suffix => sequential code
my @scriptCodeToTag;    # script code => argument text of the HB_TAG(...) macro
my @scriptCodeToName;   # script code => script name

# Scan one HarfBuzz header (a file name relative to $ARGV[0]).
#
# Every "HB_SCRIPT_<NAME> = HB_TAG('a','b','c','d')," line whose NAME is a key
# of %scriptCode fills in @scriptCodeToTag / @scriptCodeToName; unknown names
# are reported with a warning and the line is skipped. Every line mentioning
# HB_UNICODE_GENERAL_CATEGORY_<NAME> allocates the next sequential code for
# NAME in %catCode.
sub readHarfBuzzHeader
{
    my $file = shift;
    my $path = "$ARGV[0]/$file";
    open my $fh, '<', $path or die "can't open harfbuzz header $path: $!\n";
    while (my $line = <$fh>) {
        if ($line =~ m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
            my ($name, $tag) = ($1, $2);
            unless (exists $scriptCode{$name}) {
                warn "unknown script name $name found in $file\n";
                next;
            }
            $sc = $scriptCode{$name};
            $scriptCodeToTag[$sc] = $tag;
            $scriptCodeToName[$sc] = $name;
        }
        if ($line =~ m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
            $cc++;
            $catCode{$1} = $cc;
        }
    }
    close $fh;
}

# Open a data file below the UCD directory ($ARGV[1]) for reading and return
# the handle; dies with the path and the system error on failure.
# $description is only used in the error message.
sub openUcdFile
{
    my ($name, $description) = @_;
    my $path = "$ARGV[1]/$name";
    open my $fh, '<', $path or die "can't open $description ($path): $!\n";
    return $fh;
}

# Read lines from $fh up to and including the first line that matches
# $lastLinePattern (or up to end of file) and return them chomped. The handle
# is left positioned just after that line.
sub readVersionHeader
{
    my ($fh, $lastLinePattern) = @_;
    my @header;
    while (my $line = <$fh>) {
        chomp $line;
        push @header, $line;
        last if $line =~ $lastLinePattern;
    }
    return @header;
}

readHarfBuzzHeader("hb-common.h");
readHarfBuzzHeader("hb-unicode.h");

die "didn't find HarfBuzz script codes\n" if $sc == -1;
die "didn't find HarfBuzz category codes\n" if $cc == -1;

my %xidmodCode = (
    'inclusion' => 0,
    'recommended' => 1,
    'default-ignorable' => 2,
    'historic' => 3,
    'limited-use' => 4,
    'not-NFKC' => 5,
    'not-xid' => 6,
    'obsolete' => 7,
    'technical' => 8,
    'not-chars' => 9
);

my %bidicategoryCode = (
    "L" => "0", # Left-to-Right
    "R" => "1", # Right-to-Left
    "EN" => "2", # European Number
    "ES" => "3", # European Number Separator
    "ET" => "4", # European Number Terminator
    "AN" => "5", # Arabic Number
    "CS" => "6", # Common Number Separator
    "B" => "7", # Paragraph Separator
    "S" => "8", # Segment Separator
    "WS" => "9", # Whitespace
    "ON" => "10", # Other Neutrals
    "LRE" => "11", # Left-to-Right Embedding
    "LRO" => "12", # Left-to-Right Override
    "AL" => "13", # Right-to-Left Arabic
    "RLE" => "14", # Right-to-Left Embedding
    "RLO" => "15", # Right-to-Left Override
    "PDF" => "16", # Pop Directional Format
    "NSM" => "17", # Non-Spacing Mark
    "BN" => "18" # Boundary Neutral
);

# initialize default properties, one slot per code point U+0000..U+10FFFF
my $codePointCount = 0x110000;
my @script       = ($scriptCode{"UNKNOWN"}) x $codePointCount;
my @category     = ($catCode{"UNASSIGNED"}) x $codePointCount;
my @combining    = (0) x $codePointCount;
# @eaw, @mirror and @hangul are filled only for code points listed in their
# data files; the explicit 0 is the value an unset slot was already emitted as.
my @eaw          = (0) x $codePointCount;
my @mirror       = (0) x $codePointCount;
my @hangul       = (0) x $codePointCount;
my @casemap      = (0) x $codePointCount;
my @xidmod       = ($xidmodCode{"not-chars"}) x $codePointCount;
my @numericvalue = (-1) x $codePointCount;
my @hanVariant   = (0) x $codePointCount;
my @bidicategory = ($bidicategoryCode{"L"}) x $codePointCount;
my @fullWidth    = (0) x $codePointCount;

# blocks where the default for bidi category is not L
for my $i (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF) {
    $bidicategory[$i] = $bidicategoryCode{"AL"};
}
for my $i (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF) {
    $bidicategory[$i] = $bidicategoryCode{"R"};
}
for my $i (0x20A0..0x20CF) {
    $bidicategory[$i] = $bidicategoryCode{"ET"};
}

# UCD two-letter general category => suffix of HB_UNICODE_GENERAL_CATEGORY_*
my %ucd2hb = (
    'Cc' => 'CONTROL',
    'Cf' => 'FORMAT',
    'Cn' => 'UNASSIGNED',
    'Co' => 'PRIVATE_USE',
    'Cs' => 'SURROGATE',
    'Ll' => 'LOWERCASE_LETTER',
    'Lm' => 'MODIFIER_LETTER',
    'Lo' => 'OTHER_LETTER',
    'Lt' => 'TITLECASE_LETTER',
    'Lu' => 'UPPERCASE_LETTER',
    'Mc' => 'SPACING_MARK',
    'Me' => 'ENCLOSING_MARK',
    'Mn' => 'NON_SPACING_MARK',
    'Nd' => 'DECIMAL_NUMBER',
    'Nl' => 'LETTER_NUMBER',
    'No' => 'OTHER_NUMBER',
    'Pc' => 'CONNECT_PUNCTUATION',
    'Pd' => 'DASH_PUNCTUATION',
    'Pe' => 'CLOSE_PUNCTUATION',
    'Pf' => 'FINAL_PUNCTUATION',
    'Pi' => 'INITIAL_PUNCTUATION',
    'Po' => 'OTHER_PUNCTUATION',
    'Ps' => 'OPEN_PUNCTUATION',
    'Sc' => 'CURRENCY_SYMBOL',
    'Sk' => 'MODIFIER_SYMBOL',
    'Sm' => 'MATH_SYMBOL',
    'So' => 'OTHER_SYMBOL',
    'Zl' => 'LINE_SEPARATOR',
    'Zp' => 'PARAGRAPH_SEPARATOR',
    'Zs' => 'SPACE_SEPARATOR'
);

# read ReadMe.txt; its whole text is quoted in the generated files
my @versionInfo;
{
    my $readMe = openUcdFile("ReadMe.txt", "Unicode ReadMe.txt file");
    while (my $line = <$readMe>) {
        chomp $line;
        push @versionInfo, $line;
    }
    close $readMe;
}

# Each case-map entry holds one flag bit for the kind of mapping, OR-ed with
# (code point XOR mapped code point).
my $kTitleToUpper = 0x80000000;
my $kUpperToLower = 0x40000000;
my $kLowerToTitle = 0x20000000;
my $kLowerToUpper = 0x10000000;
my $kCaseMapCharMask = 0x001fffff;

# read UnicodeData.txt
{
    my $unicodeData = openUcdFile("UnicodeData.txt", "UCD file UnicodeData.txt");
    while (my $line = <$unicodeData>) {
        chomp $line;
        # limit -1 keeps trailing empty fields so that indexes 12..14 exist
        my @fields = split /;/, $line, -1;
        if ($fields[1] =~ /First/) {
            # A "<..., First>" line must be followed by its "<..., Last>" line;
            # the properties on the Last line apply to the whole range.
            my $first = hex $fields[0];
            my $lastLine = <$unicodeData>;
            die "didn't find Last code for range!\n" unless defined $lastLine;
            chomp $lastLine;
            @fields = split /;/, $lastLine, -1;
            die "didn't find Last code for range!\n" unless $fields[1] =~ /Last/;
            my $last = hex $fields[0];
            for my $usv ($first .. $last) {
                $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
                $combining[$usv] = $fields[3];
                $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
                unless (length($fields[7]) == 0) {
                    $numericvalue[$usv] = $fields[7];
                }
                if ($fields[1] =~ /CJK/) {
                    $hanVariant[$usv] = 3;
                }
            }
        } else {
            my $usv = hex $fields[0];
            $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
            $combining[$usv] = $fields[3];
            my $upper = hex $fields[12];
            my $lower = hex $fields[13];
            my $title = hex $fields[14];
            # we only store one mapping for each character,
            # but also record what kind of mapping it is
            if ($upper && $lower) {
                $casemap[$usv] |= $kTitleToUpper;
                $casemap[$usv] |= ($usv ^ $upper);
            }
            elsif ($lower) {
                $casemap[$usv] |= $kUpperToLower;
                $casemap[$usv] |= ($usv ^ $lower);
            }
            elsif ($title && ($title != $upper)) {
                $casemap[$usv] |= $kLowerToTitle;
                $casemap[$usv] |= ($usv ^ $title);
            }
            elsif ($upper) {
                $casemap[$usv] |= $kLowerToUpper;
                $casemap[$usv] |= ($usv ^ $upper);
            }
            $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
            unless (length($fields[7]) == 0) {
                $numericvalue[$usv] = $fields[7];
            }
            if ($fields[1] =~ /CJK/) {
                $hanVariant[$usv] = 3;
            }
            # @fullWidth is indexed by the narrow character and holds the wide one.
            if ($fields[5] =~ /^<narrow>/) {
                my $wideChar = hex(substr($fields[5], 9));
                die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
                $fullWidth[$usv] = $wideChar;
            }
            elsif ($fields[5] =~ /^<wide>/) {
                my $narrowChar = hex(substr($fields[5], 7));
                die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
                $fullWidth[$narrowChar] = $usv;
            }
        }
    }
    close $unicodeData;
}

# read Scripts.txt
{
    my $scripts = openUcdFile("Scripts.txt", "UCD file Scripts.txt");
    push @versionInfo, "", readVersionHeader($scripts, qr/Date:/);
    while (my $line = <$scripts>) {
        if ($line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
            my ($firstHex, $lastHex, $name) = ($1, $2, uc($3));
            unless (exists $scriptCode{$name}) {
                # leave the UNKNOWN default rather than storing an undefined code
                warn "unknown script $name";
                next;
            }
            my $start = hex $firstHex;
            my $end = (defined $lastHex) ? hex $lastHex : $start;
            for my $i ($start .. $end) {
                $script[$i] = $scriptCode{$name};
            }
        }
    }
    close $scripts;
}

# read EastAsianWidth.txt
my %eawCode = (
    'A' => 0, # ; Ambiguous
    'F' => 1, # ; Fullwidth
    'H' => 2, # ; Halfwidth
    'N' => 3, # ; Neutral
    'NA'=> 4, # ; Narrow
    'W' => 5 # ; Wide
);
{
    my $eastAsianWidth = openUcdFile("EastAsianWidth.txt", "UCD file EastAsianWidth.txt");
    push @versionInfo, "", readVersionHeader($eastAsianWidth, qr/Date:/);
    while (my $line = <$eastAsianWidth>) {
        $line =~ s/#.*//;
        if ($line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
            my ($firstHex, $lastHex, $width) = ($1, $2, uc($3));
            unless (exists $eawCode{$width}) {
                warn "unknown EAW code $width";
                next;
            }
            my $start = hex $firstHex;
            my $end = (defined $lastHex) ? hex $lastHex : $start;
            for my $i ($start .. $end) {
                $eaw[$i] = $eawCode{$width};
            }
        }
    }
    close $eastAsianWidth;
}

# read BidiMirroring.txt
# @offsets lists the distinct (mirror - char) distances; entry 0 is the offset
# 0 used for characters without a mirror. @mirror stores an index into it.
my @offsets = (0);
{
    my $bidiMirroring = openUcdFile("BidiMirroring.txt", "UCD file BidiMirroring.txt");
    push @versionInfo, "", readVersionHeader($bidiMirroring, qr/Date:/);
    while (my $line = <$bidiMirroring>) {
        $line =~ s/#.*//;
        if ($line =~ m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
            my $usv = hex $1;
            my $mirrorOffset = hex($2) - $usv;
            my $offsetIndex = first { $offsets[$_] == $mirrorOffset } 0..$#offsets;
            # "not found" must be tested with defined(): index 0 is a valid result
            unless (defined $offsetIndex) {
                die "too many offset codes\n" if scalar @offsets == 31;
                push @offsets, $mirrorOffset;
                $offsetIndex = $#offsets;
            }
            $mirror[$usv] = $offsetIndex;
        }
    }
    close $bidiMirroring;
}

# read HangulSyllableType.txt
my %hangulType = (
    'L' => 0x01,
    'V' => 0x02,
    'T' => 0x04,
    'LV' => 0x03,
    'LVT' => 0x07
);
{
    my $hangulSyllables = openUcdFile("HangulSyllableType.txt", "UCD file HangulSyllableType.txt");
    push @versionInfo, "", readVersionHeader($hangulSyllables, qr/Date:/);
    while (my $line = <$hangulSyllables>) {
        $line =~ s/#.*//;
        if ($line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
            my ($firstHex, $lastHex, $type) = ($1, $2, uc($3));
            unless (exists $hangulType{$type}) {
                warn "unknown Hangul syllable type";
                next;
            }
            my $start = hex $firstHex;
            my $end = (defined $lastHex) ? hex $lastHex : $start;
            for my $i ($start .. $end) {
                $hangul[$i] = $hangulType{$type};
            }
        }
    }
    close $hangulSyllables;
}

# read xidmodifications.txt
{
    my $xidModifications = openUcdFile("security/xidmodifications.txt", "UCD file xidmodifications.txt");
    # header lines carrying a UTF-8 byte-order mark are not quoted
    push @versionInfo, "", grep { !/\xef\xbb\xbf/ } readVersionHeader($xidModifications, qr/Generated:/);
    while (my $line = <$xidModifications>) {
        if ($line =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
            my ($firstHex, $lastHex, $modification) = ($1, $2, $3);
            unless (exists $xidmodCode{$modification}) {
                # leave the not-chars default rather than storing an undefined code
                warn "unknown Identifier Modification $modification";
                next;
            }
            my $start = hex $firstHex;
            my $end = (defined $lastHex) ? hex $lastHex : $start;
            for my $i ($start .. $end) {
                $xidmod[$i] = $xidmodCode{$modification};
            }
        }
    }
    close $xidModifications;
}
# special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
$xidmod[0x30FB] = $xidmodCode{'recommended'};

# read Unihan_Variants.txt
# A code point that has a kTraditionalVariant entry but no kSimplifiedVariant
# entry gets 1, the opposite case gets 2; any other code point keeps the value
# set while reading UnicodeData.txt (3 for names containing "CJK", else 0).
{
    my $unihanVariants = openUcdFile("Unihan_Variants.txt", "UCD file Unihan_Variants.txt (from Unihan.zip)");
    push @versionInfo, "", readVersionHeader($unihanVariants, qr/Date:/);

    # Store the result for one code point once all of its lines have been seen.
    my $recordVariant = sub {
        my ($usv, $hasTC, $hasSC) = @_;
        return if $usv == 0;
        if ($hasTC && !$hasSC) {
            $hanVariant[$usv] = 1;
        } elsif (!$hasTC && $hasSC) {
            $hanVariant[$usv] = 2;
        }
        return;
    };

    my $savedusv = 0;
    my $hasTC = 0;
    my $hasSC = 0;
    while (my $line = <$unihanVariants>) {
        chomp $line;
        if ($line =~ m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
            my ($usv, $kind) = (hex $1, $2);
            if ($usv != $savedusv) {
                $recordVariant->($savedusv, $hasTC, $hasSC);
                $savedusv = $usv;
                $hasTC = 0;
                $hasSC = 0;
            }
            $hasTC = 1 if $kind eq "Traditional";
            $hasSC = 1 if $kind eq "Simplified";
        }
    }
    # the last code point in the file has no following entry to trigger this
    $recordVariant->($savedusv, $hasTC, $hasSC);
    close $unihanVariants;
}

my $timestamp = gmtime();

open my $dataTables, '>', "nsUnicodePropertyData.cpp"
    or die "unable to open nsUnicodePropertyData.cpp for output: $!\n";

my $licenseBlock = q[
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * Derived from the Unicode Character Database by genUnicodePropertyData.pl
 *
 * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
 */
];

my $versionInfo = join("\n", @versionInfo);

print {$dataTables} <<"__END";
$licenseBlock
/*
 * Created on $timestamp from UCD data files with version info:
 *

$versionInfo

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */

#include <stdint.h>
#include "harfbuzz/hb.h"

__END

open my $header, '>', "nsUnicodeScriptCodes.h"
    or die "unable to open nsUnicodeScriptCodes.h for output: $!\n";

print {$header} <<"__END";
$licenseBlock
/*
 * Created on $timestamp from UCD data files with version info:
 *

$versionInfo

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */

#ifndef NS_UNICODE_SCRIPT_CODES
#define NS_UNICODE_SCRIPT_CODES

__END

print {$dataTables} "static const uint32_t sScriptCodeToTag[] = {\n";
for my $i (0 .. $#scriptCodeToTag) {
    printf {$dataTables} " HB_TAG(%s)", $scriptCodeToTag[$i];
    print {$dataTables} $i < $#scriptCodeToTag ? ",\n" : "\n";
}
print {$dataTables} "};\n\n";

our $totalData = 0;     # running total of the table sizes reported by genTables

print {$dataTables} "static const int16_t sMirrorOffsets[] = {\n";
for my $i (0 .. $#offsets) {
    # plain print: the value is data, not a printf format
    print {$dataTables} " $offsets[$i]";
    print {$dataTables} $i < $#offsets ? ",\n" : "\n";
}
print {$dataTables} "};\n\n";

print {$header} "#pragma pack(1)\n\n";

# Initializer text for one nsCharProps1 entry. The trailing space (not the
# comma) is what genTables chops off the end of a page.
sub sprintCharProps1
{
    my $usv = shift;
    return sprintf("{%d,%d,%d}, ", $mirror[$usv], $hangul[$usv], $combining[$usv]);
}
genTables("CharProp1", "struct nsCharProps1 {\n unsigned char mMirrorOffsetIndex:5;\n unsigned char mHangulType:3;\n unsigned char mCombiningClass:8;\n};",
          "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);

# Initializer text for one nsCharProps2 entry.
sub sprintCharProps2
{
    my $usv = shift;
    return sprintf("{%d,%d,%d,%d,%d,%d},",
                   $script[$usv], $eaw[$usv], $category[$usv],
                   $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv]);
}
genTables("CharProp2", "struct nsCharProps2 {\n unsigned char mScriptCode:8;\n unsigned char mEAW:3;\n unsigned char mCategory:5;\n unsigned char mBidiCategory:5;\n unsigned char mXidmod:4;\n signed char mNumericValue:5;\n unsigned char mHanVariant:2;\n};",
          "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);

print {$header} "#pragma pack()\n\n";

# One byte holding the 2-bit Han variant codes of four consecutive code points
# starting at the given one, lowest code point in the lowest bits.
sub sprintHanVariants
{
    my $baseUsv = shift;
    my $val = 0;
    for my $slot (0 .. 3) {
        $val |= $hanVariant[$baseUsv + $slot] << ($slot * 2);
    }
    return sprintf("0x%02x,", $val);
}
genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);

# The full-width counterpart recorded for a code point (0 when there is none).
sub sprintFullWidth
{
    my $usv = shift;
    return sprintf("0x%04x,", $fullWidth[$usv]);
}
genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);

# The packed case-mapping word for a code point.
sub sprintCasemap
{
    my $usv = shift;
    return sprintf("0x%08x,", $casemap[$usv]);
}
genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);

print STDERR "Total data = $totalData\n";

printf {$dataTables} "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
printf {$dataTables} "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
printf {$dataTables} "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
printf {$dataTables} "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
printf {$dataTables} "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;

# Emit one multi-level lookup table to the .cpp file (and its typedef, if any,
# to the header).
#
#   $prefix         name fragment used in the generated identifiers
#   $typedef        C declaration written to the header; '' for none
#   $type           C element type of the s<prefix>Values array
#   $indexBits      log2 of the number of pages per plane
#   $charBits       log2 of the number of code points per page
#   $func           callback: code point => initializer text for one entry
#   $maxPlane       highest plane covered; 0 means plane 0 only, with no
#                   s<prefix>Planes array
#   $bytesPerEntry  entry size, used only for the size report on STDERR
#   $charsPerEntry  number of consecutive code points covered by one entry
#
# Pages with identical text are stored once in s<prefix>Values; per-plane page
# maps that are identical are stored once in s<prefix>Pages; s<prefix>Planes
# holds the page-map index for planes 1..$maxPlane (plane 0 always uses page
# map 0, being the first one created).
sub genTables
{
    my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;

    print {$dataTables} "#define k${prefix}MaxPlane $maxPlane\n";
    print {$dataTables} "#define k${prefix}IndexBits $indexBits\n";
    print {$dataTables} "#define k${prefix}CharBits $charBits\n";

    my $indexLen = 1 << $indexBits;
    my $charsPerPage = 1 << $charBits;
    my %charIndex = ();     # page text => index into @char
    my %pageMapIndex = ();  # page-map text => index into @pageMap
    my @pageMap = ();       # distinct page maps, as comma-separated index lists
    my @char = ();          # distinct pages of entry initializers

    # Page maps are kept as plain lists of decimal indexes, which is exactly
    # the text that gets printed, so no fixed-width packing is involved.
    my @planeMap = (0) x $maxPlane;
    foreach my $plane (0 .. $maxPlane) {
        my @pageIndexes;
        foreach my $page (0 .. $indexLen - 1) {
            my $charValues = "";
            for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
                my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
                $charValues .= $func->($usv);
            }
            # drop the final character of the page (the callbacks end every
            # entry with a separator)
            chop $charValues;

            unless (exists $charIndex{$charValues}) {
                $charIndex{$charValues} = scalar @char;
                push @char, $charValues;
            }
            push @pageIndexes, $charIndex{$charValues};
        }

        my $pageMapText = join(',', @pageIndexes);
        unless (exists $pageMapIndex{$pageMapText}) {
            $pageMapIndex{$pageMapText} = scalar @pageMap;
            push @pageMap, $pageMapText;
        }
        if ($plane > 0) {
            $planeMap[$plane - 1] = $pageMapIndex{$pageMapText};
        }
    }

    if ($maxPlane) {
        print {$dataTables} "static const uint8_t s${prefix}Planes[$maxPlane] = {";
        print {$dataTables} join(',', @planeMap);
        print {$dataTables} "};\n\n";
    }

    my $chCount = scalar @char;
    my $pmBits = $chCount > 255 ? 16 : 8;
    my $pmCount = scalar @pageMap;
    if ($maxPlane == 0) {
        die "there should only be one pageMap entry!" if $pmCount > 1;
        print {$dataTables} "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
    } else {
        print {$dataTables} "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
    }
    for my $i (0 .. $#pageMap) {
        print {$dataTables} ($maxPlane > 0 ? " {" : " ");
        print {$dataTables} $pageMap[$i];
        print {$dataTables} ($maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n");
    }
    print {$dataTables} "};\n\n";

    print {$header} "$typedef\n\n" if $typedef ne '';

    my $pageLen = $charsPerPage / $charsPerEntry;
    print {$dataTables} "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
    for my $i (0 .. $#char) {
        print {$dataTables} " {";
        print {$dataTables} $char[$i];
        print {$dataTables} ($i < $#char ? "},\n" : "}\n");
    }
    print {$dataTables} "};\n\n";

    my $dataSize = $pmCount * $indexLen * $pmBits/8 +
                   $chCount * $pageLen * $bytesPerEntry +
                   $maxPlane;
    $totalData += $dataSize;

    print STDERR "Data for $prefix = $dataSize\n";
}

print {$dataTables} <<"__END";
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */
__END

# buffered write errors only surface at close
close $dataTables or die "error writing nsUnicodePropertyData.cpp: $!\n";

print {$header} "enum {\n";
for my $i (0 .. $#scriptCodeToName) {
    print {$header} " MOZ_SCRIPT_", $scriptCodeToName[$i], " = ", $i, ",\n";
}
print {$header} "\n MOZ_NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
print {$header} "\n MOZ_SCRIPT_INVALID = -1\n";
print {$header} "};\n\n";

print {$header} <<"__END";
#endif
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */
__END

close $header or die "error writing nsUnicodeScriptCodes.h: $!\n";