#!/usr/bin/env perl

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

# This tool is used to extract "special" (one-to-many) case mappings
# into a form that can be used by nsTextRunTransformations.
#
# Inputs (command line):
#   $ARGV[0]  UnicodeData.txt    - simple case mappings, general categories
#                                  and canonical decompositions
#   $ARGV[1]  SpecialCasing.txt  - one-to-many case mappings
#
# Outputs:
#   stdout                       - the generated nsSpecialCasingData.cpp
#   all-{lower,upper,title}.html and the matching -ref.html files, written
#   to the current directory.

use strict;

if ($#ARGV != 1) {
  print <<__EOT;
# Run this tool using a command line of the form
#
#     perl genSpecialCasingData.pl UnicodeData.txt SpecialCasing.txt
#
# The nsSpecialCasingData.cpp file will be written to standard output.
#
# This tool will also write up-to-date versions of the test files
#     all-{upper,lower,title}.html
# and corresponding -ref files in the current directory.
#
__EOT
  exit 0;
}

# Simple (and, later, special) mappings keyed by numeric code point; the
# values are space-separated hex code points as found in the data files.
my %allLower;
my %allUpper;
my %allTitle;
# Canonical decomposition ("XXXX YYYY") => composed character ("ZZZZ").
my %compositions;
# General category of each code point, e.g. "Lu".
my %gc;

open(my $unicodeData, '<', $ARGV[0])
  or die "can't open $ARGV[0] (should be UnicodeData.txt): $!\n";
while (<$unicodeData>) {
  chomp;
  my @fields = split /;/;
  next unless @fields > 2;          # skip blank or malformed lines
  next if ($fields[1] =~ /</);      # ignore ranges etc
  my $usv = hex($fields[0]);
  # split drops trailing empty fields, so these may be undefined; an
  # undefined value compares equal to '' here and is simply skipped.
  $allUpper{$usv} = $fields[12] if $fields[12] ne '';
  $allLower{$usv} = $fields[13] if $fields[13] ne '';
  $allTitle{$usv} = $fields[14] if $fields[14] ne '';
  $gc{$usv} = $fields[2];
  # we only care about non-singleton canonical decomps
  my $decomp = $fields[5];
  next if $decomp eq '' or $decomp =~ /</ or not $decomp =~ / /;
  $compositions{$decomp} = sprintf("%04X", $usv);
}
close($unicodeData);

# One-to-many mappings only, keyed by numeric code point.
my %specialLower;
my %specialUpper;
my %specialTitle;
# Trailing comment of each SpecialCasing.txt entry (the character name).
my %charName;
# "SpecialCasing-x.y.z.txt" / "Date: ..." lines, echoed into the output.
my @headerLines;

open(my $specialCasing, '<', $ARGV[1])
  or die "can't open $ARGV[1] (should be SpecialCasing.txt): $!\n";
while (<$specialCasing>) {
  chomp;
  # List-context capture: $comment is undef when the line has no comment,
  # rather than whatever $1 happened to hold from an earlier match.
  my ($comment) = m/#\s*(.+)$/;
  if (defined $comment && $comment =~ /^(SpecialCasing-|Date:)/) {
    push @headerLines, $comment;
    next;
  }
  s/#.*//;
  s/;\s*$//;
  next if $_ eq '';
  my @fields = split /; */;
  # Exactly four fields means an unconditional mapping; entries with a
  # fifth (condition) field are not handled by this tool.
  next unless (scalar @fields) == 4;
  my $usv = hex($fields[0]);
  addIfSpecial(\%specialLower, $usv, $fields[1]);
  addIfSpecial(\%specialTitle, $usv, $fields[2]);
  addIfSpecial(\%specialUpper, $usv, $fields[3]);
  $charName{$usv} = $comment;
}
close($specialCasing);

print <<__END__;
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

/* Auto-generated from files in the Unicode Character Database
   by genSpecialCasingData.pl - do not edit! */

#include "nsSpecialCasingData.h"
#include "mozilla/Util.h" // for ArrayLength
#include <stdlib.h> // for bsearch

__END__
print "/* $_ */\n" foreach @headerLines;

print <<__END__;

using mozilla::unicode::MultiCharMapping;

__END__

printMappings('Lower', \%specialLower);
printMappings('Upper', \%specialUpper);
printMappings('Title', \%specialTitle);

print <<__END__;
static int CompareMCM(const void* aKey, const void* aElement)
{
  const uint32_t ch = *static_cast<const uint32_t*>(aKey);
  const MultiCharMapping* mcm = static_cast<const MultiCharMapping*>(aElement);
  return int(ch) - int(mcm->mOriginalChar);
}

#define MAKE_SPECIAL_CASE_ACCESSOR(which) \\
  const MultiCharMapping* \\
  Special##which(uint32_t aChar) \\
  { \\
    const void* p = bsearch(&aChar, CaseSpecials_##which, \\
                            mozilla::ArrayLength(CaseSpecials_##which), \\
                            sizeof(MultiCharMapping), CompareMCM); \\
    return static_cast<const MultiCharMapping*>(p); \\
  }

namespace mozilla {
namespace unicode {

MAKE_SPECIAL_CASE_ACCESSOR(Lower)
MAKE_SPECIAL_CASE_ACCESSOR(Upper)
MAKE_SPECIAL_CASE_ACCESSOR(Title)

} // namespace unicode
} // namespace mozilla
__END__

# From here on the all* tables hold the full mapping of every character:
# the special (multi-character) mapping replaces the simple one.
addSpecialsTo(\%allLower, \%specialLower);
addSpecialsTo(\%allUpper, \%specialUpper);
addSpecialsTo(\%allTitle, \%specialTitle);

my $testFont = "../fonts/dejavu-sans/DejaVuSans.ttf";
genTest('lower', \%allLower);
genTest('upper', \%allUpper);
genTitleTest();

# Print one C++ table "CaseSpecials_<whichMapping>" to stdout.
#   $whichMapping - 'Lower', 'Upper' or 'Title'
#   $hash         - ref to hash: code point => space-separated hex mapping
# Entries are sorted by code point because the generated accessor looks
# them up with bsearch.
sub printMappings {
  my ($whichMapping, $hash) = @_;
  print "static const MultiCharMapping CaseSpecials_${whichMapping}[] = {\n";
  foreach my $key (sort { $a <=> $b } keys %$hash) {
    my @chars = split(/ /, $hash->{$key});
    # Each table row has room for three code points; refuse to truncate.
    die sprintf("mapping for U+%04X has more than 3 characters: %s\n",
                $key, $hash->{$key}) if @chars > 3;
    # Missing trailing characters are emitted as 0x0000.
    my @codes = map { defined $_ ? hex($_) : 0 } @chars[0 .. 2];
    printf "  { 0x%04x, {0x%04x, 0x%04x, 0x%04x} }, // %s\n", $key,
           @codes, "$charName{$key}";
  }
  print "};\n\n";
}

# Record $mapping for $usv in %$hash if it is a multi-character mapping.
#   $hash    - ref to the special-mapping hash to update
#   $usv     - numeric code point being mapped
#   $mapping - space-separated hex code points from SpecialCasing.txt
# A leading pair that is the canonical decomposition of some character is
# replaced by that composed character.
sub addIfSpecial {
  my ($hash, $usv, $mapping) = @_;
  return unless $mapping =~ / /;
  # only do compositions that start with the initial char
  # Sorted so the result does not depend on hash ordering; the lookahead
  # keeps a decomposition from matching the prefix of a longer code point.
  foreach my $decomp (sort keys %compositions) {
    $mapping =~ s/^\Q$decomp\E(?![0-9A-Fa-f])/$compositions{$decomp}/;
  }
  $hash->{$usv} = $mapping;
}

# Copy every entry of %$specials into %$hash, overwriting existing keys.
sub addSpecialsTo {
  my ($hash, $specials) = @_;
  foreach my $key (keys %$specials) {
    $hash->{$key} = $specials->{$key};
  }
}

# Turn a space-separated list of hex code points into HTML character
# references, e.g. "0053 0053" => "&#x0053;&#x0053;".
sub charRefs {
  my ($mapping) = @_;
  return join('', map { sprintf("&#x%s;", $_) } split(/ /, $mapping));
}

# Write one HTML test page.
#   $fileName   - file to create in the current directory
#   $extraStyle - extra CSS appended to the rule for <p> ('' for none)
#   $keys       - ref to the ordered list of code points, one per line
#   $render     - code ref: code point => markup for that line
# Each line is followed by the character name as an HTML comment when one
# was recorded from SpecialCasing.txt. Dies if the file cannot be written.
sub writeTestPage {
  my ($fileName, $extraStyle, $keys, $render) = @_;
  open(my $out, '>', $fileName) or die "can't write $fileName: $!\n";
  print {$out} <<__END__;
<!DOCTYPE html>
<html>
 <head>
  <meta http-equiv="Content-type" content="text/html; charset=utf-8">
  <style type="text/css">
   \@font-face { font-family: foo; src: url($testFont); }
   p { font-family: foo;$extraStyle }
  </style>
 </head>
 <body>
  <p>
__END__
  foreach my $key (@$keys) {
    print {$out} $render->($key);
    print {$out} " <!-- $charName{$key} -->" if exists $charName{$key};
    print {$out} "\n";
  }
  print {$out} <<__END__;
  </p>
 </body>
</html>
__END__
  close($out) or die "error writing $fileName: $!\n";
}

# Write all-<whichMapping>.html (every mapped character, styled with
# text-transform: <whichMapping>case) and all-<whichMapping>-ref.html (the
# expected result written out literally).
#   $whichMapping - 'lower' or 'upper'
#   $hash         - ref to hash: code point => space-separated hex mapping
sub genTest {
  my ($whichMapping, $hash) = @_;
  my @keys = sort { $a <=> $b } keys %$hash;
  writeTestPage("all-${whichMapping}.html",
                " text-transform: ${whichMapping}case;", \@keys,
                sub { sprintf("&#x%04X;", $_[0]) });
  writeTestPage("all-${whichMapping}-ref.html", "", \@keys,
                sub { charRefs($hash->{$_[0]}) });
}

# Write all-title.html (each character with a titlecase mapping followed
# by "x", styled with text-transform: capitalize) and all-title-ref.html
# (the expected result).
sub genTitleTest {
  my @keys = sort { $a <=> $b } keys %allTitle;
  writeTestPage("all-title.html", " text-transform: capitalize;", \@keys,
                sub { sprintf("&#x%04X;x", $_[0]) });
  writeTestPage("all-title-ref.html", "", \@keys, sub {
    my ($key) = @_;
    # capitalize is only applied to characters with GC=L* or N*...
    if ($gc{$key} =~ /^[LN]/) {
      # ...and those that are already uppercase are not transformed
      my $first = exists $allUpper{$key}
                    ? charRefs($allTitle{$key})
                    : sprintf("&#x%04X;", $key);
      return $first . "x";
    }
    # Otherwise the character is left alone and the following "x" is
    # expected to be capitalized instead.
    return sprintf("&#x%04X;X", $key);
  });
}