diff -r 000000000000 -r 6474c204b198 intl/unicharutil/tools/gentransliterate.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/intl/unicharutil/tools/gentransliterate.pl Wed Dec 31 06:09:35 2014 +0100 @@ -0,0 +1,477 @@ +#!/usr/bin/perl +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +$header = <//eg; + $t = foldcombining($t); + return rdecompose( $table{$t}); + } + return $dec; +} +sub decompose +{ + my ($removeprefix, $dec) = (@_); + $removeprefix .= " "; + + $dec =~ s/$removeprefix//eg; + if($dec eq "0020") { + $dec = "\\u0020"; + } elsif($dec eq "005C") { + $dec = "\\u005C"; + } else { + $k = "\/"; + $dec =~ s/2044/$k/eg; + $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg; + $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g; + $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg; + $dec =~ s/ //eg; + } + return $dec; +} + +###################################################################### +# +# Open the unicode database file +# +###################################################################### +open ( UNICODATA , "< UnicodeData-Latest.txt") + || die "cannot find UnicodeData-Latest.txt"; + +open ( UNICODATA2 , "< UnicodeData-Latest.txt") + || die "cannot find UnicodeData-Latest.txt"; +###################################################################### +# +# Open the output file +# +###################################################################### +open ( OUT , "> ../tables/transliterate.properties") + || die "cannot open output ../tables/transliterate.properties file"; + +print OUT $header; + +# remove comments from $handcoded +$handcoded =~ s/^#[^#].*\n//mg; +print OUT $handcoded; + +###################################################################### +# +# Process the file line by line +# +###################################################################### 
# NOTE(review): this section reached review with its angle-bracketed text
# missing: both read loops appeared as `while()`, the format-code tests as
# `//`, the matching decompose() prefixes as "", and the brace count shows
# that one enclosing `if` was lost.  The handle reads and the enclosing `if`
# are restored below.  The sixteen tag patterns, their prefixes and the
# " 0020" literal are kept exactly as received and MUST be restored from the
# upstream script before this is run (an empty pattern in Perl re-uses the
# last successful pattern, so the chain is not meaningful as it stands).
#
# @f, $u, $cmt, $dec, $udec, $d1..$d3 and %table are deliberately left as
# package variables: subs defined elsewhere in this script (rdecompose()
# reads %table) may depend on them.

# Pass 1: remember the decomposition field of every character above U+0100
# in %table, keyed by the hex code point, for rdecompose().
# UNICODATA and UNICODATA2 are opened on the same file, so it presumably does
# not matter which handle feeds which pass -- confirm against upstream.
while (<UNICODATA2>) {
    chomp;    # chop would eat a real character from an unterminated last line
    @f = split(/;/, $_);

    # The code point is field 0.  This used to be hex($u), but $u is only
    # assigned in the second pass, so $udec was always 0 here and %table
    # was never filled.
    $udec = hex($f[0]);
    if (($udec > 256) && ($f[5] ne "")) {
        $table{$f[0]} = $f[5];
    }
}

# Pass 2: write one entry for each non-ASCII character that has a usable
# decomposition or numeric value.
while (<UNICODATA>) {
    chomp;

    ######################################################################
    #
    # Get value from fields
    #
    ######################################################################
    @f    = split(/;/, $_);
    $u    = $f[0];    # The unicode value
    $cmt  = $f[1];    # The comment
    $dec  = $f[5];    # The decomposed value
    $d1   = $f[6];
    $d2   = $f[7];
    $d3   = $f[8];
    $udec = hex($u);

    # Only non-ASCII characters are processed.
    next unless $udec > 128;

    if ($dec ne "") {
        # have decomposition
        # NOTE(review): enclosing test restored; /</ is inferred from the
        # "decomposition without format code" else-branch -- confirm.
        if ($dec =~ /</) {
            # have format code: one branch per tag, order as received
            if ($dec =~ //) {
                output($u, $cmt, $udec, decompose("", $dec));
            } elsif ($dec =~ //) {
                # ignore non ASCII decomposition
                # warning($_);
            } elsif ($dec =~ //) {
                # parenthesised output -- presumably the circled forms
                output($u, $cmt, $udec, decompose("", "(" . $dec . ")"));
            } elsif ($dec =~ //) {
                output($u, $cmt, $udec, decompose("", $dec));
            } elsif ($dec =~ //) {
                output($u, $cmt, $udec, decompose("", $dec));
            } elsif ($dec =~ //) {
                # warning($_);
            } elsif ($dec =~ //) {
                # "^(...)" -- presumably superscripts
                output($u, $cmt, $udec, "^(" . decompose("", $dec) . ")");
            } elsif ($dec =~ //) {
                # "v(...)" -- presumably subscripts
                output($u, $cmt, $udec, "v(" . decompose("", $dec) . ")");
            } elsif ($dec =~ //) {
                output($u, $cmt, $udec, decompose("", $dec));
            } elsif ($dec =~ //) {
                # ignore
                # warning($_);
            } elsif ($dec =~ //) {
                output($u, $cmt, $udec, decompose("", $dec));
            } elsif ($dec =~ //) {
                # ignore
                # warning($_);
            } elsif ($dec =~ //) {
                # ignore
                # warning($_);
            } elsif ($dec =~ //) {
                # ignore
                # warning($_);
            } elsif ($dec =~ //) {
                # ignore
                # warning($_);
            } elsif ($dec =~ //) {
                # Only a decomposition to a plain space is kept.
                if ($dec eq " 0020") {
                    output($u, $cmt, $udec, "\\u0020");
                } else {
                    # ignore
                    # warning($_);
                }
            } else {
                warning($_);
            }
        } else {
            # decomposition without format code
            if ($cmt =~ /LATIN/) {
                $dec = foldcombining($dec);
                output($u, $cmt, $udec, decompose("", $dec));
            }
            # Every other script (CYRILLIC, GREEK, ARABIC, CJK, HEBREW,
            # DEVANAGARI, BENGALI, GURMUKHI, ORIYA, TAMIL, TELUGU, KANNADA,
            # MALAYALAM, SINHALA, TIBETAN, MYANMAR, KATAKANA, HIRAGANA and
            # anything else) is ignored.
            # warning($_);
        }
    } else {
        # do not have decomposition
        if ($d1 ne "") {
            # are numeric characters
            output($u, $cmt, $udec, $d1);
        } elsif (($d2 ne "") || ($d3 ne "")) {
            # $d2 takes precedence over $d3; circled characters are shown
            # as "(n)", all others as "[n]".
            my $value = ($d2 ne "") ? $d2 : $d3;
            if ($cmt =~ /CIRCLED/) {
                output($u, $cmt, $udec, "(" . $value . ")");
            } else {
                output($u, $cmt, $udec, "[" . $value . "]");
            }
        }
        # otherwise: not numeric characters, nothing to emit
    }
}

######################################################################
#
# Close files
#
######################################################################
# The original closed a handle named UNIDATA, which is never opened, so
# neither input handle was closed.
close(UNICODATA);
close(UNICODATA2);
# Buffered write errors only surface at close, so check it.
close(OUT)
    || die "cannot close output ../tables/transliterate.properties file: $!";