#!/usr/bin/perl
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# NOTE(review): this copy of the script is damaged here.  Everything from
# the start of the `$header` assignment up to the tail of rdecompose() was
# lost (apparently every angle-bracketed span was stripped during
# extraction).  That presumably included the definitions of $header and
# $handcoded and the subs foldcombining(), output(), warning() and the
# head of rdecompose(), all of which the rest of the script uses.
# Restore them from the upstream file.  The surviving text is kept
# verbatim below so nothing is silently dropped:
#
#   $header = <//eg;
#   $t = foldcombining($t);
#   return rdecompose( $table{$t});
#   }
#   return $dec;
#   }

# decompose($removeprefix, $dec)
#
# Turn one decomposition string into the text written to the output table.
#
#   $removeprefix  literal text to delete from $dec; a single space is
#                  appended to it before it is removed
#   $dec           decomposition string made of 4-digit upper-case hex
#                  code points
#
# Returns the converted string:
#   * exactly "0020" or "005C" is returned as the escape \u0020 / \u005C;
#   * otherwise "2044" becomes "/", every 4-hex-digit group is passed
#     through rdecompose() (defined elsewhere in this file), each remaining
#     4-hex-digit group is prefixed with \u, \u0000-\u007F escapes are
#     turned into the raw character, and finally all spaces are removed.
sub decompose
{
    my ($removeprefix, $dec) = @_;
    $removeprefix .= " ";

    # The prefix is literal text, not a pattern: quote it so that
    # metacharacters in a caller-supplied prefix cannot change the match.
    $dec =~ s/\Q$removeprefix\E//g;

    # A lone space or backslash must stay escaped.  In the general branch
    # below \u0020 would be unescaped to " " and then deleted by the final
    # space-stripping step.
    if ($dec eq "0020") {
        $dec = "\\u0020";
    } elsif ($dec eq "005C") {
        $dec = "\\u005C";
    } else {
        $dec =~ s{2044}{/}g;
        $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
        $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
        # Code points 0x00-0x7F are emitted as the character itself.
        $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
        $dec =~ s/ //g;
    }
    return $dec;
}

######################################################################
#
# Open the unicode database file
#
######################################################################
# The same file is opened twice because the script reads it in two
# passes.  The bareword handle names are kept: the rest of the script
# refers to them by name.
open(UNICODATA, '<', 'UnicodeData-Latest.txt')
    or die "cannot find UnicodeData-Latest.txt: $!";

open(UNICODATA2, '<', 'UnicodeData-Latest.txt')
    or die "cannot find UnicodeData-Latest.txt: $!";

######################################################################
#
# Open the output file
#
######################################################################
open(OUT, '>', '../tables/transliterate.properties')
    or die "cannot open output ../tables/transliterate.properties file: $!";

print OUT $header;

# remove comments from $handcoded
# Only lines that start with a single '#' are dropped; a line starting
# with "##" does not match the pattern and is written out unchanged.
$handcoded =~ s/^#[^#].*\n//mg;
print OUT $handcoded;

######################################################################
#
# Process the file line by line
#
######################################################################

# First pass: remember the decomposition (field 5) of every code point
# above 256 in %table, keyed by the hex code point (field 0).
#
# NOTE(review): in this copy both loop conditions are the empty `while()`;
# the readline operator was evidently stripped.  They are restored here
# to read the two handles the script opens on UnicodeData-Latest.txt
# (UNICODATA, UNICODATA2) -- confirm against the upstream file.
while (<UNICODATA>) {
    chomp;      # was chop, which eats a data character when the last
                # line has no trailing newline
    @f = split(/;/, $_);
    # Was hex($u), but $u is not assigned until the second pass, so $udec
    # was always 0 and %table was never filled.
    $udec = hex($f[0]);
    if (($udec > 256) && ($f[5] ne "")) {
        $table{$f[0]} = $f[5];
    }
}

# Second pass: emit one entry per non-ASCII character that has either a
# decomposition or a numeric value.
while (<UNICODATA2>) {
    chomp;
    ######################################################################
    #
    # Get value from fields
    #
    ######################################################################
    # These stay package globals (no `my`): subs defined elsewhere in the
    # file may read them -- not verifiable from this copy.
    @f = split(/;/, $_);
    $u    = $f[0];   # The unicode value
    $cmt  = $f[1];   # The comment
    $dec  = $f[5];   # The decomposed value
    $d1   = $f[6];
    $d2   = $f[7];
    $d3   = $f[8];
    $udec = hex($u);

    if ($udec > 128)
    {
        # not ASCII
        if ($dec ne "")
        {
            # have decomposition
            #
            # NOTE(review): this copy is missing one `if (...) {` level
            # here (its closing brace and the "decomposition without
            # format code" else-branch survive).  The test below is a
            # reconstruction: presumably it checked for an angle-bracketed
            # format code -- confirm against the upstream file.
            if ($dec =~ /</) {
                # NOTE(review): the sixteen patterns in this chain, and
                # the first argument of each decompose() call, are empty
                # in this copy -- presumably angle-bracketed format tags
                # that were stripped.  UnicodeData defines sixteen
                # compatibility formatting tags, which may be what the
                # branches tested.  They are left exactly as found; the
                # chain cannot work until they are restored from upstream.
                if ($dec =~ //) {
                    output($u, $cmt, $udec, decompose("", $dec));
                } elsif ($dec =~ //) {
                    # ignore non ASCII decomposition
                    # warning($_);
                } elsif ($dec =~ //) {
                    output($u, $cmt, $udec, decompose("", "(" . $dec . ")"));
                } elsif ($dec =~ //) {
                    output($u, $cmt, $udec, decompose("", $dec));
                } elsif ($dec =~ //) {
                    output($u, $cmt, $udec, decompose("", $dec));
                } elsif ($dec =~ //) {
                    # warning($_);
                } elsif ($dec =~ //) {
                    output($u, $cmt, $udec, "^(" . decompose("", $dec) . ")");
                } elsif ($dec =~ //) {
                    output($u, $cmt, $udec, "v(" . decompose("", $dec) . ")");
                } elsif ($dec =~ //) {
                    output($u, $cmt, $udec, decompose("", $dec));
                } elsif ($dec =~ //) {
                    # ignore
                    # warning($_);
                } elsif ($dec =~ //) {
                    output($u, $cmt, $udec, decompose("", $dec));
                } elsif ($dec =~ //) {
                    # ignore
                    # warning($_);
                } elsif ($dec =~ //) {
                    # ignore
                    # warning($_);
                } elsif ($dec =~ //) {
                    # ignore
                    # warning($_);
                } elsif ($dec =~ //) {
                    # ignore
                    # warning($_);
                } elsif ($dec =~ //) {
                    if ($dec eq " 0020")
                    {
                        output($u, $cmt, $udec, "\\u0020");
                    } else {
                        # ignore
                        # warning($_);
                    }
                } else {
                    warning($_);
                }
            } else {
                # decomposition without format code
                #
                # Only LATIN characters are transliterated.  The original
                # chain had explicit no-op branches for CYRILLIC, GREEK,
                # ARABIC, CJK, HEBREW, DEVANAGARI, BENGALI, GURMUKHI,
                # ORIYA, TAMIL, TELUGU, KANNADA, MALAYALAM, SINHALA,
                # TIBETAN, MYANMAR, KATAKANA, HIRAGANA and "anything
                # else"; each held only a commented-out warning($_).
                if ($cmt =~ /LATIN/) {
                    $dec = foldcombining($dec);
                    output($u, $cmt, $udec, decompose("", $dec));
                }
            }
        } else {
            # do not have decomposition
            if ($d1 ne "")
            {
                # are numeric characters
                output($u, $cmt, $udec, $d1);
            } else {
                # Fall back to field 7, then field 8; nothing is written
                # when both are empty (not a numeric character).
                my $value = ($d2 ne "") ? $d2 : $d3;
                if ($value ne "") {
                    if ($cmt =~ /CIRCLED/) {
                        # circled
                        output($u, $cmt, $udec, "(" . $value . ")");
                    } else {
                        # others, use [ ]
                        output($u, $cmt, $udec, "[" . $value . "]");
                    }
                }
            } # end of no decomposition
        } # end of have/not decomposition
    }
}

######################################################################
#
# Close files
#
######################################################################
# Was close(UNIDATA), a handle that is never opened; the input handles
# are UNICODATA and UNICODATA2.
close(UNICODATA);
close(UNICODATA2);
# Buffered write errors only surface at close, so check it.
close(OUT)
    or die "cannot close output ../tables/transliterate.properties file: $!";