#!/usr/bin/perl
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

|
7 $header = <<END_OF_HEADER; |
|
8 # This Source Code Form is subject to the terms of the Mozilla Public |
|
9 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11 |
|
12 # |
|
13 # THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl |
|
14 # PLEASE DO NOT MODIFY THIS FILE BY HAND |
|
15 # |
|
16 entity.list.name=transliterate |
|
17 entity.169=(c) |
|
18 # |
|
19 # |
|
20 # Here are the windows-1252 characters from the range 0x80 - 0x9F |
|
21 # |
|
22 END_OF_HEADER |
|
23 |
|
24 $handcoded = <<END_OF_HANDCODED; |
|
25 # EURO SIGN |
|
26 entity.8364=EUR |
|
27 # SINGLE LOW-9 QUOTATION MARK |
|
28 entity.8218=, |
|
29 # LATIN SMALL LETTER F WITH HOOK |
|
30 entity.402=f |
|
31 # DOUBLE LOW-9 QUOTATION MARK |
|
32 entity.8222=" |
|
33 # DAGGER |
|
34 entity.8224=+ |
|
35 # DOUBLE DAGGER |
|
36 entity.8225=++ |
|
37 # MODIFIER LETTER CIRCUMFLEX ACCENT |
|
38 entity.710=^ |
|
39 # PER MILLE SIGN |
|
40 entity.8240=0/00 |
|
41 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
|
42 entity.8249=< |
|
43 # LATIN CAPITAL LIGATURE OE |
|
44 entity.338=OE |
|
45 # LEFT SINGLE QUOTATION MARK |
|
46 entity.8216=' |
|
47 # RIGHT SINGLE QUOTATION MARK |
|
48 entity.8217=' |
|
49 # LEFT DOUBLE QUOTATION MARK |
|
50 entity.8220=" |
|
51 # RIGHT DOUBLE QUOTATION MARK |
|
52 entity.8221=" |
|
53 # BULLET |
|
54 entity.8226=. |
|
55 # EN DASH |
|
56 entity.8211=-- |
|
57 # EM DASH |
|
58 entity.8212=--- |
|
59 # SMALL TILDE |
|
60 entity.732=~ |
|
61 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
|
62 entity.8250=> |
|
63 # LATIN SMALL LIGATURE OE |
|
64 entity.339=oe |
|
65 # U+2000 EN QUAD |
|
66 entity.8192=\\u0020 |
|
67 # U+2001 EM QUAD |
|
68 entity.8193=\\u0020 |
|
69 # U+2010 HYPHEN |
|
70 entity.8208=- |
|
71 # U+2011 NON-BREAKING HYPHEN |
|
72 entity.8209=- |
|
73 # U+2012 FIGURE DASH |
|
74 entity.8210=- |
|
75 # U+2015 HORIZONTAL BAR |
|
76 entity.8213=-- |
|
77 # U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma) |
|
78 entity.8203= |
|
79 # U+2061, ApplyFunction, character showing function application in presentation tagging |
|
80 entity.8289= |
|
81 # U+2062, InvisibleTimes, marks multiplication when it is understood without a mark |
|
82 entity.8290= |
|
83 # U+2146, DifferentialD, d for use in differentials, e.g., within integrals |
|
84 entity.8518=d |
|
85 # U+2212, MINUS SIGN, official Unicode minus sign |
|
86 entity.8722=- |
|
87 # Hebrew punctuation |
|
88 # U+05BE HEBREW PUNCTUATION MAQAF |
|
89 entity.1470=- |
|
90 # U+05C0 HEBREW PUNCTUATION PASEQ |
|
91 entity.1472=| |
|
92 # U+05C3 HEBREW PUNCTUATION SOF PASUQ |
|
93 entity.1475=: |
|
94 # U+05F3 HEBREW PUNCTUATION GERESH |
|
95 entity.1523=' |
|
96 # U+05F4 HEBREW PUNCTUATION GERSHAYIM |
|
97 entity.1524=" |
|
98 ## |
|
99 ## End of hand coded section |
|
100 ## Below are generated from the unicode character database |
|
101 ## |
|
102 END_OF_HANDCODED |
|
103 |
|
104 @table = (); |
|
105 sub FromLatinComment |
|
106 { |
|
107 my ($cmt) = (@_); |
|
108 $char = ""; |
|
109 if($cmt =~ /PRECEDED BY APOSTROPHE/) { |
|
110 $char = "\'"; |
|
111 } |
|
112 if($cmt =~ /CAPITAL LETTER ([A-Z]*)/) { |
|
113 $char = $char . $1; |
|
114 } |
|
115 if($cmt =~ /SMALL LETTER ([A-Z]*)/) { |
|
116 $char = $char . lc($1); |
|
117 } |
|
118 @f = split(/ / , $cmt); |
|
119 while($item = shift @f) { |
|
120 if($item eq "DOT") { |
|
121 $char .= "."; |
|
122 } elsif ($item eq "DIAERESIS") { |
|
123 $char .= "\""; |
|
124 } elsif ($item eq "BREVE") { |
|
125 $char .= "("; |
|
126 } elsif ($item eq "ACUTE") { |
|
127 $char .= "\'"; |
|
128 } elsif ($item eq "GRAVE") { |
|
129 $char .= "`"; |
|
130 } elsif ($item eq "TILDE") { |
|
131 $char .= "~"; |
|
132 } elsif ($item eq "CARON") { |
|
133 $char .= "("; |
|
134 } elsif ($item eq "HOOK") { |
|
135 $char .= "?"; |
|
136 } elsif ($item eq "CEDILLA") { |
|
137 $char .= ","; |
|
138 } elsif ($item eq "MACRON") { |
|
139 $char .= "-"; |
|
140 } elsif ($item eq "CIRCUMFLEX") { |
|
141 $char .= "^"; |
|
142 } elsif ($item eq "RING") { |
|
143 $char .= "*"; |
|
144 } elsif ($item eq "OGONEK") { |
|
145 $char .= ";"; |
|
146 } elsif ($item eq "LINE") { |
|
147 $char .= "_"; |
|
148 } elsif ($item eq "COMMA") { |
|
149 $char .= ","; |
|
150 } elsif ($item eq "STROKE") { |
|
151 $char .= "/"; |
|
152 } elsif ($item eq "HORN") { |
|
153 $char .= "+"; |
|
154 } elsif ($item =~ /^(LATIN|CAPITAL|SMALL|LETTER|WITH|ABOVE|BELOW|INVERTED|MIDDLE|AND|BY|APOSTROPHE|[A-Z])$/) { |
|
155 # ignore |
|
156 } else { |
|
157 #print "AAAA $item\n"; |
|
158 } |
|
159 } |
|
160 |
|
161 return $char; |
|
162 } |
|
163 sub warning |
|
164 { |
|
165 my ($warning) = (@_); |
|
166 print "WARNING: $warning \n"; |
|
167 } |
|
168 sub doutput |
|
169 { |
|
170 my ($u, $cmt, $udec, $str) = (@_); |
|
171 # don't print out comments - for debugging purposes only |
|
172 # print "# U+$u $cmt\n"; |
|
173 print "entity.$udec=$str\n"; |
|
174 } |
|
175 sub output |
|
176 { |
|
177 my ($u, $cmt, $udec, $str) = (@_); |
|
178 if(decomposeIntoNonASCII($str)) { |
|
179 if(($cmt =~ "LATIN") && ($cmt =~ "LETTER") && !($cmt =~ "LONG")) { |
|
180 $str = FromLatinComment($cmt); |
|
181 output($u,$cmt,$udec,$str); |
|
182 } |
|
183 } else { |
|
184 # don't print out comments - for debugging purposes only |
|
185 # print OUT "# U+$u $cmt\n"; |
|
186 print OUT "entity.$udec=$str\n"; |
|
187 } |
|
188 } |
|
189 |
|
190 sub decomposeIntoNonASCII |
|
191 { |
|
192 my ($dec) = (@_); |
|
193 return $dec =~ /\\u([1-9A-F][0-9A-F][0-9A-F]|[0-9A-F][1-9A-F][0-9A-F]|00[8-9A-F])[0-9A-F]/; |
|
194 } |
|
195 |
|
196 sub foldcombining |
|
197 { |
|
198 my ($dec) = (@_); |
|
199 $grave = "0060"; |
|
200 $acute = "0027"; |
|
201 $hat = "005E"; |
|
202 $hat = "005E"; |
|
203 $tilde = "007E"; |
|
204 $overscore = "002D"; ## should be 00AF but we can only handle ASCII now |
|
205 $umlaut = "0022"; ## should be 00A8 but we can only handle ASCII now |
|
206 $doubleacute = "0022"; |
|
207 $dot = "002E"; |
|
208 $doublegrave = "0060 0060"; |
|
209 |
|
210 |
|
211 $dec =~ s/00A8/$umlaut/eg; |
|
212 $dec =~ s/00AF/$overscore/eg; |
|
213 # $dec =~ s/00B0//eg; |
|
214 $dec =~ s/00B4/$acute/eg; |
|
215 $dec =~ s/00B7/$dot/eg; |
|
216 # $dec =~ s/00B8//eg; |
|
217 $dec =~ s/0300/$grave/eg; |
|
218 $dec =~ s/0301/$acute/eg; |
|
219 $dec =~ s/0302/$hat/eg; |
|
220 $dec =~ s/0303/$tilde/eg; |
|
221 $dec =~ s/0304/$overscore/eg; |
|
222 $dec =~ s/0305/$overscore/eg; |
|
223 #$dec =~ s/0306/?/eg; |
|
224 $dec =~ s/0307/$dot/eg; |
|
225 $dec =~ s/0308/$umlaut/eg; |
|
226 #$dec =~ s/0309/?/eg; |
|
227 #$dec =~ s/030A/?/eg; |
|
228 $dec =~ s/030B/$doubleacute/eg; |
|
229 #$dec =~ s/030C/?/eg; |
|
230 $dec =~ s/030D/$acute/eg; |
|
231 $dec =~ s/030E/$doubleacute/eg; |
|
232 $dec =~ s/030F/$doublegrave/eg; |
|
233 |
|
234 # $dec =~ s/03[0-9A-F][0-9A-F]//eg; ## drop others |
|
235 return $dec; |
|
236 } |
|
237 sub rdecompose |
|
238 { |
|
239 my ($dec) = (@_); |
|
240 if(exists $table{$dec}) { |
|
241 $t = $table{$dec}; |
|
242 $t =~ s/<[a-zA-Z]*>//eg; |
|
243 $t = foldcombining($t); |
|
244 return rdecompose( $table{$t}); |
|
245 } |
|
246 return $dec; |
|
247 } |
|
248 sub decompose |
|
249 { |
|
250 my ($removeprefix, $dec) = (@_); |
|
251 $removeprefix .= " "; |
|
252 |
|
253 $dec =~ s/$removeprefix//eg; |
|
254 if($dec eq "0020") { |
|
255 $dec = "\\u0020"; |
|
256 } elsif($dec eq "005C") { |
|
257 $dec = "\\u005C"; |
|
258 } else { |
|
259 $k = "\/"; |
|
260 $dec =~ s/2044/$k/eg; |
|
261 $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg; |
|
262 $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g; |
|
263 $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg; |
|
264 $dec =~ s/ //eg; |
|
265 } |
|
266 return $dec; |
|
267 } |
|
268 |
|
269 ###################################################################### |
|
270 # |
|
271 # Open the unicode database file |
|
272 # |
|
273 ###################################################################### |
|
274 open ( UNICODATA , "< UnicodeData-Latest.txt") |
|
275 || die "cannot find UnicodeData-Latest.txt"; |
|
276 |
|
277 open ( UNICODATA2 , "< UnicodeData-Latest.txt") |
|
278 || die "cannot find UnicodeData-Latest.txt"; |
|
279 ###################################################################### |
|
280 # |
|
281 # Open the output file |
|
282 # |
|
283 ###################################################################### |
|
284 open ( OUT , "> ../tables/transliterate.properties") |
|
285 || die "cannot open output ../tables/transliterate.properties file"; |
|
286 |
|
287 print OUT $header; |
|
288 |
|
289 # remove comments from $handcoded |
|
290 $handcoded =~ s/^#[^#].*\n//mg; |
|
291 print OUT $handcoded; |
|
292 |
|
293 ###################################################################### |
|
294 # |
|
295 # Process the file line by line |
|
296 # |
|
297 ###################################################################### |
|
298 while(<UNICODATA2>) { |
|
299 chop; |
|
300 @f = split(/;/ , $_); |
|
301 $udec = hex($u); |
|
302 if(($udec > 256 ) && ($f[5] ne "")) { |
|
303 $table{$f[0]}=$f[5]; |
|
304 } |
|
305 } |
|
306 while(<UNICODATA>) { |
|
307 chop; |
|
308 ###################################################################### |
|
309 # |
|
310 # Get value from fields |
|
311 # |
|
312 ###################################################################### |
|
313 @f = split(/;/ , $_); |
|
314 $u = $f[0]; # The unicode value |
|
315 $cmt = $f[1]; # The comment |
|
316 $dec = $f[5]; # The decomposed value |
|
317 $d1 = $f[6]; |
|
318 $d2 = $f[7]; |
|
319 $d3 = $f[8]; |
|
320 $udec = hex($u); |
|
321 |
|
322 if($udec > 128) |
|
323 { |
|
324 # not ASCII |
|
325 if($dec ne "") |
|
326 { |
|
327 # have decomposition |
|
328 if($dec =~ /</) { |
|
329 # formated decomposition |
|
330 if($dec =~ /<wide>/) { |
|
331 output($u,$cmt,$udec,&decompose("<wide>", $dec)); |
|
332 } elsif($dec =~ /<narrow>/) { |
|
333 # ignore non ASCII decomposition |
|
334 # warning($_); |
|
335 } elsif($dec =~ /<circle>/) { |
|
336 output($u,$cmt,$udec,&decompose("<circle>", "(".$dec.")")); |
|
337 } elsif($dec =~ /<fraction>/) { |
|
338 output($u,$cmt,$udec,&decompose("<fraction>", $dec)); |
|
339 } elsif($dec =~ /<small>/) { |
|
340 output($u,$cmt,$udec,&decompose("<small>", $dec)); |
|
341 } elsif($dec =~ /<vertical>/) { |
|
342 # warning($_); |
|
343 } elsif($dec =~ /<super>/) { |
|
344 output($u,$cmt,$udec,"^(".&decompose("<super>", $dec).")"); |
|
345 } elsif($dec =~ /<sub>/) { |
|
346 output($u,$cmt,$udec,"v(".&decompose("<sub>", $dec).")"); |
|
347 } elsif($dec =~ /<font>/) { |
|
348 output($u,$cmt,$udec,&decompose("<font>", $dec)); |
|
349 } elsif($dec =~ /<square>/) { |
|
350 # ignore <square> |
|
351 # warning($_); |
|
352 } elsif($dec =~ /<compat>/) { |
|
353 output($u,$cmt,$udec,&decompose("<compat>", $dec)); |
|
354 } elsif($dec =~ /<isolated>/) { |
|
355 # ignore <isolated> |
|
356 # warning($_); |
|
357 } elsif($dec =~ /<medial>/) { |
|
358 # ignore <medial> |
|
359 # warning($_); |
|
360 } elsif($dec =~ /<final>/) { |
|
361 # ignore <final> |
|
362 # warning($_); |
|
363 } elsif($dec =~ /<initial>/) { |
|
364 # ignore <initial> |
|
365 # warning($_); |
|
366 } elsif($dec =~ /<noBreak>/) { |
|
367 if($dec eq "<noBreak> 0020") |
|
368 { |
|
369 output($u,$cmt,$udec,"\\u0020"); |
|
370 } else { |
|
371 # ignore |
|
372 # warning($_); |
|
373 } |
|
374 } else { |
|
375 warning($_); |
|
376 } |
|
377 } else { |
|
378 # decomposition without format code |
|
379 if($cmt =~ /LATIN/) { |
|
380 $dec = foldcombining($dec); |
|
381 output($u,$cmt,$udec,&decompose("", $dec)); |
|
382 } elsif($cmt =~ /CYRILLIC/) { |
|
383 # ignore |
|
384 # warning($_); |
|
385 } elsif($cmt =~ /GREEK/) { |
|
386 # ignore |
|
387 # warning($_); |
|
388 } elsif($cmt =~ /ARABIC/) { |
|
389 # ignore |
|
390 # warning($_); |
|
391 } elsif($cmt =~ /CJK/) { |
|
392 # ignore |
|
393 # warning($_); |
|
394 } elsif($cmt =~ /HEBREW/) { |
|
395 # ignore |
|
396 # warning($_); |
|
397 } elsif($cmt =~ /DEVANAGARI/) { |
|
398 # ignore |
|
399 # warning($_); |
|
400 } elsif($cmt =~ /BENGALI/) { |
|
401 # ignore |
|
402 # warning($_); |
|
403 } elsif($cmt =~ /GURMUKHI/) { |
|
404 # ignore |
|
405 # warning($_); |
|
406 } elsif($cmt =~ /ORIYA/) { |
|
407 # ignore |
|
408 # warning($_); |
|
409 } elsif($cmt =~ /TAMIL/) { |
|
410 # ignore |
|
411 # warning($_); |
|
412 } elsif($cmt =~ /TELUGU/) { |
|
413 # ignore |
|
414 # warning($_); |
|
415 } elsif($cmt =~ /KANNADA/) { |
|
416 # ignore |
|
417 # warning($_); |
|
418 } elsif($cmt =~ /MALAYALAM/) { |
|
419 # ignore |
|
420 # warning($_); |
|
421 } elsif($cmt =~ /SINHALA/) { |
|
422 # ignore |
|
423 # warning($_); |
|
424 } elsif($cmt =~ /TIBETAN/) { |
|
425 # ignore |
|
426 # warning($_); |
|
427 } elsif($cmt =~ /MYANMAR/) { |
|
428 # ignore |
|
429 # warning($_); |
|
430 } elsif($cmt =~ /KATAKANA/) { |
|
431 # ignore |
|
432 # warning($_); |
|
433 } elsif($cmt =~ /HIRAGANA/) { |
|
434 # ignore |
|
435 # warning($_); |
|
436 } else { |
|
437 # ignore |
|
438 # warning($_); |
|
439 } |
|
440 } |
|
441 } else { |
|
442 # do not have decomposition |
|
443 if ($d1 ne "") |
|
444 { |
|
445 # are numeric characters |
|
446 output($u,$cmt,$udec,$d1); |
|
447 } elsif ($d2 ne "") { |
|
448 if($cmt =~ /CIRCLED/) { |
|
449 # circled |
|
450 output($u,$cmt,$udec,"(".$d2.")"); |
|
451 } else { |
|
452 # others, use [ ] |
|
453 output($u,$cmt,$udec,"[".$d2."]"); |
|
454 } |
|
455 } elsif ($d3 ne "") { |
|
456 if($cmt =~ /CIRCLED/) { |
|
457 # circled |
|
458 output($u,$cmt,$udec,"(".$d3.")"); |
|
459 } else { |
|
460 # others, use [ ] |
|
461 output($u,$cmt,$udec,"[".$d3."]"); |
|
462 } |
|
463 } else { |
|
464 # not numeric characters |
|
465 |
|
466 } # end of no decomposition |
|
467 } # end of have/not decomposition |
|
468 } |
|
469 } |
|
470 ###################################################################### |
|
471 # |
|
472 # Close files |
|
473 # |
|
474 ###################################################################### |
|
475 close(UNIDATA); |
|
476 close(OUT); |
|
477 |