|
1 #!/usr/bin/python |
|
2 |
|
3 import sys |
|
4 |
|
# The script takes exactly three file arguments (argv[0] is the script
# itself); otherwise print the usage line to stderr and exit with status 1.
if len(sys.argv) != 4:
    sys.stderr.write("usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt" + "\n")
    sys.exit(1)
|
8 |
|
# Open and parse the three input files named on the command line (per the
# usage message: IndicSyllabicCategory.txt, IndicMatraCategory.txt,
# Blocks.txt, in that order).
#
# open() is used rather than the file() builtin, and every handle is closed
# once parsing is finished (or fails) instead of staying open for the rest of
# the run.
files = []
try:
    for path in sys.argv[1:]:
        files.append(open(path))

    # The first two lines of each file are kept verbatim as that file's
    # header; they are echoed into the generated output later on.
    headers = [[f.readline() for _ in range(2)] for f in files]

    blocks = {}                   # value -> (start, end), filled from the third file only
    data = [{} for f in files]    # per file: codepoint -> value
    values = [{} for f in files]  # per file: value -> number of codepoints carrying it
    for i, f in enumerate(files):
        for line in f:

            # Everything from the first '#' onwards is a comment.
            line = line.partition('#')[0]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                # No ';' separator on this line (e.g. blank or comment-only).
                continue

            # The first field is either one hex codepoint or an inclusive
            # "START..END" hex range.
            uu = fields[0].split('..')
            start = int(uu[0], 16)
            if len(uu) == 1:
                end = start
            else:
                end = int(uu[1], 16)

            t = fields[1]

            for u in range(start, end + 1):
                data[i][u] = t
                values[i][t] = values[i].get(t, 0) + 1

            if i == 2:
                # Third file: remember the extent of each named block.
                blocks[t] = (start, end)
finally:
    for f in files:
        f.close()
|
42 |
|
# Fold the three per-file maps into a single dict:
#   codepoint -> [value from file 0, value from file 1, value from file 2]
# Slots a file says nothing about keep the matching entry of `defaults`.
defaults = ('Other', 'Not_Applicable', 'No_Block')

# Count each default value once more, which also guarantees it is present in
# the per-file value tallies.
for column, fallback in enumerate(defaults):
    values[column][fallback] = values[column].get(fallback, 0) + 1

combined = {}
for column, per_file in enumerate(data):
    from_third_file = (column == 2)
    for codepoint, value in per_file.items():
        if codepoint not in combined:
            if from_third_file:
                # The third file never creates an entry on its own; it only
                # annotates codepoints the first two files already listed.
                continue
            combined[codepoint] = list(defaults)
        combined[codepoint][column] = value
data = combined
del combined
num = len(data)
|
58 |
|
# NO-BREAK SPACE (U+00A0) and DOTTED CIRCLE (U+25CC) are outliers: take them
# out of the main data and keep them in `singles`.  A KeyError is raised if
# either codepoint is missing from `data`.
singles = {}
for u in (0x00A0, 0x25CC):
    singles[u] = data.pop(u)
|
64 |
|
# Opening banner of the generated file: a comment recording how the table was
# produced and the header lines of the input files, then the #include.
banner = [
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt",
    " *",
    " * on files with these headers:",
    " *",
]
for file_header in headers:
    for header_line in file_header:
        banner.append(" * %s" % (header_line.strip()))
banner.append(" */")
banner.append("")
banner.append('#include "hb-ot-shape-complex-indic-private.hh"')
banner.append("")

for text in banner:
    print(text)
|
80 |
|
# Hand-picked short aliases for some values, one dict per category file
# (index 0 and index 1).  Values without an entry here get an alias derived
# from their capital letters later on.
short = [
    {
        "Bindu": 'Bi',
        "Visarga": 'Vs',
        "Vowel": 'Vo',
        "Vowel_Dependent": 'M',
        "Other": 'x',
    },
    {
        "Not_Applicable": 'x',
    },
]

# Aliases already taken, per category; seeded with the hand-picked ones so
# that derived aliases can be checked for duplicates against them.
all_shorts = [list(aliases.values()) for aliases in short]
|
99 |
|
# Emit one "#define <SHORT>_<alias> <LONG>_<VALUE>" line per value of the two
# category files, in sorted value order, followed by the _(S,M) helper macro.
what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
for i in range(2):
    long_prefix = what[i]
    short_prefix = what_short[i]
    aliases = short[i]
    taken = all_shorts[i]

    print("")
    for v in sorted(values[i]):
        if v not in aliases:
            # No hand-picked alias: use the capital letters of the value
            # name, with "_And_" collapsed first so its 'A' is not included.
            s = ''.join(c for c in v.replace('_And_', '_') if 'A' <= c <= 'Z')
            if s in taken:
                raise Exception("Duplicate short value alias", v, s)
            taken.append(s)
            aliases[v] = s
        s = aliases[v]

        # Padding between the macro and its comment shrinks as the names grow.
        padding = ' ' * ((48-1 - len(long_prefix) - 1 - len(v)) // 8)
        print("#define %s_%s %s_%s %s/* %3d chars; %s */" %
              (short_prefix, s, long_prefix, v.upper(), padding, values[i][v], v))
print("")
print("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print("")
print("")
|
124 |
|
# Running totals updated by print_block(): table slots emitted, and how many
# of those slots correspond to codepoints present in the data.
total = 0
used = 0


def print_block(block, start, end, data):
    """Write the table entries for codepoints start..end (inclusive) to stdout.

    Output starts with two blank lines and a comment naming `block` and its
    range.  Each codepoint contributes one "_(S,M)," entry right-aligned in
    nine columns, where S and M are the short aliases (looked up in the
    module-level `short`) of the first two elements of its `data` entry;
    codepoints missing from `data` use the module-level `defaults`.  A new
    output line, prefixed with a comment giving the codepoint, is started
    whenever the codepoint is a multiple of 8.  No newline is written after
    the last entry.

    Side effects: adds the number of slots to the global `total` and the
    number of codepoints found in `data` to the global `used`.
    """
    global total, used

    print("")
    print("")
    print(" /* %s (%04X..%04X) */" % (block, start, end))

    present = 0
    for u in range(start, end + 1):
        if not u % 8:
            print("")
            # Row label; the entries follow on the same output line.
            sys.stdout.write(" /* %04X */" % u)
        present += 1 if u in data else 0
        categories = data.get(u, defaults)
        entry = "_(%s,%s)," % (short[0][categories[0]], short[1][categories[1]])
        sys.stdout.write("%9s" % entry)

    total += end - start + 1
    used += present
|
144 |
|
# Emit the body of indic_table[].  Codepoints are visited in ascending order
# and written out one whole block (as recorded in `blocks`) at a time.
# Blocks that lie close together are joined into one contiguous run, padding
# the gap with a "FILLER" block; a larger gap starts a new run, which gets
# its own indic_offset_0x.... define giving its position in the table.
uu = sorted(data)

last = -1     # last codepoint emitted so far; -1 while nothing was emitted
num = 0
offset = 0    # table slots taken by the runs finished so far
starts = []   # first codepoint of each run
ends = []     # one past the last codepoint of each run
print("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
for u in uu:
    if u <= last:
        # Already covered by a block emitted earlier.
        continue
    block = data[u][2]
    start, end = blocks[block]

    if start != last + 1:
        if start - last > 33:
            # Too far from the previous block to pad: finish the current run
            # (if there is one) and open a new run at this block.
            if last >= 0:
                run_end = last + 1
                ends.append(run_end)
                offset += run_end - starts[-1]
            print("")
            print("")
            print("#define indic_offset_0x%04x %d" % (start, offset))
            starts.append(start)
        else:
            # Small gap: pad it so the run stays contiguous.
            print_block("FILLER", last + 1, start - 1, data)
            last = start - 1

    print_block(block, start, end, data)
    last = end

# Finish the final run.
run_end = last + 1
ends.append(run_end)
offset += run_end - starts[-1]
print("")
print("")
print("#define indic_offset_total %d" % offset)
print("")
# Percentage of emitted slots that correspond to codepoints present in data.
occupancy = used * 100. / total
print("}; /* Table occupancy: %d%% */" % occupancy)
print("")
|
# Emit the C lookup function hb_indic_get_categories() and then #undef every
# helper macro defined earlier in the generated output.
print("INDIC_TABLE_ELEMENT_TYPE")
print("hb_indic_get_categories (hb_codepoint_t u)")
print("{")
# `ends` stores one past the last codepoint of each run (it is filled with
# `last + 1`), whereas the generated C condition uses an inclusive `u <= ...`
# bound.  Emitting `end` itself would make `u == end` index the first slot of
# the following run (or one slot past the array for the final run), so the
# inclusive bound written out is `end - 1`.
for (start, end) in zip(starts, ends):
    offset_name = "indic_offset_0x%04x" % start
    print(" if (0x%04X <= u && u <= 0x%04X) return indic_table[u - 0x%04X + %s];" % (start, end - 1, start, offset_name))
# Codepoints kept out of the table get an individual test each; sorted so
# that the generated output does not depend on dict iteration order.
for u, d in sorted(singles.items()):
    print(" if (unlikely (u == 0x%04X)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
print(" return _(x,x);")
print("}")
print("")
print("#undef _")
for i in range(2):
    print("")
    for v in sorted(values[i]):
        print("#undef %s_%s" % (what_short[i], short[i][v]))
print("")
print("/* == End of generated table == */")
|
205 |
|
# Require that at least 30% of the emitted table slots correspond to
# codepoints present in the data; a sparser table aborts the run.
min_occupancy = 30
if occupancy < min_occupancy:
    raise Exception("Table too sparse, please investigate: ", occupancy)