|
1 /* |
|
2 * Copyright (C) 2012 Apple Inc. All rights reserved. |
|
3 * |
|
4 * Redistribution and use in source and binary forms, with or without |
|
5 * modification, are permitted provided that the following conditions |
|
6 * are met: |
|
7 * 1. Redistributions of source code must retain the above copyright |
|
8 * notice, this list of conditions and the following disclaimer. |
|
9 * 2. Redistributions in binary form must reproduce the above copyright |
|
10 * notice, this list of conditions and the following disclaimer in the |
|
11 * documentation and/or other materials provided with the distribution. |
|
12 * |
|
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
|
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
|
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
24 */ |
|
25 |
|
// Canonicalize a UCS-2 code unit for case-insensitive matching.
// See ES 5.1, 15.10.2.8.
//
// ch: a character code (number).
// Returns the character code of the upper-cased character, or ch itself when
// upper-casing yields more than one character or would turn a non-ASCII
// character (>= 128) into an ASCII one (< 128).
function canonicalize(ch)
{
    var upper = String.fromCharCode(ch).toUpperCase();

    if (upper.length <= 1) {
        var upperCode = upper.charCodeAt(0);
        // A non-ASCII character must never canonicalize to an ASCII one.
        var crossesIntoAscii = ch >= 128 && upperCode < 128;
        if (!crossesIntoAscii)
            return upperCode;
    }

    // Multi-character expansions and ASCII-crossing mappings are left as-is.
    return ch;
}
|
37 |
|
// Upper bounds (inclusive) of the UCS-2 and Latin-1 character code ranges.
var MAX_UCS2 = 0xFFFF;
var MAX_LATIN = 0xFF;

// Pass 1: populate groupedCanonically - a mapping from each canonicalized
// value back to the list of character codes that canonicalize to it.
// Codes are visited in ascending order, so each list is built in ascending order.
var groupedCanonically = [];
var i = 0;
while (i <= MAX_UCS2) {
    var ch = canonicalize(i);
    if (groupedCanonically[ch])
        groupedCanonically[ch].push(i);
    else
        groupedCanonically[ch] = [i];
    ++i;
}
|
50 |
|
var typeInfo = [];
var latinTypeInfo = [];
var characterSetInfo = [];
// Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. For every
// character calculate a "<type>:<payload>" string for the UCS-2 table
// (typeInfo) and for the Latin-1 table (latinTypeInfo). The type names are
// emitted verbatim into the generated source.
// NOTE(review): the type names presumably match enumerators declared in
// yarr/YarrCanonicalizeUCS2.h - confirm against that header.
//
// groupedCanonically is sparse (only canonical values have an entry), so walk
// it by numeric index and skip the holes. This keeps the iteration order
// explicitly ascending and avoids for...in's string keys.
for (var cu = 0; cu < groupedCanonically.length; ++cu) {
    // The set of characters that canonicalize to cu.
    var characters = groupedCanonically[cu];
    if (!characters)
        continue;

    // If there is only one, it is unique.
    if (characters.length === 1) {
        typeInfo[characters[0]] = "CanonicalizeUnique:0";
        latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
        continue;
    }

    // Sort the array numerically (the default sort is lexicographic).
    characters.sort(function(x, y) { return x - y; });

    // If there are more than two characters, create an entry in characterSetInfo.
    if (characters.length > 2) {
        // The payload is the index this set will have in characterSetInfo.
        for (var j = 0; j < characters.length; ++j)
            typeInfo[characters[j]] = "CanonicalizeSet:" + characterSetInfo.length;
        characterSetInfo.push(characters);

        // The Latin-1 table can only describe sets whose sole Latin-1 member
        // is the lowest one.
        if (characters[1] <= MAX_LATIN)
            throw new Error("sets with more than one latin character not supported!");
        if (characters[0] <= MAX_LATIN) {
            // Every member maps to the single Latin-1 member, which maps to itself.
            for (var j = 0; j < characters.length; ++j)
                latinTypeInfo[characters[j]] = "CanonicalizeLatinOther:" + characters[0];
            latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
        } else {
            for (var j = 0; j < characters.length; ++j)
                latinTypeInfo[characters[j]] = "CanonicalizeLatinInvalid:0";
        }

        continue;
    }

    // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
    var lo = characters[0];
    var hi = characters[1];
    var delta = hi - lo;
    if (delta === 1) {
        // Adjacent pair: "aligned" when the low partner is even, "unaligned" when odd.
        var type = (lo & 1) ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
        typeInfo[lo] = type;
        typeInfo[hi] = type;
    } else {
        // Non-adjacent pair: the payload is the distance to the partner.
        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
    }

    if (lo > MAX_LATIN) {
        // Neither partner is Latin-1.
        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
    } else if (hi > MAX_LATIN) {
        // Only the low partner is Latin-1.
        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
    } else {
        // Both partners are Latin-1: they must differ exactly in bit 0x20,
        // with the bit clear in the low partner.
        if (delta !== 0x20 || (lo & 0x20))
            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
    }
}
|
116 |
|
// Coalesce a per-character table of type strings into maximal runs.
//
// info: array indexed by character code (0..MAX_UCS2) holding type strings.
// Returns an array of {begin, end, type} objects, in ascending order, where
// every code in [begin, end] (inclusive) has the same type string and the
// ranges together cover 0..MAX_UCS2.
function coalesceRanges(info)
{
    var ranges = [];
    for (var end = 0; end <= MAX_UCS2; ++end) {
        var begin = end;
        var type = info[end];
        // Extend the run while the next character shares this type.
        while (end < MAX_UCS2 && info[end + 1] === type)
            ++end;
        ranges.push({begin:begin, end:end, type:type});
    }
    return ranges;
}

// Pass 3: coalesce types into ranges.
var rangeInfo = coalesceRanges(typeInfo);

// Pass 4: coalesce latin-1 types into ranges.
var latinRangeInfo = coalesceRanges(latinTypeInfo);
|
136 |
|
137 |
|
// Helper function to convert a number to a fixed width hex representation of a C uint16_t.
//
// x: a number, or a string holding a decimal number (converted with Number()).
// Returns "0x" + the lower-case hex digits, left-padded with zeros to at least
// four characters, + the "u" suffix (e.g. 255 -> "0x00ffu").
function hex(x)
{
    var digits = Number(x).toString(16);
    for (var missing = 4 - digits.length; missing > 0; --missing)
        digits = "0" + digits;
    return "0x" + digits + "u";
}
|
146 |
|
// License header written at the top of the generated file. Each array entry
// is one line of the C comment; the lines are joined with "\n" and the result
// has no trailing newline.
var copyright = [
    "/*",
    " * Copyright (C) 2012 Apple Inc. All rights reserved.",
    " *",
    " * Redistribution and use in source and binary forms, with or without",
    " * modification, are permitted provided that the following conditions",
    " * are met:",
    " * 1. Redistributions of source code must retain the above copyright",
    " * notice, this list of conditions and the following disclaimer.",
    " * 2. Redistributions in binary form must reproduce the above copyright",
    " * notice, this list of conditions and the following disclaimer in the",
    " * documentation and/or other materials provided with the distribution.",
    " *",
    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
    " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR",
    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
    " */"
].join("\n");
|
172 |
|
// Emit the generated C++ source through the shell's print() function.

// Prints the file preamble; every entry is followed by one blank line.
function printPreamble()
{
    var preamble = [
        copyright,
        "// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js",
        '#include "yarr/YarrCanonicalizeUCS2.h"',
        '#include <stddef.h>',
        "namespace JSC { namespace Yarr {"
    ];
    for (var index = 0; index < preamble.length; ++index) {
        print(preamble[index]);
        print();
    }
}

// Prints one zero-terminated uint16_t array per character set, followed by
// the table of pointers to those arrays.
function printCharacterSets(sets)
{
    for (var setIndex = 0; setIndex < sets.length; ++setIndex) {
        var members = sets[setIndex];
        var initializer = "";
        for (var memberIndex = 0; memberIndex < members.length; ++memberIndex)
            initializer += hex(members[memberIndex]) + ", ";
        // The trailing 0 terminates the set.
        print("uint16_t ucs2CharacterSet" + setIndex + "[] = { " + initializer + "0 };");
    }
    print();
    print("static const size_t UCS2_CANONICALIZATION_SETS = " + sets.length + ";");
    print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
    for (var tableIndex = 0; tableIndex < sets.length; ++tableIndex)
        print(" ucs2CharacterSet" + tableIndex + ",");
    print("};");
    print();
}

// Prints a range table: its size constant, then one
// { begin, end, value, type } row per range. Each range's type string has the
// form "<type>:<value>".
function printRangeTable(countName, arrayDeclaration, ranges)
{
    print("const size_t " + countName + " = " + ranges.length + ";");
    print(arrayDeclaration + "[" + countName + "] = {");
    for (var rangeIndex = 0; rangeIndex < ranges.length; ++rangeIndex) {
        var range = ranges[rangeIndex];
        var parts = range.type.split(':');
        var fields = [hex(range.begin), hex(range.end), hex(parts[1]), parts[0]];
        print(" { " + fields.join(", ") + " },");
    }
    print("};");
    print();
}

printPreamble();
printCharacterSets(characterSetInfo);
printRangeTable("UCS2_CANONICALIZATION_RANGES", "UCS2CanonicalizationRange rangeInfo", rangeInfo);
printRangeTable("LATIN_CANONICALIZATION_RANGES", "LatinCanonicalizationRange latinRangeInfo", latinRangeInfo);
print("} } // JSC::Yarr");
print();
|
218 |