/*
 * Copyright (C) 2012 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// Generator script. It classifies every UCS2 code unit by how it behaves under
// case-insensitive canonicalization and prints the resulting lookup tables as
// C++ source text, one line at a time.

// See ES 5.1, 15.10.2.8
//
// Returns the canonical (upper-cased) code unit for |ch|. |ch| itself is
// returned when upper-casing yields more than one code unit, or when a
// non-ASCII character (>= 128) would map to an ASCII one (< 128).
function canonicalize(ch)
{
    var u = String.fromCharCode(ch).toUpperCase();
    if (u.length > 1)
        return ch;
    var cu = u.charCodeAt(0);
    if (ch >= 128 && cu < 128)
        return ch;
    return cu;
}

var MAX_UCS2 = 0xFFFF;
var MAX_LATIN = 0xFF;

// Writes one line of output. Uses the shell's print() when it exists and falls
// back to console.log() otherwise; calling it with no argument emits an empty
// line in both cases.
function emit(line)
{
    var text = line === undefined ? "" : line;
    if (typeof print === "function")
        print(text);
    else
        console.log(text);
}

var groupedCanonically = [];
// Pass 1: populate groupedCanonically - this is a mapping from canonicalized
// values back to the set of character codes that canonicalize to them.
// The array is sparse: only indices that are some character's canonical value
// hold a list, and each list is filled in ascending code order.
for (var code = 0; code <= MAX_UCS2; ++code) {
    var canonical = canonicalize(code);
    if (!groupedCanonically[canonical])
        groupedCanonically[canonical] = [];
    groupedCanonically[canonical].push(code);
}

var typeInfo = [];
var latinTypeInfo = [];
var characterSetInfo = [];

// Records the typeInfo / latinTypeInfo entry ("TypeName:payload") for every
// member of one canonical-equivalence group, appending to characterSetInfo
// when the group has more than two members.
// NOTE(review): the type names are printed verbatim into the generated tables;
// presumably they name an enum declared in yarr/YarrCanonicalizeUCS2.h - confirm.
//
// Throws an Error when the group has a shape the latin tables cannot express.
function classifyGroup(characters)
{
    var k;

    // If there is only one, it is unique.
    if (characters.length === 1) {
        typeInfo[characters[0]] = "CanonicalizeUnique:0";
        latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
        return;
    }

    // Sort the array so that characters[0] is the lowest code in the group.
    characters.sort(function (x, y) { return x - y; });

    // If there are more than two characters, create an entry in characterSetInfo.
    if (characters.length > 2) {
        // The payload is the index this set is about to receive in characterSetInfo.
        for (k = 0; k < characters.length; ++k)
            typeInfo[characters[k]] = "CanonicalizeSet:" + characterSetInfo.length;
        characterSetInfo.push(characters);

        if (characters[1] <= MAX_LATIN)
            throw new Error("sets with more than one latin character not supported!");
        if (characters[0] <= MAX_LATIN) {
            // Every member points at the single latin member, which maps to itself.
            for (k = 0; k < characters.length; ++k)
                latinTypeInfo[characters[k]] = "CanonicalizeLatinOther:" + characters[0];
            latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
        } else {
            for (k = 0; k < characters.length; ++k)
                latinTypeInfo[characters[k]] = "CanonicalizeLatinInvalid:0";
        }
        return;
    }

    // We have a pair: mark alternating ranges, otherwise track whether this is
    // the low or high partner.
    var lo = characters[0];
    var hi = characters[1];
    var delta = hi - lo;
    if (delta === 1) {
        // Adjacent pair: "aligned" when the low partner is even, "unaligned" when odd.
        var type = (lo & 1) ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
        typeInfo[lo] = type;
        typeInfo[hi] = type;
    } else {
        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
    }

    if (lo > MAX_LATIN) {
        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
    } else if (hi > MAX_LATIN) {
        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
    } else {
        // Both partners are latin: they must differ by exactly the 0x20 bit,
        // and that bit must be clear in the low partner.
        if (delta !== 0x20 || (lo & 0x20))
            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
    }
}

// Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. For every
// character calculate a type name and a value payload. Groups are visited in
// ascending order of canonical value; holes in the sparse array are skipped.
for (var canonicalCode = 0; canonicalCode < groupedCanonically.length; ++canonicalCode) {
    if (groupedCanonically[canonicalCode])
        classifyGroup(groupedCanonically[canonicalCode]);
}

// Collapses a per-character table (indexed 0..MAX_UCS2) into a list of
// {begin, end, type} records, one per maximal run of identical entries.
// |begin| and |end| are both inclusive.
function coalesceRanges(perCharacterInfo)
{
    var ranges = [];
    for (var end = 0; end <= MAX_UCS2; ++end) {
        var begin = end;
        var type = perCharacterInfo[end];
        while (end < MAX_UCS2 && perCharacterInfo[end + 1] === type)
            ++end;
        ranges.push({begin: begin, end: end, type: type});
    }
    return ranges;
}

// Pass 3: coalesce types into ranges.
var rangeInfo = coalesceRanges(typeInfo);

// Pass 4: coalesce latin-1 types into ranges.
var latinRangeInfo = coalesceRanges(latinTypeInfo);


// Helper function to convert a number to a fixed width hex representation of a C uint16_t.
// Accepts a number or a numeric string; the result has the form "0x" + at
// least four lower-case hex digits + "u".
function hex(x)
{
    var s = Number(x).toString(16);
    while (s.length < 4)
        s = "0" + s;
    return "0x" + s + "u";
}

// License header written at the top of the generated output.
var copyright = [
    "/*",
    " * Copyright (C) 2012 Apple Inc. All rights reserved.",
    " *",
    " * Redistribution and use in source and binary forms, with or without",
    " * modification, are permitted provided that the following conditions",
    " * are met:",
    " * 1. Redistributions of source code must retain the above copyright",
    " * notice, this list of conditions and the following disclaimer.",
    " * 2. Redistributions in binary form must reproduce the above copyright",
    " * notice, this list of conditions and the following disclaimer in the",
    " * documentation and/or other materials provided with the distribution.",
    " *",
    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
    " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR",
    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
    " */"
].join("\n");

// Prints one C++ table: a size constant, the array declaration and one
// initializer row per range. Each row is { begin, end, payload, TypeName },
// where TypeName and payload come from splitting the "TypeName:payload" string.
function emitRangeTable(sizeName, elementType, arrayName, ranges)
{
    emit("const size_t " + sizeName + " = " + ranges.length + ";");
    emit(elementType + " " + arrayName + "[" + sizeName + "] = {");
    for (var r = 0; r < ranges.length; ++r) {
        var info = ranges[r];
        var typeAndValue = info.type.split(':');
        emit(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
    }
    emit("};");
}

emit(copyright);
emit();
emit("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
emit();
emit('#include "yarr/YarrCanonicalizeUCS2.h"');
emit();
emit('#include <stddef.h>');
emit();
emit("namespace JSC { namespace Yarr {");
emit();

// One zero-terminated array per canonical set with more than two members.
for (var setIndex = 0; setIndex < characterSetInfo.length; ++setIndex) {
    var members = "";
    var set = characterSetInfo[setIndex];
    for (var m = 0; m < set.length; ++m)
        members += hex(set[m]) + ", ";
    emit("uint16_t ucs2CharacterSet" + setIndex + "[] = { " + members + "0 };");
}
emit();
emit("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
emit("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
for (var tableIndex = 0; tableIndex < characterSetInfo.length; ++tableIndex)
    emit(" ucs2CharacterSet" + tableIndex + ",");
emit("};");
emit();
emitRangeTable("UCS2_CANONICALIZATION_RANGES", "UCS2CanonicalizationRange", "rangeInfo", rangeInfo);
emit();
emitRangeTable("LATIN_CANONICALIZATION_RANGES", "LatinCanonicalizationRange", "latinRangeInfo", latinRangeInfo);
emit();
emit("} } // JSC::Yarr");
emit();