/*
 * Copyright (C) 2012 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// See ES 5.1, 15.10.2.8
/**
 * Maps a 16-bit character code to its canonical (case-folded) character code.
 *
 * @param {number} ch - character code to canonicalize.
 * @returns {number} the code of the upper-cased character, or `ch` itself when
 *     upper-casing yields more than one code unit, or when it would turn a
 *     character >= 128 into one < 128.
 */
function canonicalize(ch)
{
    var u = String.fromCharCode(ch).toUpperCase();
    // An upper-case form longer than one code unit is not used; keep ch.
    if (u.length > 1)
        return ch;
    var cu = u.charCodeAt(0);
    // A character outside the 0..127 range never canonicalizes into it.
    if (ch >= 128 && cu < 128)
        return ch;
    return cu;
}

var MAX_UCS2 = 0xFFFF;
var MAX_LATIN = 0xFF;

var groupedCanonically = [];
// Pass 1: populate groupedCanonically - this is a mapping from canonicalized
// values back to the set of character codes that canonicalize to them.
// The array is sparse: only indices that are some character's canonical value
// hold a list.
for (var i = 0; i <= MAX_UCS2; ++i) {
    var ch = canonicalize(i);
    if (!groupedCanonically[ch])
        groupedCanonically[ch] = [];
    groupedCanonically[ch].push(i);
}

// Every entry of typeInfo / latinTypeInfo is a string "<type name>:<value>",
// indexed by character code. characterSetInfo holds one sorted array of
// character codes per group that has more than two members.
var typeInfo = [];
var latinTypeInfo = [];
var characterSetInfo = [];

// Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. For every
// character calculate a type name and a value payload.
// forEach visits only the populated indices, in ascending order, and does not
// need a shared loop variable (the previous for...in assigned an undeclared
// global).
groupedCanonically.forEach(function (characters) {
    // 'characters' is the set of characters sharing one canonical value.

    // If there is only one, it is unique.
    if (characters.length === 1) {
        var only = characters[0];
        typeInfo[only] = "CanonicalizeUnique:0";
        latinTypeInfo[only] = only <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
        return;
    }

    // Sort the array, so that index 0 is the smallest character code.
    characters.sort(function (x, y) { return x - y; });

    // If there are more than two characters, create an entry in characterSetInfo.
    if (characters.length > 2) {
        // The payload is the index this set is about to get in characterSetInfo.
        var setType = "CanonicalizeSet:" + characterSetInfo.length;
        characters.forEach(function (c) {
            typeInfo[c] = setType;
        });
        characterSetInfo.push(characters);

        // After sorting, a second member <= MAX_LATIN means two latin characters.
        if (characters[1] <= MAX_LATIN)
            throw new Error("sets with more than one latin character not supported!");

        var lowest = characters[0];
        var latinSetType = lowest <= MAX_LATIN
            ? "CanonicalizeLatinOther:" + lowest
            : "CanonicalizeLatinInvalid:0";
        characters.forEach(function (c) {
            latinTypeInfo[c] = latinSetType;
        });
        // The single latin member of the set stands for itself.
        if (lowest <= MAX_LATIN)
            latinTypeInfo[lowest] = "CanonicalizeLatinSelf:0";
        return;
    }

    // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
    var lo = characters[0];
    var hi = characters[1];
    var delta = hi - lo;
    if (delta === 1) {
        // Adjacent pair: the type records whether the low partner is odd or even.
        var pairType = (lo & 1) ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
        typeInfo[lo] = pairType;
        typeInfo[hi] = pairType;
    } else {
        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
    }

    if (lo > MAX_LATIN) {
        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
    } else if (hi > MAX_LATIN) {
        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
    } else {
        // Both partners are latin: they must differ by exactly bit 0x20, with
        // that bit clear in the low partner.
        if (delta !== 0x20 || (lo & 0x20))
            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
    }
});

/**
 * Coalesces a per-character type table into runs of equal type.
 *
 * @param {Array} types - table indexed by character code, 0..MAX_UCS2.
 * @returns {Array} objects {begin, end, type} with inclusive bounds, in
 *     ascending order and together covering 0..MAX_UCS2.
 */
function coalesceRanges(types)
{
    var ranges = [];
    for (var end = 0; end <= MAX_UCS2; ++end) {
        var begin = end;
        var type = types[end];
        // Extend the run while the next character has the same type.
        while (end < MAX_UCS2 && types[end + 1] === type)
            ++end;
        ranges.push({begin: begin, end: end, type: type});
    }
    return ranges;
}

// Pass 3: coalesce types into ranges.
var rangeInfo = coalesceRanges(typeInfo);

// Pass 4: coalesce latin-1 types into ranges.
var latinRangeInfo = coalesceRanges(latinTypeInfo);
/**
 * Helper function to convert a number to a fixed width hex representation of
 * a C uint16_t.
 *
 * @param {number|string} x - value to format; it is passed through Number(),
 *     so a decimal string such as "32" is accepted.
 * @returns {string} "0x" + the hex digits left-padded with zeros to at least
 *     four digits + "u", e.g. "0x0020u".
 */
function hex(x)
{
    var s = Number(x).toString(16);
    while (s.length < 4)
        s = "0" + s;
    return "0x" + s + "u";
}

// License comment emitted at the top of the generated file.
var copyright = [
    "/*",
    " * Copyright (C) 2012 Apple Inc. All rights reserved.",
    " *",
    " * Redistribution and use in source and binary forms, with or without",
    " * modification, are permitted provided that the following conditions",
    " * are met:",
    " * 1. Redistributions of source code must retain the above copyright",
    " * notice, this list of conditions and the following disclaimer.",
    " * 2. Redistributions in binary form must reproduce the above copyright",
    " * notice, this list of conditions and the following disclaimer in the",
    " * documentation and/or other materials provided with the distribution.",
    " *",
    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
    " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR",
    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
    " */"
].join("\n");

/**
 * Prints one C initializer row per range: begin, end, value and type name.
 *
 * @param {Array} ranges - objects {begin, end, type}, where type is a string
 *     of the form "<type name>:<value>".
 */
function printRanges(ranges)
{
    ranges.forEach(function (info) {
        // Split "<type name>:<value>" into its two halves.
        var typeAndValue = info.type.split(':');
        print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
    });
}

// Emit the generated C++ source through the shell's print().
print(copyright);
print();
print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
print();
print('#include "yarr/YarrCanonicalizeUCS2.h"');
print();
// The header name was missing here, leaving a bare "#include " in the output.
// <stdint.h> is what declares the uint16_t used below.
// NOTE(review): confirm against the checked-in generated file.
print('#include <stdint.h>');
print();
print("namespace JSC { namespace Yarr {");
print();

// One zero-terminated uint16_t array per canonicalization set.
characterSetInfo.forEach(function (set, index) {
    var members = "";
    set.forEach(function (character) {
        members += hex(character) + ", ";
    });
    print("uint16_t ucs2CharacterSet" + index + "[] = { " + members + "0 };");
});
print();
print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
characterSetInfo.forEach(function (set, index) {
    print(" ucs2CharacterSet" + index + ",");
});
print("};");
print();
print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
printRanges(rangeInfo);
print("};");
print();
print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
printRanges(latinRangeInfo);
print("};");
print();
print("} } // JSC::Yarr");
print();