js/src/yarr/YarrCanonicalizeUCS2.js

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/src/yarr/YarrCanonicalizeUCS2.js	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,218 @@
     1.4 +/*
     1.5 + * Copyright (C) 2012 Apple Inc. All rights reserved.
     1.6 + *
     1.7 + * Redistribution and use in source and binary forms, with or without
     1.8 + * modification, are permitted provided that the following conditions
     1.9 + * are met:
    1.10 + * 1. Redistributions of source code must retain the above copyright
    1.11 + *    notice, this list of conditions and the following disclaimer.
    1.12 + * 2. Redistributions in binary form must reproduce the above copyright
    1.13 + *    notice, this list of conditions and the following disclaimer in the
    1.14 + *    documentation and/or other materials provided with the distribution.
    1.15 + *
    1.16 + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
    1.17 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    1.18 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    1.19 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
    1.20 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    1.21 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    1.22 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    1.23 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
    1.24 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    1.25 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    1.26 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
    1.27 + */
    1.28 +
    1.29 +// See ES 5.1, 15.10.2.8
    1.30 +function canonicalize(ch)
    1.31 +{
    1.32 +    var u = String.fromCharCode(ch).toUpperCase();
    1.33 +    if (u.length > 1)
    1.34 +        return ch;
    1.35 +    var cu = u.charCodeAt(0);
    1.36 +    if (ch >= 128 && cu < 128)
    1.37 +        return ch;
    1.38 +    return cu;
    1.39 +}
    1.40 +
    1.41 +var MAX_UCS2 = 0xFFFF;
    1.42 +var MAX_LATIN = 0xFF;
    1.43 +
    1.44 +var groupedCanonically = [];
    1.45 +// Pass 1: populate groupedCanonically - this is mapping from canonicalized
    1.46 +// values back to the set of character code that canonicalize to them.
    1.47 +for (var i = 0; i <= MAX_UCS2; ++i) {
    1.48 +    var ch = canonicalize(i);
    1.49 +    if (!groupedCanonically[ch])
    1.50 +        groupedCanonically[ch] = [];
    1.51 +    groupedCanonically[ch].push(i);
    1.52 +}
    1.53 +
    1.54 +var typeInfo = [];
    1.55 +var latinTypeInfo = [];
    1.56 +var characterSetInfo = [];
    1.57 +// Pass 2: populate typeInfo & characterSetInfo. For every character calculate
    1.58 +// a typeInfo value, described by the types above, and a value payload.
    1.59 +for (cu in groupedCanonically) {
    1.60 +    // The set of characters that canonicalize to cu
    1.61 +    var characters = groupedCanonically[cu];
    1.62 +
    1.63 +    // If there is only one, it is unique.
    1.64 +    if (characters.length == 1) {
    1.65 +        typeInfo[characters[0]] = "CanonicalizeUnique:0";
    1.66 +        latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
    1.67 +        continue;
    1.68 +    }
    1.69 +
    1.70 +    // Sort the array.
    1.71 +    characters.sort(function(x,y){return x-y;});
    1.72 +
    1.73 +    // If there are more than two characters, create an entry in characterSetInfo.
    1.74 +    if (characters.length > 2) {
    1.75 +        for (i in characters)
    1.76 +            typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length;
    1.77 +        characterSetInfo.push(characters);
    1.78 +
    1.79 +        if (characters[1] <= MAX_LATIN)
    1.80 +            throw new Error("sets with more than one latin character not supported!");
    1.81 +        if (characters[0] <= MAX_LATIN) {
    1.82 +            for (i in characters)
    1.83 +                latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0];
    1.84 +            latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
    1.85 +        } else {
    1.86 +            for (i in characters)
    1.87 +                latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0";
    1.88 +        }
    1.89 +
    1.90 +        continue;
    1.91 +    }
    1.92 +
    1.93 +    // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
    1.94 +    var lo = characters[0];
    1.95 +    var hi = characters[1];
    1.96 +    var delta = hi - lo;
    1.97 +    if (delta == 1) {
    1.98 +        var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
    1.99 +        typeInfo[lo] = type;
   1.100 +        typeInfo[hi] = type;
   1.101 +    } else {
   1.102 +        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
   1.103 +        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
   1.104 +    }
   1.105 +
   1.106 +    if (lo > MAX_LATIN) {
   1.107 +        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0"; 
   1.108 +        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
   1.109 +    } else if (hi > MAX_LATIN) {
   1.110 +        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0"; 
   1.111 +        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
   1.112 +    } else {
   1.113 +        if (delta != 0x20 || lo & 0x20)
   1.114 +            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
   1.115 +        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
   1.116 +        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
   1.117 +    }
   1.118 +}
   1.119 +
   1.120 +var rangeInfo = [];
   1.121 +// Pass 3: coallesce types into ranges.
   1.122 +for (var end = 0; end <= MAX_UCS2; ++end) {
   1.123 +    var begin = end;
   1.124 +    var type = typeInfo[end];
   1.125 +    while (end < MAX_UCS2 && typeInfo[end + 1] == type)
   1.126 +        ++end;
   1.127 +    rangeInfo.push({begin:begin, end:end, type:type});
   1.128 +}
   1.129 +
   1.130 +var latinRangeInfo = [];
   1.131 +// Pass 4: coallesce latin-1 types into ranges.
   1.132 +for (var end = 0; end <= MAX_UCS2; ++end) {
   1.133 +    var begin = end;
   1.134 +    var type = latinTypeInfo[end];
   1.135 +    while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type)
   1.136 +        ++end;
   1.137 +    latinRangeInfo.push({begin:begin, end:end, type:type});
   1.138 +}
   1.139 +
   1.140 +
   1.141 +// Helper function to convert a number to a fixed width hex representation of a C uint16_t.
   1.142 +function hex(x)
   1.143 +{
   1.144 +    var s = Number(x).toString(16);
   1.145 +    while (s.length < 4)
   1.146 +        s = 0 + s;
   1.147 +    return "0x" + s + "u";
   1.148 +}
   1.149 +
   1.150 +var copyright = (
   1.151 +    "/*"                                                                            + "\n" +
   1.152 +    " * Copyright (C) 2012 Apple Inc. All rights reserved."                         + "\n" +
   1.153 +    " *"                                                                            + "\n" +
   1.154 +    " * Redistribution and use in source and binary forms, with or without"         + "\n" +
   1.155 +    " * modification, are permitted provided that the following conditions"         + "\n" +
   1.156 +    " * are met:"                                                                   + "\n" +
   1.157 +    " * 1. Redistributions of source code must retain the above copyright"          + "\n" +
   1.158 +    " *    notice, this list of conditions and the following disclaimer."           + "\n" +
   1.159 +    " * 2. Redistributions in binary form must reproduce the above copyright"       + "\n" +
   1.160 +    " *    notice, this list of conditions and the following disclaimer in the"     + "\n" +
   1.161 +    " *    documentation and/or other materials provided with the distribution."    + "\n" +
   1.162 +    " *"                                                                            + "\n" +
   1.163 +    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY"                  + "\n" +
   1.164 +    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE"          + "\n" +
   1.165 +    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR"         + "\n" +
   1.166 +    " * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR"                   + "\n" +
   1.167 +    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,"      + "\n" +
   1.168 +    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,"        + "\n" +
   1.169 +    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR"         + "\n" +
   1.170 +    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY"        + "\n" +
   1.171 +    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT"               + "\n" +
   1.172 +    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE"      + "\n" +
   1.173 +    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "      + "\n" +
   1.174 +    " */");
   1.175 +
   1.176 +print(copyright);
   1.177 +print();
   1.178 +print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
   1.179 +print();
   1.180 +print('#include "yarr/YarrCanonicalizeUCS2.h"');
   1.181 +print();
   1.182 +print('#include <stddef.h>');
   1.183 +print();
   1.184 +print("namespace JSC { namespace Yarr {");
   1.185 +print();
   1.186 +
   1.187 +for (i in characterSetInfo) {
   1.188 +    var characters = ""
   1.189 +    var set = characterSetInfo[i];
   1.190 +    for (var j in set)
   1.191 +        characters += hex(set[j]) + ", ";
   1.192 +    print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };");
   1.193 +}
   1.194 +print();
   1.195 +print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
   1.196 +print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
   1.197 +for (i in characterSetInfo)
   1.198 +print("    ucs2CharacterSet" + i + ",");
   1.199 +print("};");
   1.200 +print();
   1.201 +print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
   1.202 +print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
   1.203 +for (i in rangeInfo) {
   1.204 +    var info = rangeInfo[i];
   1.205 +    var typeAndValue = info.type.split(':');
   1.206 +    print("    { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
   1.207 +}
   1.208 +print("};");
   1.209 +print();
   1.210 +print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
   1.211 +print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
   1.212 +for (i in latinRangeInfo) {
   1.213 +    var info = latinRangeInfo[i];
   1.214 +    var typeAndValue = info.type.split(':');
   1.215 +    print("    { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
   1.216 +}
   1.217 +print("};");
   1.218 +print();
   1.219 +print("} } // JSC::Yarr");
   1.220 +print();
   1.221 +

mercurial