js/src/yarr/YarrCanonicalizeUCS2.js

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2  * Copyright (C) 2012 Apple Inc. All rights reserved.
     3  *
     4  * Redistribution and use in source and binary forms, with or without
     5  * modification, are permitted provided that the following conditions
     6  * are met:
     7  * 1. Redistributions of source code must retain the above copyright
     8  *    notice, this list of conditions and the following disclaimer.
     9  * 2. Redistributions in binary form must reproduce the above copyright
    10  *    notice, this list of conditions and the following disclaimer in the
    11  *    documentation and/or other materials provided with the distribution.
    12  *
    13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
    14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
    17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
    21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
    24  */
    // Canonicalize a UTF-16 code unit for case-insensitive matching.
    // See ES 5.1, 15.10.2.8
    //
    // ch: numeric code unit.
    // Returns the code unit of ch's uppercase form, or ch itself when the
    // uppercase form is longer than one code unit or would turn a non-ASCII
    // character (>= 128) into an ASCII one (< 128).
    function canonicalize(ch)
    {
        var upper = String.fromCharCode(ch).toUpperCase();
        var upperCode = upper.charCodeAt(0);

        // Uppercasing expanded into several code units (e.g. U+00DF -> "SS").
        var expands = upper.length > 1;
        // A non-ASCII input must never canonicalize to an ASCII character.
        var fallsIntoAscii = ch >= 128 && upperCode < 128;

        return (expands || fallsIntoAscii) ? ch : upperCode;
    }
    // Largest code unit considered overall, and largest Latin-1 code unit.
    var MAX_UCS2 = 0xFFFF;
    var MAX_LATIN = 0xFF;

    // Pass 1: populate groupedCanonically - a (sparse) mapping from each
    // canonicalized value back to the list of character codes that canonicalize
    // to it. Codes are appended in ascending order.
    var groupedCanonically = [];
    var i = 0;
    while (i <= MAX_UCS2) {
        var ch = canonicalize(i);
        // Create the bucket the first time a canonical value is seen.
        groupedCanonically[ch] = groupedCanonically[ch] || [];
        groupedCanonically[ch].push(i);
        ++i;
    }
    var typeInfo = [];
    var latinTypeInfo = [];
    var characterSetInfo = [];
    // Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. Every character
    // gets a "Type:payload" string in typeInfo (relation to the other characters
    // sharing its canonical value) and one in latinTypeInfo (the same relation
    // restricted to characters <= MAX_LATIN). Groups of three or more characters
    // are additionally recorded in characterSetInfo.
    //
    // The arrays are walked by index with declared loop variables: for-in would
    // also visit enumerable properties added to Array.prototype, and assigning
    // to an undeclared loop variable is a ReferenceError in strict mode.
    // Ascending index order keeps the numbering of characterSetInfo entries stable.
    for (var cu = 0; cu < groupedCanonically.length; ++cu) {
        // The set of characters that canonicalize to cu. groupedCanonically is
        // sparse; values that nothing canonicalizes to are holes and are skipped.
        var characters = groupedCanonically[cu];
        if (!characters)
            continue;

        var k;

        // If there is only one, it is unique.
        if (characters.length === 1) {
            typeInfo[characters[0]] = "CanonicalizeUnique:0";
            latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
            continue;
        }

        // Sort the array numerically so characters[0] is the smallest member.
        characters.sort(function(x, y) { return x - y; });

        // If there are more than two characters, create an entry in characterSetInfo.
        if (characters.length > 2) {
            // Payload is the index this set is about to receive in characterSetInfo.
            var setIndex = characterSetInfo.length;
            for (k = 0; k < characters.length; ++k)
                typeInfo[characters[k]] = "CanonicalizeSet:" + setIndex;
            characterSetInfo.push(characters);

            if (characters[1] <= MAX_LATIN)
                throw new Error("sets with more than one latin character not supported!");
            if (characters[0] <= MAX_LATIN) {
                // Exactly one latin member: the others point at it, it maps to itself.
                for (k = 0; k < characters.length; ++k)
                    latinTypeInfo[characters[k]] = "CanonicalizeLatinOther:" + characters[0];
                latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
            } else {
                for (k = 0; k < characters.length; ++k)
                    latinTypeInfo[characters[k]] = "CanonicalizeLatinInvalid:0";
            }

            continue;
        }

        // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
        var lo = characters[0];
        var hi = characters[1];
        var delta = hi - lo;
        if (delta === 1) {
            // Adjacent pair: "Aligned" when the low partner is even, "Unaligned" when odd.
            var type = (lo & 1) ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
            typeInfo[lo] = type;
            typeInfo[hi] = type;
        } else {
            // Payload is the distance between the two partners.
            typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
            typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
        }

        if (lo > MAX_LATIN) {
            // Neither partner is latin.
            latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
            latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
        } else if (hi > MAX_LATIN) {
            // Only the low partner is latin; the high one refers back to it.
            latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
            latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
        } else {
            // Both latin: only pairs that differ exactly by the 0x20 bit are supported.
            if (delta !== 0x20 || (lo & 0x20))
                throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
            latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
            latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
        }
    }
   // Coalesce a per-character table of type strings into maximal runs.
   // types: array indexed by character code (0..MAX_UCS2).
   // Returns an array of {begin, end, type} objects (end inclusive) that
   // together cover 0..MAX_UCS2 in ascending order.
   function coalesceRanges(types)
   {
       var ranges = [];
       var begin = 0;
       while (begin <= MAX_UCS2) {
           var type = types[begin];
           var end = begin;
           // Extend the run while the next character carries the same type string.
           while (end < MAX_UCS2 && types[end + 1] === type)
               ++end;
           ranges.push({begin: begin, end: end, type: type});
           begin = end + 1;
       }
       return ranges;
   }

   // Pass 3: coalesce types into ranges.
   var rangeInfo = coalesceRanges(typeInfo);

   // Pass 4: coalesce latin-1 types into ranges.
   var latinRangeInfo = coalesceRanges(latinTypeInfo);
   // Format a number as a C uint16_t hex literal: "0x", at least four lowercase
   // hex digits (left-padded with zeros), and a "u" suffix.
   // x: a number, or a string that Number() can convert.
   function hex(x)
   {
       var digits = Number(x).toString(16);
       // substring() yields exactly the zeros still missing, or "" when digits
       // already has four or more characters.
       var padding = "0000".substring(digits.length);
       return "0x" + padding + digits + "u";
   }
   // Licence header written at the top of the generated file: one array entry
   // per output line, joined with newlines (no trailing newline).
   var copyright = [
       "/*",
       " * Copyright (C) 2012 Apple Inc. All rights reserved.",
       " *",
       " * Redistribution and use in source and binary forms, with or without",
       " * modification, are permitted provided that the following conditions",
       " * are met:",
       " * 1. Redistributions of source code must retain the above copyright",
       " *    notice, this list of conditions and the following disclaimer.",
       " * 2. Redistributions in binary form must reproduce the above copyright",
       " *    notice, this list of conditions and the following disclaimer in the",
       " *    documentation and/or other materials provided with the distribution.",
       " *",
       " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
       " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
       " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
       " * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR",
       " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
       " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
       " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
       " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
       " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
       " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
       " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
       " */"
   ].join("\n");
   // Print one C initializer row per range: begin, end, payload, type name.
   // Each range's type string has the form "TypeName:payload".
   function printRangeRows(ranges)
   {
       ranges.forEach(function (range) {
           var typeAndValue = range.type.split(':');
           print("    { " + hex(range.begin) + ", " + hex(range.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
       });
   }

   // Emit the generated C++ source through the shell's print().
   print(copyright);
   print();
   print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
   print();
   print('#include "yarr/YarrCanonicalizeUCS2.h"');
   print();
   print('#include <stddef.h>');
   print();
   print("namespace JSC { namespace Yarr {");
   print();

   // One zero-terminated uint16_t array per canonicalization set.
   characterSetInfo.forEach(function (members, setIndex) {
       var initializers = members.map(function (code) {
           return hex(code) + ", ";
       }).join("");
       print("uint16_t ucs2CharacterSet" + setIndex + "[] = { " + initializers + "0 };");
   });
   print();

   // Table of pointers to the per-set arrays, in set-index order.
   print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
   print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
   characterSetInfo.forEach(function (members, setIndex) {
       print("    ucs2CharacterSet" + setIndex + ",");
   });
   print("};");
   print();

   // Range table for the full UCS2 classification.
   print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
   print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
   printRangeRows(rangeInfo);
   print("};");
   print();

   // Range table for the latin-1 classification.
   print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
   print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
   printRangeRows(latinRangeInfo);
   print("};");
   print();
   print("} } // JSC::Yarr");
   print();

mercurial