js/src/yarr/YarrCanonicalizeUCS2.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/src/yarr/YarrCanonicalizeUCS2.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,141 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99:
     1.6 + *
     1.7 + * Copyright (C) 2012 Apple Inc. All rights reserved.
     1.8 + *
     1.9 + * Redistribution and use in source and binary forms, with or without
    1.10 + * modification, are permitted provided that the following conditions
    1.11 + * are met:
    1.12 + * 1. Redistributions of source code must retain the above copyright
    1.13 + *    notice, this list of conditions and the following disclaimer.
    1.14 + * 2. Redistributions in binary form must reproduce the above copyright
    1.15 + *    notice, this list of conditions and the following disclaimer in the
    1.16 + *    documentation and/or other materials provided with the distribution.
    1.17 + *
    1.18 + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
    1.19 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    1.20 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    1.21 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
    1.22 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    1.23 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    1.24 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    1.25 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
    1.26 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    1.27 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    1.28 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.29 + */
    1.30 +
    1.31 +#ifndef yarr_YarrCanonicalizeUCS2_h
    1.32 +#define yarr_YarrCanonicalizeUCS2_h
    1.33 +
    1.34 +#include <stdint.h>
    1.35 +
    1.36 +#include "yarr/wtfbridge.h"
    1.37 +
    1.38 +namespace JSC { namespace Yarr {
    1.39 +
    1.40 +// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp)
    1.41 +// provides, for each UCS2 code point, the set of code points that it should match under the
    1.42 +// ES5.1 case-insensitive RegExp matching rules, specified in section 15.10.2.8.
    1.43 +enum UCS2CanonicalizationType {
    1.44 +    CanonicalizeUnique,               // No canonically equal values, e.g. 0x0.
    1.45 +    CanonicalizeSet,                  // Value is an index into characterSetInfo, selecting a zero-terminated set.
    1.46 +    CanonicalizeRangeLo,              // Value is the positive delta to add to reach the pair, e.g. 0x41 has value 0x20 -> 0x61.
    1.47 +    CanonicalizeRangeHi,              // Value is the positive delta to subtract to reach the pair, e.g. 0x61 has value 0x20 -> 0x41.
    1.48 +    CanonicalizeAlternatingAligned,   // Aligned consecutive pair (even, odd), e.g. 0x1f4,0x1f5.
    1.49 +    CanonicalizeAlternatingUnaligned  // Unaligned consecutive pair (odd, even), e.g. 0x241,0x242.
    1.50 +};
    1.51 +struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; }; // [begin, end] is inclusive; type is a UCS2CanonicalizationType that says how to read value.
    1.52 +extern const size_t UCS2_CANONICALIZATION_RANGES; // Number of rangeInfo entries, as consumed by rangeInfoFor.
    1.53 +extern const uint16_t* const characterSetInfo[]; // Zero-terminated code point sets, indexed by the value of a CanonicalizeSet range.
    1.54 +extern const UCS2CanonicalizationRange rangeInfo[]; // Range table that rangeInfoFor binary-searches.
    1.55 +
    1.56 +// This table is similar to the full rangeInfo table; however, it maps from UCS2 code points to
    1.57 +// the set of Latin1 code points that could match.
    1.58 +enum LatinCanonicalizationType {
    1.59 +    CanonicalizeLatinSelf,     // This character is in the Latin1 range, but has no canonical equivalent in the range.
    1.60 +    CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20.
    1.61 +    CanonicalizeLatinOther,    // This character is not in the Latin1 range, but canonicalizes to another that is.
    1.62 +    CanonicalizeLatinInvalid   // Cannot match against Latin1 input.
    1.63 +};
    1.64 +struct LatinCanonicalizationRange { uint16_t begin, end, value, type; }; // Same field layout as UCS2CanonicalizationRange; type is presumably a LatinCanonicalizationType (confirm in the .cpp).
    1.65 +extern const size_t LATIN_CANONICALIZATION_RANGES; // Presumably the number of latinRangeInfo entries (not used in this header; confirm in the .cpp).
    1.66 +extern const LatinCanonicalizationRange latinRangeInfo[]; // Latin1 range table; defined outside this header.
    1.67 +
    1.68 +// Binary-searches rangeInfo for the entry containing ch; over ~364 entries that is typically 8 compares.
    1.69 +inline const UCS2CanonicalizationRange* rangeInfoFor(UChar ch)
    1.70 +{
    1.71 +    const UCS2CanonicalizationRange* base = rangeInfo;
    1.72 +    size_t remaining = UCS2_CANONICALIZATION_RANGES;
    1.73 +    // The loop only leaves by returning a match, so it relies on some entry containing ch.
    1.74 +    for (;;) {
    1.75 +        const size_t half = remaining / 2;
    1.76 +        const UCS2CanonicalizationRange* probe = &base[half];
    1.77 +        if (ch >= probe->begin && ch <= probe->end)
    1.78 +            return probe;
    1.79 +        if (ch < probe->begin) {
    1.80 +            remaining = half;          // Keep only the entries below the probe.
    1.81 +            continue;
    1.82 +        }
    1.83 +        base = probe + 1;              // Otherwise resume just past the probe.
    1.84 +        remaining -= half + 1;
    1.85 +    }
    1.86 +}
    1.87 +
    1.88 +// Returns the single canonical partner of ch; only call this for characters that have exactly one.
    1.89 +inline UChar getCanonicalPair(const UCS2CanonicalizationRange* info, UChar ch)
    1.90 +{
    1.91 +    ASSERT(ch >= info->begin && ch <= info->end);
    1.92 +    switch (info->type) {
    1.93 +    case CanonicalizeAlternatingUnaligned:
    1.94 +        return ((ch - 1) ^ 1) + 1;   // (odd, even) pairs: flip the low bit of ch - 1.
    1.95 +    case CanonicalizeAlternatingAligned:
    1.96 +        return ch ^ 1;               // (even, odd) pairs: flip the low bit of ch.
    1.97 +    case CanonicalizeRangeHi:
    1.98 +        return ch - info->value;     // Partner lies info->value below ch.
    1.99 +    case CanonicalizeRangeLo:
   1.100 +        return ch + info->value;     // Partner lies info->value above ch.
   1.101 +    default:
   1.102 +        ASSERT_NOT_REACHED();        // Unique and Set entries have no single partner.
   1.103 +    }
   1.104 +    ASSERT_NOT_REACHED();
   1.105 +    return 0;
   1.106 +}
   1.107 +
   1.108 +// Reports whether ch is canonically unique, i.e. no other UCS2 code point can match it.
   1.109 +inline bool isCanonicallyUnique(UChar ch)
   1.110 +{
   1.111 +    return CanonicalizeUnique == rangeInfoFor(ch)->type;
   1.112 +}
   1.113 +
   1.114 +// Returns true if b is one of the code points that a's range entry treats as equal to a.
   1.115 +inline bool areCanonicallyEquivalent(UChar a, UChar b)
   1.116 +{
   1.117 +    const UCS2CanonicalizationRange* entry = rangeInfoFor(a);
   1.118 +    const uint16_t kind = entry->type;
   1.119 +    if (kind == CanonicalizeUnique)
   1.120 +        return a == b;
   1.121 +    if (kind == CanonicalizeRangeLo)
   1.122 +        return a == b || (a + entry->value) == b;
   1.123 +    if (kind == CanonicalizeRangeHi)
   1.124 +        return a == b || (a - entry->value) == b;
   1.125 +    if (kind == CanonicalizeAlternatingAligned)
   1.126 +        return (a | 1) == (b | 1);
   1.127 +    if (kind == CanonicalizeAlternatingUnaligned)
   1.128 +        return ((a - 1) | 1) == ((b - 1) | 1);
   1.129 +    if (kind == CanonicalizeSet) {
   1.130 +        // entry->value selects a set in characterSetInfo; scan it up to its zero terminator.
   1.131 +        const uint16_t* cursor = characterSetInfo[entry->value];
   1.132 +        for (UChar member = *cursor; member; member = *++cursor) {
   1.133 +            if (member == b)
   1.134 +                return true;
   1.135 +        }
   1.136 +        return false;
   1.137 +    }
   1.138 +    ASSERT_NOT_REACHED();
   1.139 +    return false;
   1.140 +}
   1.141 +
   1.142 +} } // JSC::Yarr
   1.143 +
   1.144 +#endif /* yarr_YarrCanonicalizeUCS2_h */

mercurial