intl/icu/source/common/unisetspan.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2007, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  unisetspan.h
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2007mar01
    14 *   created by: Markus W. Scherer
    15 */
    17 #ifndef __UNISETSPAN_H__
    18 #define __UNISETSPAN_H__
    20 #include "unicode/utypes.h"
    21 #include "unicode/uniset.h"
    23 U_NAMESPACE_BEGIN
    25 /*
    26  * Implement span() etc. for a set with strings.
    27  * Avoid recursion because of its exponential complexity.
    28  * Instead, try multiple paths at once and track them with an IndexList.
    29  */
    30 class UnicodeSetStringSpan : public UMemory {
    31 public:
    32     /*
    33      * Which span() variant will be used?
    34      * The object is either built for one variant and used once,
    35      * or built for all and may be used many times.
    36      */
    37     enum {
    38         FWD             = 0x20,
    39         BACK            = 0x10,
    40         UTF16           = 8,
    41         UTF8            = 4,
    42         CONTAINED       = 2,
    43         NOT_CONTAINED   = 1,
    45         ALL             = 0x3f,
    47         FWD_UTF16_CONTAINED     = FWD  | UTF16 |     CONTAINED,
    48         FWD_UTF16_NOT_CONTAINED = FWD  | UTF16 | NOT_CONTAINED,
    49         FWD_UTF8_CONTAINED      = FWD  | UTF8  |     CONTAINED,
    50         FWD_UTF8_NOT_CONTAINED  = FWD  | UTF8  | NOT_CONTAINED,
    51         BACK_UTF16_CONTAINED    = BACK | UTF16 |     CONTAINED,
    52         BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
    53         BACK_UTF8_CONTAINED     = BACK | UTF8  |     CONTAINED,
    54         BACK_UTF8_NOT_CONTAINED = BACK | UTF8  | NOT_CONTAINED
    55     };
    57     UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
    59     // Copy constructor. Assumes which==ALL for a frozen set.
    60     UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
    62     ~UnicodeSetStringSpan();
    64     /*
    65      * Do the strings need to be checked in span() etc.?
    66      * @return TRUE if strings need to be checked (call span() here),
    67      *         FALSE if not (use a BMPSet for best performance).
    68      */
    69     inline UBool needsStringSpanUTF16();
    70     inline UBool needsStringSpanUTF8();
    72     // For fast UnicodeSet::contains(c).
    73     inline UBool contains(UChar32 c) const;
    75     int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
    77     int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
    79     int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
    81     int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
    83 private:
    84     // Special spanLength byte values.
    85     enum {
    86         // The spanLength is >=0xfe.
    87         LONG_SPAN=0xfe,
    88         // All code points in the string are contained in the parent set.
    89         ALL_CP_CONTAINED=0xff
    90     };
    92     // Add a starting or ending string character to the spanNotSet
    93     // so that a character span ends before any string.
    94     void addToSpanNotSet(UChar32 c);
    96     int32_t spanNot(const UChar *s, int32_t length) const;
    97     int32_t spanNotBack(const UChar *s, int32_t length) const;
    98     int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
    99     int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
   101     // Set for span(). Same as parent but without strings.
   102     UnicodeSet spanSet;
   104     // Set for span(not contained).
   105     // Same as spanSet, plus characters that start or end strings.
   106     UnicodeSet *pSpanNotSet;
   108     // The strings of the parent set.
   109     const UVector &strings;
   111     // Pointer to the UTF-8 string lengths.
   112     // Also pointer to further allocated storage for meta data and
   113     // UTF-8 string contents as necessary.
   114     int32_t *utf8Lengths;
   116     // Pointer to the part of the (utf8Lengths) memory block that stores
   117     // the lengths of span(), spanBack() etc. for each string.
   118     uint8_t *spanLengths;
   120     // Pointer to the part of the (utf8Lengths) memory block that stores
   121     // the UTF-8 versions of the parent set's strings.
   122     uint8_t *utf8;
   124     // Number of bytes for all UTF-8 versions of strings together.
   125     int32_t utf8Length;
   127     // Maximum lengths of relevant strings.
   128     int32_t maxLength16;
   129     int32_t maxLength8;
   131     // Set up for all variants of span()?
   132     UBool all;
   134     // Memory for small numbers and lengths of strings.
   135     // For example, for 8 strings:
   136     // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
   137     // = 112 bytes = int32_t[28].
   138     int32_t staticLengths[32];
   139 };
   141 UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
   142     return (UBool)(maxLength16!=0);
   143 }
   145 UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
   146     return (UBool)(maxLength8!=0);
   147 }
   149 UBool UnicodeSetStringSpan::contains(UChar32 c) const {
   150     return spanSet.contains(c);
   151 }
   153 U_NAMESPACE_END
   155 #endif

mercurial