intl/icu/source/tools/toolutil/ppucd.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/toolutil/ppucd.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,174 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*   Copyright (C) 2011-2013, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +*******************************************************************************
     1.9 +*   file name:  ppucd.h
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2011dec11
    1.15 +*   created by: Markus W. Scherer
    1.16 +*/
    1.17 +
    1.18 +#ifndef __PPUCD_H__
    1.19 +#define __PPUCD_H__
    1.20 +
    1.21 +#include "unicode/utypes.h"
    1.22 +#include "unicode/uniset.h"
    1.23 +#include "unicode/unistr.h"
    1.24 +
    1.25 +#include <stdio.h>
    1.26 +
    1.27 +/** Additions to the uchar.h enum UProperty: extra property codes numbered upward from UCHAR_STRING_LIMIT. */
    1.28 +enum {
    1.29 +    /** Name_Alias */
    1.30 +    PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,  // first extra code; the two enumerators below follow consecutively
    1.31 +    PPUCD_CONDITIONAL_CASE_MAPPINGS,  // presumably the UCD Conditional_Case_Mappings data -- confirm against the implementation
    1.32 +    PPUCD_TURKIC_CASE_FOLDING  // presumably the Turkic variant of Case_Folding -- confirm against the implementation
    1.33 +};
    1.34 +
    1.35 +U_NAMESPACE_BEGIN
    1.36 +
    1.37 +class U_TOOLUTIL_API PropertyNames {  // Looks up integer codes for property names and for property value names.
    1.38 +public:
    1.39 +    virtual ~PropertyNames();  // virtual: instances are handled through PropertyNames pointers (see PreparsedUCD)
    1.40 +    virtual int32_t getPropertyEnum(const char *name) const;  // property name -> property code; result for unknown names is not visible here -- TODO confirm
    1.41 +    virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;  // value name of the given property -> value code; result for unknown names is not visible here -- TODO confirm
    1.42 +};
    1.43 +
    1.44 +struct U_TOOLUTIL_API UniProps {  // Property values for one code point or code point range, as returned by PreparsedUCD::getProps().
    1.45 +    UniProps();
    1.46 +    ~UniProps();
    1.47 +
    1.48 +    int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }  // prop is a UProperty code, rebased to index intProps; no bounds check, so prop must be in [UCHAR_INT_START, UCHAR_INT_LIMIT)
    1.49 +
    1.50 +    UChar32 start, end;  // code point range these values apply to; presumably inclusive -- TODO confirm
    1.51 +    UBool binProps[UCHAR_BINARY_LIMIT];  // one slot per binary property code below UCHAR_BINARY_LIMIT
    1.52 +    int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];  // one slot per int property; index is (property code - UCHAR_INT_START), see getIntProp()
    1.53 +    UVersionInfo age;
    1.54 +    UChar32 bmg, bpb;  // presumably Bidi_Mirroring_Glyph and Bidi_Paired_Bracket (UCD short aliases) -- confirm
    1.55 +    UChar32 scf, slc, stc, suc;  // presumably simple case folding / lowercase / titlecase / uppercase mappings (UCD short aliases) -- confirm
    1.56 +    int32_t digitValue;
    1.57 +    const char *numericValue;  // kept as text, not parsed into a number in this struct
    1.58 +    const char *name;  // NOTE(review): the const char * fields are not obviously owned; they likely point into PreparsedUCD line buffers -- confirm lifetime
    1.59 +    const char *nameAlias;
    1.60 +    UnicodeString cf, lc, tc, uc;  // presumably full case folding / lowercase / titlecase / uppercase mappings (UCD short aliases) -- confirm
    1.61 +    UnicodeSet scx;  // presumably Script_Extensions, stored as a set -- confirm
    1.62 +};
    1.63 +
    1.64 +class U_TOOLUTIL_API PreparsedUCD {  // Line-by-line reader and parser for a preparsed UCD file.
    1.65 +public:
    1.66 +    enum LineType {  // DEFAULTS_LINE..CP_LINE must stay contiguous and in this order: lineHasPropertyValues() range-checks them
    1.67 +        /** No line, end of file. */
    1.68 +        NO_LINE,
    1.69 +        /** Empty line. (Might contain a comment.) */
    1.70 +        EMPTY_LINE,
    1.71 +
    1.72 +        /** ucd;6.1.0 */
    1.73 +        UNICODE_VERSION_LINE,
    1.74 +
    1.75 +        /** property;Binary;Alpha;Alphabetic */
    1.76 +        PROPERTY_LINE,
    1.77 +        /** binary;N;No;F;False */
    1.78 +        BINARY_LINE,
    1.79 +        /** value;gc;Zs;Space_Separator */
    1.80 +        VALUE_LINE,
    1.81 +
    1.82 +        /** defaults;0000..10FFFF;age=NA;bc=L;... */
    1.83 +        DEFAULTS_LINE,
    1.84 +        /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
    1.85 +        BLOCK_LINE,
    1.86 +        /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
    1.87 +        CP_LINE,
    1.88 +
    1.89 +        /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
    1.90 +        ALG_NAMES_RANGE_LINE,
    1.91 +
    1.92 +        LINE_TYPE_COUNT  // number of line types above; not itself a line type
    1.93 +    };
    1.94 +
    1.95 +    /**
    1.96 +     * Constructor.
    1.97 +     * Takes the name of the preparsed UCD file to read; failures are presumably reported via errorCode (TODO confirm).
    1.98 +     */
    1.99 +    PreparsedUCD(const char *filename, UErrorCode &errorCode);
   1.100 +
   1.101 +    /** Destructor. */
   1.102 +    ~PreparsedUCD();
   1.103 +
   1.104 +    /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
   1.105 +    void setPropertyNames(const PropertyNames *pn) { pnames=pn; }  // only replaces the aliased pointer; icuPnames is left untouched
   1.106 +
   1.107 +    /**
   1.108 +     * Reads a line from the preparsed UCD file.
   1.109 +     * Splits the line by replacing each ';' with a NUL.
   1.110 +     */
   1.111 +    LineType readLine(UErrorCode &errorCode);
   1.112 +
   1.113 +    /** Returns the number of the line read by readLine(). */
   1.114 +    int32_t getLineNumber() const { return lineNumber; }
   1.115 +
   1.116 +    /** Returns the line's next field, or NULL. */
   1.117 +    const char *nextField();
   1.118 +
   1.119 +    /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
   1.120 +    const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
   1.121 +
   1.122 +    /** Returns TRUE if the current line has property values. */
   1.123 +    UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; }  // true for defaults, block and cp lines; depends on LineType enumerator order
   1.124 +
   1.125 +    /**
   1.126 +     * Parses properties from the current line.
   1.127 +     * Clears newValues and sets UProperty codes for property values mentioned
   1.128 +     * on the current line (as opposed to being inherited).
   1.129 +     * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
   1.130 +     * The returned UniProps are usable until the next line of the same type is read.
   1.131 +     */
   1.132 +    const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
   1.133 +
   1.134 +    /**
   1.135 +     * Returns the code point range for the current algnamesrange line.
   1.136 +     * Calls & parses nextField().
   1.137 +     * Further nextField() calls will yield the range's type & prefix string.
   1.138 +     * Returns U_SUCCESS(errorCode).
   1.139 +     */
   1.140 +    UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
   1.141 +
   1.142 +private:
   1.143 +    UBool isLineBufferAvailable(int32_t i) {  // true when buffer i holds neither the current defaults line nor the current block line
   1.144 +        return defaultLineIndex!=i && blockLineIndex!=i;
   1.145 +    }
   1.146 +
   1.147 +    /** Resets the field iterator and returns the line's first field (the line type field). */
   1.148 +    const char *firstField();
   1.149 +
   1.150 +    UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
   1.151 +                        UErrorCode &errorCode);  // parsing helpers below are declared only; their behavior lives in the implementation file
   1.152 +    UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
   1.153 +    UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
   1.154 +    void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
   1.155 +    void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
   1.156 +
   1.157 +    static const int32_t kNumLineBuffers=3;  // presumably one buffer each for the defaults line, the block line and the line being read -- confirm
   1.158 +
   1.159 +    PropertyNames *icuPnames;  // owned. NOTE(review): with an owned pointer and a FILE *, no copy constructor/assignment is declared; copying an instance looks unsafe -- confirm
   1.160 +    const PropertyNames *pnames;  // aliased; replaced by setPropertyNames()
   1.161 +    FILE *file;
   1.162 +    int32_t defaultLineIndex, blockLineIndex, lineIndex;  // presumably indexes into lines[]; see isLineBufferAvailable()
   1.163 +    int32_t lineNumber;  // reported by getLineNumber()
   1.164 +    LineType lineType;  // type of the current line; tested by lineHasPropertyValues()
   1.165 +    char *fieldLimit;
   1.166 +    char *lineLimit;
   1.167 +
   1.168 +    UVersionInfo ucdVersion;  // returned by getUnicodeVersion()
   1.169 +    UniProps defaultProps, blockProps, cpProps;
   1.170 +    // Multiple lines so that default and block properties can maintain pointers
   1.171 +    // into their line buffers. Each buffer has a fixed size of 4096 chars.
   1.172 +    char lines[kNumLineBuffers][4096];
   1.173 +};
   1.174 +
   1.175 +U_NAMESPACE_END
   1.176 +
   1.177 +#endif  // __PPUCD_H__

mercurial