intl/icu/source/tools/toolutil/ppucd.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*
     2 *******************************************************************************
     3 *   Copyright (C) 2011-2013, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 *******************************************************************************
     6 *   file name:  ppucd.cpp
     7 *   encoding:   US-ASCII
     8 *   tab size:   8 (not used)
     9 *   indentation:4
    10 *
    11 *   created on: 2011dec11
    12 *   created by: Markus W. Scherer
    13 */
    15 #include "unicode/utypes.h"
    16 #include "unicode/uchar.h"
    17 #include "charstr.h"
    18 #include "cstring.h"
    19 #include "ppucd.h"
    20 #include "uassert.h"
    21 #include "uparse.h"
    23 #include <stdio.h>
    24 #include <string.h>
    26 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    28 U_NAMESPACE_BEGIN
    30 PropertyNames::~PropertyNames() {}
    32 int32_t
    33 PropertyNames::getPropertyEnum(const char *name) const {
    34     return u_getPropertyEnum(name);
    35 }
    37 int32_t
    38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
    39     return u_getPropertyValueEnum((UProperty)property, name);
    40 }
    42 UniProps::UniProps()
    43         : start(U_SENTINEL), end(U_SENTINEL),
    44           bmg(U_SENTINEL), bpb(U_SENTINEL),
    45           scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
    46           digitValue(-1), numericValue(NULL),
    47           name(NULL), nameAlias(NULL) {
    48     memset(binProps, 0, sizeof(binProps));
    49     memset(intProps, 0, sizeof(intProps));
    50     memset(age, 0, 4);
    51 }
    53 UniProps::~UniProps() {}
       // Namespace-scope definition of the static class constant (no initializer here).
       // NOTE(review): presumably the value is given at the in-class declaration and this
       // definition exists for ODR-uses under pre-C++17 rules; confirm against ppucd.h.
    55 const int32_t PreparsedUCD::kNumLineBuffers;
    57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
    58         : icuPnames(new PropertyNames()), pnames(icuPnames),
    59           file(NULL),
    60           defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
    61           lineNumber(0),
    62           lineType(NO_LINE),
    63           fieldLimit(NULL), lineLimit(NULL) {
    64     if(U_FAILURE(errorCode)) { return; }
    66     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
    67         filename=NULL;
    68         file=stdin;
    69     } else {
    70         file=fopen(filename, "r");
    71     }
    72     if(file==NULL) {
    73         perror("error opening preparsed UCD");
    74         fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
    75         errorCode=U_FILE_ACCESS_ERROR;
    76         return;
    77     }
    79     memset(ucdVersion, 0, 4);
    80     lines[0][0]=0;
    81 }
    83 PreparsedUCD::~PreparsedUCD() {
    84     if(file!=stdin) {
    85         fclose(file);
    86     }
    87     delete icuPnames;
    88 }
    90 // Same order as the LineType values.
    91 static const char *lineTypeStrings[]={
    92     NULL,
    93     NULL,
    94     "ucd",
    95     "property",
    96     "binary",
    97     "value",
    98     "defaults",
    99     "block",
   100     "cp",
   101     "algnamesrange"
   102 };
   104 PreparsedUCD::LineType
   105 PreparsedUCD::readLine(UErrorCode &errorCode) {
   106     if(U_FAILURE(errorCode)) { return NO_LINE; }
   107     // Select the next available line buffer.
   108     while(!isLineBufferAvailable(lineIndex)) {
   109         ++lineIndex;
   110         if (lineIndex == kNumLineBuffers) {
   111             lineIndex = 0;
   112         }
   113     }
   114     char *line=lines[lineIndex];
   115     *line=0;
   116     lineLimit=fieldLimit=line;
   117     lineType=NO_LINE;
   118     char *result=fgets(line, sizeof(lines[0]), file);
   119     if(result==NULL) {
   120         if(ferror(file)) {
   121             perror("error reading preparsed UCD");
   122             fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
   123             errorCode=U_FILE_ACCESS_ERROR;
   124         }
   125         return NO_LINE;
   126     }
   127     ++lineNumber;
   128     if(*line=='#') {
   129         fieldLimit=strchr(line, 0);
   130         return lineType=EMPTY_LINE;
   131     }
   132     // Remove trailing /r/n.
   133     char c;
   134     char *limit=strchr(line, 0);
   135     while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
   136     // Remove trailing white space.
   137     while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
   138     *limit=0;
   139     lineLimit=limit;
   140     if(line==limit) {
   141         fieldLimit=limit;
   142         return lineType=EMPTY_LINE;
   143     }
   144     // Split by ';'.
   145     char *semi=line;
   146     while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
   147     fieldLimit=strchr(line, 0);
   148     // Determine the line type.
   149     int32_t type;
   150     for(type=EMPTY_LINE+1;; ++type) {
   151         if(type==LINE_TYPE_COUNT) {
   152             fprintf(stderr,
   153                     "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
   154                     line, (long)lineNumber);
   155             errorCode=U_PARSE_ERROR;
   156             return NO_LINE;
   157         }
   158         if(0==strcmp(line, lineTypeStrings[type])) {
   159             break;
   160         }
   161     }
   162     lineType=(LineType)type;
   163     if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
   164         u_versionFromString(ucdVersion, fieldLimit+1);
   165     }
   166     return lineType;
   167 }
   169 const char *
   170 PreparsedUCD::firstField() {
   171     char *field=lines[lineIndex];
   172     fieldLimit=strchr(field, 0);
   173     return field;
   174 }
   176 const char *
   177 PreparsedUCD::nextField() {
   178     if(fieldLimit==lineLimit) { return NULL; }
   179     char *field=fieldLimit+1;
   180     fieldLimit=strchr(field, 0);
   181     return field;
   182 }
   184 const UniProps *
   185 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
   186     if(U_FAILURE(errorCode)) { return NULL; }
   187     newValues.clear();
   188     if(!lineHasPropertyValues()) {
   189         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   190         return NULL;
   191     }
   192     firstField();
   193     const char *field=nextField();
   194     if(field==NULL) {
   195         // No range field after the type.
   196         fprintf(stderr,
   197                 "error in preparsed UCD: missing default/block/cp range field "
   198                 "(no second field) on line %ld\n",
   199                 (long)lineNumber);
   200         errorCode=U_PARSE_ERROR;
   201         return NULL;
   202     }
   203     UChar32 start, end;
   204     if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
   205     UniProps *props;
   206     switch(lineType) {
   207     case DEFAULTS_LINE:
   208         if(defaultLineIndex>=0) {
   209             fprintf(stderr,
   210                     "error in preparsed UCD: second line with default properties on line %ld\n",
   211                     (long)lineNumber);
   212             errorCode=U_PARSE_ERROR;
   213             return NULL;
   214         }
   215         if(start!=0 || end!=0x10ffff) {
   216             fprintf(stderr,
   217                     "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
   218                     field, (long)lineNumber);
   219             errorCode=U_PARSE_ERROR;
   220             return NULL;
   221         }
   222         props=&defaultProps;
   223         defaultLineIndex=lineIndex;
   224         break;
   225     case BLOCK_LINE:
   226         blockProps=defaultProps;  // Block inherits default properties.
   227         props=&blockProps;
   228         blockLineIndex=lineIndex;
   229         break;
   230     case CP_LINE:
   231         if(blockProps.start<=start && end<=blockProps.end) {
   232             // Code point range fully inside the last block inherits the block properties.
   233             cpProps=blockProps;
   234         } else if(start>blockProps.end || end<blockProps.start) {
   235             // Code point range fully outside the last block inherits the default properties.
   236             cpProps=defaultProps;
   237         } else {
   238             // Code point range partially overlapping with the last block is illegal.
   239             fprintf(stderr,
   240                     "error in preparsed UCD: cp range %s on line %ld only "
   241                     "partially overlaps with block range %04lX..%04lX\n",
   242                     field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
   243             errorCode=U_PARSE_ERROR;
   244             return NULL;
   245         }
   246         props=&cpProps;
   247         break;
   248     default:
   249         // Will not occur because of the range check above.
   250         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   251         return NULL;
   252     }
   253     props->start=start;
   254     props->end=end;
   255     while((field=nextField())!=NULL) {
   256         if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
   257     }
   258     return props;
   259 }
       // Property names paired with PPUCD_* property constants.
       // parseProperty() searches this table (ignoring case, via uprv_stricmp) when
       // pnames->getPropertyEnum() returns a negative value for a name.
   261 static const struct {
   262     const char *name;
   263     int32_t prop;
   264 } ppucdProperties[]={
   265     { "Name_Alias", PPUCD_NAME_ALIAS },
   266     { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
   267     { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
   268 };
   270 // Returns TRUE for "ok to continue parsing fields".
   271 UBool
   272 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
   273                             UErrorCode &errorCode) {
   274     CharString pBuffer;
   275     const char *p=field;
   276     const char *v=strchr(p, '=');
   277     int binaryValue;
   278     if(*p=='-') {
   279         if(v!=NULL) {
   280             fprintf(stderr,
   281                     "error in preparsed UCD: mix of binary-property-no and "
   282                     "enum-property syntax '%s' on line %ld\n",
   283                     field, (long)lineNumber);
   284             errorCode=U_PARSE_ERROR;
   285             return FALSE;
   286         }
   287         binaryValue=0;
   288         ++p;
   289     } else if(v==NULL) {
   290         binaryValue=1;
   291     } else {
   292         binaryValue=-1;
   293         // Copy out the property name rather than modifying the field (writing a NUL).
   294         pBuffer.append(p, (int32_t)(v-p), errorCode);
   295         p=pBuffer.data();
   296         ++v;
   297     }
   298     int32_t prop=pnames->getPropertyEnum(p);
   299     if(prop<0) {
   300         for(int32_t i=0;; ++i) {
   301             if(i==LENGTHOF(ppucdProperties)) {
   302                 // Ignore unknown property names.
   303                 return TRUE;
   304             }
   305             if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
   306                 prop=ppucdProperties[i].prop;
   307                 U_ASSERT(prop>=0);
   308                 break;
   309             }
   310         }
   311     }
   312     if(prop<UCHAR_BINARY_LIMIT) {
   313         if(binaryValue>=0) {
   314             props.binProps[prop]=(UBool)binaryValue;
   315         } else {
   316             // No binary value for a binary property.
   317             fprintf(stderr,
   318                     "error in preparsed UCD: enum-property syntax '%s' "
   319                     "for binary property on line %ld\n",
   320                     field, (long)lineNumber);
   321             errorCode=U_PARSE_ERROR;
   322         }
   323     } else if(binaryValue>=0) {
   324         // Binary value for a non-binary property.
   325         fprintf(stderr,
   326                 "error in preparsed UCD: binary-property syntax '%s' "
   327                 "for non-binary property on line %ld\n",
   328                 field, (long)lineNumber);
   329         errorCode=U_PARSE_ERROR;
   330     } else if (prop < UCHAR_INT_START) {
   331         fprintf(stderr,
   332                 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
   333                 prop, (long)lineNumber);
   334         errorCode=U_PARSE_ERROR;
   335     } else if(prop<UCHAR_INT_LIMIT) {
   336         int32_t value=pnames->getPropertyValueEnum(prop, v);
   337         if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
   338             // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
   339             char *end;
   340             unsigned long ccc=uprv_strtoul(v, &end, 10);
   341             if(v<end && *end==0 && ccc<=254) {
   342                 value=(int32_t)ccc;
   343             }
   344         }
   345         if(value==UCHAR_INVALID_CODE) {
   346             fprintf(stderr,
   347                     "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
   348                     field, (long)lineNumber);
   349             errorCode=U_PARSE_ERROR;
   350         } else {
   351             props.intProps[prop-UCHAR_INT_START]=value;
   352         }
   353     } else if(*v=='<') {
   354         // Do not parse default values like <code point>, just set null values.
   355         switch(prop) {
   356         case UCHAR_BIDI_MIRRORING_GLYPH:
   357             props.bmg=U_SENTINEL;
   358             break;
   359         case UCHAR_BIDI_PAIRED_BRACKET:
   360             props.bpb=U_SENTINEL;
   361             break;
   362         case UCHAR_SIMPLE_CASE_FOLDING:
   363             props.scf=U_SENTINEL;
   364             break;
   365         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
   366             props.slc=U_SENTINEL;
   367             break;
   368         case UCHAR_SIMPLE_TITLECASE_MAPPING:
   369             props.stc=U_SENTINEL;
   370             break;
   371         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
   372             props.suc=U_SENTINEL;
   373             break;
   374         case UCHAR_CASE_FOLDING:
   375             props.cf.remove();
   376             break;
   377         case UCHAR_LOWERCASE_MAPPING:
   378             props.lc.remove();
   379             break;
   380         case UCHAR_TITLECASE_MAPPING:
   381             props.tc.remove();
   382             break;
   383         case UCHAR_UPPERCASE_MAPPING:
   384             props.uc.remove();
   385             break;
   386         case UCHAR_SCRIPT_EXTENSIONS:
   387             props.scx.clear();
   388             break;
   389         default:
   390             fprintf(stderr,
   391                     "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
   392                     field, (long)lineNumber);
   393             errorCode=U_PARSE_ERROR;
   394         }
   395     } else {
   396         char c;
   397         switch(prop) {
   398         case UCHAR_NUMERIC_VALUE:
   399             props.numericValue=v;
   400             c=*v;
   401             if('0'<=c && c<='9' && v[1]==0) {
   402                 props.digitValue=c-'0';
   403             } else {
   404                 props.digitValue=-1;
   405             }
   406             break;
   407         case UCHAR_NAME:
   408             props.name=v;
   409             break;
   410         case UCHAR_AGE:
   411             u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
   412             break;
   413         case UCHAR_BIDI_MIRRORING_GLYPH:
   414             props.bmg=parseCodePoint(v, errorCode);
   415             break;
   416         case UCHAR_BIDI_PAIRED_BRACKET:
   417             props.bpb=parseCodePoint(v, errorCode);
   418             break;
   419         case UCHAR_SIMPLE_CASE_FOLDING:
   420             props.scf=parseCodePoint(v, errorCode);
   421             break;
   422         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
   423             props.slc=parseCodePoint(v, errorCode);
   424             break;
   425         case UCHAR_SIMPLE_TITLECASE_MAPPING:
   426             props.stc=parseCodePoint(v, errorCode);
   427             break;
   428         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
   429             props.suc=parseCodePoint(v, errorCode);
   430             break;
   431         case UCHAR_CASE_FOLDING:
   432             parseString(v, props.cf, errorCode);
   433             break;
   434         case UCHAR_LOWERCASE_MAPPING:
   435             parseString(v, props.lc, errorCode);
   436             break;
   437         case UCHAR_TITLECASE_MAPPING:
   438             parseString(v, props.tc, errorCode);
   439             break;
   440         case UCHAR_UPPERCASE_MAPPING:
   441             parseString(v, props.uc, errorCode);
   442             break;
   443         case PPUCD_NAME_ALIAS:
   444             props.nameAlias=v;
   445             break;
   446         case PPUCD_CONDITIONAL_CASE_MAPPINGS:
   447         case PPUCD_TURKIC_CASE_FOLDING:
   448             // No need to parse their values: They are hardcoded in the runtime library.
   449             break;
   450         case UCHAR_SCRIPT_EXTENSIONS:
   451             parseScriptExtensions(v, props.scx, errorCode);
   452             break;
   453         default:
   454             // Ignore unhandled properties.
   455             return TRUE;
   456         }
   457     }
   458     if(U_SUCCESS(errorCode)) {
   459         newValues.add((UChar32)prop);
   460         return TRUE;
   461     } else {
   462         return FALSE;
   463     }
   464 }
   466 UBool
   467 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
   468     if(U_FAILURE(errorCode)) { return FALSE; }
   469     if(lineType!=ALG_NAMES_RANGE_LINE) {
   470         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   471         return FALSE;
   472     }
   473     firstField();
   474     const char *field=nextField();
   475     if(field==NULL) {
   476         // No range field after the type.
   477         fprintf(stderr,
   478                 "error in preparsed UCD: missing algnamesrange range field "
   479                 "(no second field) on line %ld\n",
   480                 (long)lineNumber);
   481         errorCode=U_PARSE_ERROR;
   482         return FALSE;
   483     }
   484     return parseCodePointRange(field, start, end, errorCode);
   485 }
   487 UChar32
   488 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
   489     char *end;
   490     uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
   491     if(end<=s || *end!=0 || value>=0x110000) {
   492         fprintf(stderr,
   493                 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
   494                 s, (long)lineNumber);
   495         errorCode=U_PARSE_ERROR;
   496         return U_SENTINEL;
   497     }
   498     return (UChar32)value;
   499 }
   501 UBool
   502 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
   503     uint32_t st, e;
   504     u_parseCodePointRange(s, &st, &e, &errorCode);
   505     if(U_FAILURE(errorCode)) {
   506         fprintf(stderr,
   507                 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
   508                 s, (long)lineNumber);
   509         return FALSE;
   510     }
   511     start=(UChar32)st;
   512     end=(UChar32)e;
   513     return TRUE;
   514 }
   516 void
   517 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
   518     UChar *buffer=uni.getBuffer(-1);
   519     int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
   520     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
   521         errorCode=U_ZERO_ERROR;
   522         uni.releaseBuffer(0);
   523         buffer=uni.getBuffer(length);
   524         length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
   525     }
   526     uni.releaseBuffer(length);
   527     if(U_FAILURE(errorCode)) {
   528         fprintf(stderr,
   529                 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
   530                 s, (long)lineNumber);
   531     }
   532 }
   534 void
   535 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
   536     if(U_FAILURE(errorCode)) { return; }
   537     scx.clear();
   538     CharString scString;
   539     for(;;) {
   540         const char *scs;
   541         const char *scLimit=strchr(s, ' ');
   542         if(scLimit!=NULL) {
   543             scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
   544             if(U_FAILURE(errorCode)) { return; }
   545         } else {
   546             scs=s;
   547         }
   548         int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
   549         if(script==UCHAR_INVALID_CODE) {
   550             fprintf(stderr,
   551                     "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
   552                     scs, (long)lineNumber);
   553             errorCode=U_PARSE_ERROR;
   554             return;
   555         } else if(scx.contains(script)) {
   556             fprintf(stderr,
   557                     "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
   558                     scs, (long)lineNumber);
   559             errorCode=U_PARSE_ERROR;
   560             return;
   561         } else {
   562             scx.add(script);
   563         }
   564         if(scLimit!=NULL) {
   565             s=scLimit+1;
   566         } else {
   567             break;
   568         }
   569     }
   570     if(scx.isEmpty()) {
   571         fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
   572         errorCode=U_PARSE_ERROR;
   573     }
   574 }
   576 U_NAMESPACE_END

mercurial