michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2011-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: * file name: ppucd.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2011dec11 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uchar.h" michael@0: #include "charstr.h" michael@0: #include "cstring.h" michael@0: #include "ppucd.h" michael@0: #include "uassert.h" michael@0: #include "uparse.h" michael@0: michael@0: #include michael@0: #include michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: PropertyNames::~PropertyNames() {} michael@0: michael@0: int32_t michael@0: PropertyNames::getPropertyEnum(const char *name) const { michael@0: return u_getPropertyEnum(name); michael@0: } michael@0: michael@0: int32_t michael@0: PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const { michael@0: return u_getPropertyValueEnum((UProperty)property, name); michael@0: } michael@0: michael@0: UniProps::UniProps() michael@0: : start(U_SENTINEL), end(U_SENTINEL), michael@0: bmg(U_SENTINEL), bpb(U_SENTINEL), michael@0: scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), michael@0: digitValue(-1), numericValue(NULL), michael@0: name(NULL), nameAlias(NULL) { michael@0: memset(binProps, 0, sizeof(binProps)); michael@0: memset(intProps, 0, sizeof(intProps)); michael@0: memset(age, 0, 4); michael@0: } michael@0: michael@0: UniProps::~UniProps() {} michael@0: michael@0: const int32_t PreparsedUCD::kNumLineBuffers; michael@0: michael@0: 
/* Opens the preparsed UCD input. A NULL, empty, or single-hyphen filename selects stdin; any other name is opened for reading with fopen(). If the open fails, diagnostics go to stderr and errorCode becomes U_FILE_ACCESS_ERROR. Returns immediately if errorCode already indicates a failure. NOTE(review): on both early-return paths file stays NULL and the clearing of ucdVersion and lines[0][0] is skipped. */ PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) michael@0: : icuPnames(new PropertyNames()), pnames(icuPnames), michael@0: file(NULL), michael@0: defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), michael@0: lineNumber(0), michael@0: lineType(NO_LINE), michael@0: fieldLimit(NULL), lineLimit(NULL) { michael@0: if(U_FAILURE(errorCode)) { return; } michael@0: michael@0: if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { michael@0: filename=NULL; michael@0: file=stdin; michael@0: } else { michael@0: file=fopen(filename, "r"); michael@0: } michael@0: if(file==NULL) { michael@0: perror("error opening preparsed UCD"); michael@0: fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\""); michael@0: errorCode=U_FILE_ACCESS_ERROR; michael@0: return; michael@0: } michael@0: michael@0: memset(ucdVersion, 0, 4); michael@0: lines[0][0]=0; michael@0: } michael@0: michael@0: /* Closes file unless it is stdin, then deletes icuPnames. NOTE(review): after a failed construction file is NULL, which differs from stdin, so fclose(NULL) would be reached here; confirm whether a NULL check is needed. */ PreparsedUCD::~PreparsedUCD() { michael@0: if(file!=stdin) { michael@0: fclose(file); michael@0: } michael@0: delete icuPnames; michael@0: } michael@0: michael@0: // Same order as the LineType values. michael@0: /* One keyword string per line type; the first two entries are NULL. */ static const char *lineTypeStrings[]={ michael@0: NULL, michael@0: NULL, michael@0: "ucd", michael@0: "property", michael@0: "binary", michael@0: "value", michael@0: "defaults", michael@0: "block", michael@0: "cp", michael@0: "algnamesrange" michael@0: }; michael@0: michael@0: /* Reads the next input line into an available line buffer and returns its LineType. Returns NO_LINE when errorCode already indicates failure or when fgets() returns NULL. NOTE(review): the tail of this function is truncated in this copy of the file, see the note further down. */ PreparsedUCD::LineType michael@0: PreparsedUCD::readLine(UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { return NO_LINE; } michael@0: // Select the next available line buffer.
michael@0: /* Step lineIndex forward, wrapping to 0 at kNumLineBuffers, until isLineBufferAvailable() accepts it. */ while(!isLineBufferAvailable(lineIndex)) { michael@0: ++lineIndex; michael@0: if (lineIndex == kNumLineBuffers) { michael@0: lineIndex = 0; michael@0: } michael@0: } michael@0: char *line=lines[lineIndex]; michael@0: *line=0; michael@0: lineLimit=fieldLimit=line; michael@0: lineType=NO_LINE; michael@0: /* A NULL result from fgets() returns NO_LINE; only when ferror() is set is U_FILE_ACCESS_ERROR reported. */ char *result=fgets(line, sizeof(lines[0]), file); michael@0: if(result==NULL) { michael@0: if(ferror(file)) { michael@0: perror("error reading preparsed UCD"); michael@0: fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); michael@0: errorCode=U_FILE_ACCESS_ERROR; michael@0: } michael@0: return NO_LINE; michael@0: } michael@0: ++lineNumber; michael@0: /* A line starting with a hash sign is reported as EMPTY_LINE. */ if(*line=='#') { michael@0: fieldLimit=strchr(line, 0); michael@0: return lineType=EMPTY_LINE; michael@0: } michael@0: // Remove trailing /r/n. michael@0: char c; michael@0: char *limit=strchr(line, 0); michael@0: /* NOTE(review): the text is truncated at the next statement. A span between a less-than sign and a later greater-than sign appears to have been dropped during extraction, so the rest of readLine() and the start of the following code are not visible here. What follows is presumably the switch over lineType in the function that returns the parsed properties; confirm against the upstream file. */ while(line=0) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: second line with default properties on line %ld\n", michael@0: (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: return NULL; michael@0: } michael@0: /* The defaults line must cover exactly 0..0x10ffff. */ if(start!=0 || end!=0x10ffff) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n", michael@0: field, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: return NULL; michael@0: } michael@0: props=&defaultProps; michael@0: defaultLineIndex=lineIndex; michael@0: break; michael@0: case BLOCK_LINE: michael@0: blockProps=defaultProps; // Block inherits default properties. michael@0: props=&blockProps; michael@0: blockLineIndex=lineIndex; michael@0: break; michael@0: case CP_LINE: michael@0: if(blockProps.start<=start && end<=blockProps.end) { michael@0: // Code point range fully inside the last block inherits the block properties.
michael@0: cpProps=blockProps; michael@0: /* NOTE(review): the condition below is truncated by the same extraction damage; the remaining branches of this switch are not visible in this copy. */ } else if(start>blockProps.end || endstart=start; michael@0: props->end=end; michael@0: /* Feed every remaining field of the line to parseProperty(); stop with NULL as soon as it returns FALSE. */ while((field=nextField())!=NULL) { michael@0: if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; } michael@0: } michael@0: return props; michael@0: } michael@0: michael@0: /* Names of the ppucd-specific properties (PPUCD_* constants). parseProperty() consults this table when pnames->getPropertyEnum() returns a negative value. */ static const struct { michael@0: const char *name; michael@0: int32_t prop; michael@0: } ppucdProperties[]={ michael@0: { "Name_Alias", PPUCD_NAME_ALIAS }, michael@0: { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS }, michael@0: { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING } michael@0: }; michael@0: michael@0: // Returns TRUE for "ok to continue parsing fields". michael@0: /* Parses one field into props. Field forms: a leading minus sign means binary false (binaryValue 0; an equals sign in the same field is an error); a bare name means binary true (binaryValue 1); name=value sets binaryValue to -1 and leaves v pointing just past the equals sign. On success the property is added to newValues. Names unknown to both pnames and ppucdProperties are ignored and TRUE is returned. */ UBool michael@0: PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, michael@0: UErrorCode &errorCode) { michael@0: CharString pBuffer; michael@0: const char *p=field; michael@0: const char *v=strchr(p, '='); michael@0: int binaryValue; michael@0: if(*p=='-') { michael@0: if(v!=NULL) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: mix of binary-property-no and " michael@0: "enum-property syntax '%s' on line %ld\n", michael@0: field, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: return FALSE; michael@0: } michael@0: binaryValue=0; michael@0: ++p; michael@0: } else if(v==NULL) { michael@0: binaryValue=1; michael@0: } else { michael@0: binaryValue=-1; michael@0: // Copy out the property name rather than modifying the field (writing a NUL). michael@0: pBuffer.append(p, (int32_t)(v-p), errorCode); michael@0: p=pBuffer.data(); michael@0: ++v; michael@0: } michael@0: int32_t prop=pnames->getPropertyEnum(p); michael@0: if(prop<0) { michael@0: for(int32_t i=0;; ++i) { michael@0: if(i==LENGTHOF(ppucdProperties)) { michael@0: // Ignore unknown property names.
michael@0: return TRUE; michael@0: } michael@0: if(0==uprv_stricmp(p, ppucdProperties[i].name)) { michael@0: prop=ppucdProperties[i].prop; michael@0: U_ASSERT(prop>=0); michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: /* NOTE(review): the condition of the next if statement is truncated by extraction damage; judging by the branches that follow, it presumably distinguishes binary properties and then checks binaryValue, to be confirmed against the upstream file. */ if(prop=0) { michael@0: props.binProps[prop]=(UBool)binaryValue; michael@0: } else { michael@0: // No binary value for a binary property. michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: enum-property syntax '%s' " michael@0: "for binary property on line %ld\n", michael@0: field, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: } michael@0: } else if(binaryValue>=0) { michael@0: // Binary value for a non-binary property. michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: binary-property syntax '%s' " michael@0: "for non-binary property on line %ld\n", michael@0: field, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: } else if (prop < UCHAR_INT_START) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", michael@0: prop, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: /* NOTE(review): the next condition and the statement after it are fused by extraction damage. */ } else if(propgetPropertyValueEnum(prop, v); michael@0: if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { michael@0: // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. michael@0: char *end; michael@0: unsigned long ccc=uprv_strtoul(v, &end, 10); michael@0: /* NOTE(review): truncated again; the rest of the integer-property handling and the start of the next branch are missing from this copy. */ if(v, just set null values.
michael@0: /* Reset the property to its null value: U_SENTINEL for the single-code point fields, remove() for the string mappings, clear() for scx. Any other property here is a parse error. */ switch(prop) { michael@0: case UCHAR_BIDI_MIRRORING_GLYPH: michael@0: props.bmg=U_SENTINEL; michael@0: break; michael@0: case UCHAR_BIDI_PAIRED_BRACKET: michael@0: props.bpb=U_SENTINEL; michael@0: break; michael@0: case UCHAR_SIMPLE_CASE_FOLDING: michael@0: props.scf=U_SENTINEL; michael@0: break; michael@0: case UCHAR_SIMPLE_LOWERCASE_MAPPING: michael@0: props.slc=U_SENTINEL; michael@0: break; michael@0: case UCHAR_SIMPLE_TITLECASE_MAPPING: michael@0: props.stc=U_SENTINEL; michael@0: break; michael@0: case UCHAR_SIMPLE_UPPERCASE_MAPPING: michael@0: props.suc=U_SENTINEL; michael@0: break; michael@0: case UCHAR_CASE_FOLDING: michael@0: props.cf.remove(); michael@0: break; michael@0: case UCHAR_LOWERCASE_MAPPING: michael@0: props.lc.remove(); michael@0: break; michael@0: case UCHAR_TITLECASE_MAPPING: michael@0: props.tc.remove(); michael@0: break; michael@0: case UCHAR_UPPERCASE_MAPPING: michael@0: props.uc.remove(); michael@0: break; michael@0: case UCHAR_SCRIPT_EXTENSIONS: michael@0: props.scx.clear(); michael@0: break; michael@0: default: michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", michael@0: field, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: } michael@0: } else { michael@0: /* Ordinary value string: store or parse it according to the property. numericValue, name and nameAlias keep the pointer v itself rather than a copy. digitValue is set only when the value is exactly one ASCII digit, otherwise -1. */ char c; michael@0: switch(prop) { michael@0: case UCHAR_NUMERIC_VALUE: michael@0: props.numericValue=v; michael@0: c=*v; michael@0: if('0'<=c && c<='9' && v[1]==0) { michael@0: props.digitValue=c-'0'; michael@0: } else { michael@0: props.digitValue=-1; michael@0: } michael@0: break; michael@0: case UCHAR_NAME: michael@0: props.name=v; michael@0: break; michael@0: case UCHAR_AGE: michael@0: u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric.
michael@0: break; michael@0: /* Single-code point values are parsed as hexadecimal by parseCodePoint(); string mappings go through parseString(). */ case UCHAR_BIDI_MIRRORING_GLYPH: michael@0: props.bmg=parseCodePoint(v, errorCode); michael@0: break; michael@0: case UCHAR_BIDI_PAIRED_BRACKET: michael@0: props.bpb=parseCodePoint(v, errorCode); michael@0: break; michael@0: case UCHAR_SIMPLE_CASE_FOLDING: michael@0: props.scf=parseCodePoint(v, errorCode); michael@0: break; michael@0: case UCHAR_SIMPLE_LOWERCASE_MAPPING: michael@0: props.slc=parseCodePoint(v, errorCode); michael@0: break; michael@0: case UCHAR_SIMPLE_TITLECASE_MAPPING: michael@0: props.stc=parseCodePoint(v, errorCode); michael@0: break; michael@0: case UCHAR_SIMPLE_UPPERCASE_MAPPING: michael@0: props.suc=parseCodePoint(v, errorCode); michael@0: break; michael@0: case UCHAR_CASE_FOLDING: michael@0: parseString(v, props.cf, errorCode); michael@0: break; michael@0: case UCHAR_LOWERCASE_MAPPING: michael@0: parseString(v, props.lc, errorCode); michael@0: break; michael@0: case UCHAR_TITLECASE_MAPPING: michael@0: parseString(v, props.tc, errorCode); michael@0: break; michael@0: case UCHAR_UPPERCASE_MAPPING: michael@0: parseString(v, props.uc, errorCode); michael@0: break; michael@0: case PPUCD_NAME_ALIAS: michael@0: props.nameAlias=v; michael@0: break; michael@0: case PPUCD_CONDITIONAL_CASE_MAPPINGS: michael@0: case PPUCD_TURKIC_CASE_FOLDING: michael@0: // No need to parse their values: They are hardcoded in the runtime library. michael@0: break; michael@0: case UCHAR_SCRIPT_EXTENSIONS: michael@0: parseScriptExtensions(v, props.scx, errorCode); michael@0: break; michael@0: default: michael@0: // Ignore unhandled properties.
michael@0: return TRUE; michael@0: } michael@0: } michael@0: /* Record the property in newValues only when no error was set; a failure makes the caller stop parsing fields. */ if(U_SUCCESS(errorCode)) { michael@0: newValues.add((UChar32)prop); michael@0: return TRUE; michael@0: } else { michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: /* Extracts the code point range from the current line. Requires lineType to be ALG_NAMES_RANGE_LINE, otherwise sets U_ILLEGAL_ARGUMENT_ERROR. The range is taken from the second field; a missing field sets U_PARSE_ERROR. Returns TRUE on success with start and end filled in by parseCodePointRange(). */ UBool michael@0: PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { return FALSE; } michael@0: if(lineType!=ALG_NAMES_RANGE_LINE) { michael@0: errorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return FALSE; michael@0: } michael@0: firstField(); michael@0: const char *field=nextField(); michael@0: if(field==NULL) { michael@0: // No range field after the type. michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: missing algnamesrange range field " michael@0: "(no second field) on line %ld\n", michael@0: (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: return FALSE; michael@0: } michael@0: return parseCodePointRange(field, start, end, errorCode); michael@0: } michael@0: michael@0: /* Parses s as one hexadecimal code point. At least one digit must be consumed, nothing may follow it, and the value must be below 0x110000; otherwise an error is printed, errorCode is set to U_PARSE_ERROR and U_SENTINEL is returned. NOTE(review): the unsigned long result is narrowed to uint32_t before the range check, so where unsigned long is wider than 32 bits an oversized input could wrap into the valid range; confirm. */ UChar32 michael@0: PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { michael@0: char *end; michael@0: uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); michael@0: if(end<=s || *end!=0 || value>=0x110000) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", michael@0: s, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: return U_SENTINEL; michael@0: } michael@0: return (UChar32)value; michael@0: } michael@0: michael@0: /* Parses s as a code point range by delegating to u_parseCodePointRange(). On failure prints a diagnostic and returns FALSE, leaving start and end untouched; on success stores the two values and returns TRUE. */ UBool michael@0: PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { michael@0: uint32_t st, e; michael@0: u_parseCodePointRange(s, &st, &e, &errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", michael@0: s, (long)lineNumber); michael@0: return FALSE; michael@0: } michael@0: 
start=(UChar32)st; michael@0: end=(UChar32)e; michael@0: return TRUE; michael@0: } michael@0: michael@0: /* Parses s into uni with u_parseString(), writing directly into the buffer obtained from uni.getBuffer(). If the first attempt reports U_BUFFER_OVERFLOW_ERROR, the error is reset and the parse is repeated with a buffer requested for the reported length. A remaining failure is reported on stderr; errorCode is left set for the caller. */ void michael@0: PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { michael@0: UChar *buffer=uni.getBuffer(-1); michael@0: int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); michael@0: if(errorCode==U_BUFFER_OVERFLOW_ERROR) { michael@0: errorCode=U_ZERO_ERROR; michael@0: uni.releaseBuffer(0); michael@0: buffer=uni.getBuffer(length); michael@0: length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); michael@0: } michael@0: uni.releaseBuffer(length); michael@0: if(U_FAILURE(errorCode)) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", michael@0: s, (long)lineNumber); michael@0: } michael@0: } michael@0: michael@0: /* Parses a space-separated list of script names from s into scx, which is cleared first. Each name is looked up as a UCHAR_SCRIPT value through pnames; an unknown name or a repeated script sets U_PARSE_ERROR and returns. Every name except the last is copied into scString so that it is NUL-terminated for the lookup. */ void michael@0: PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { return; } michael@0: scx.clear(); michael@0: CharString scString; michael@0: for(;;) { michael@0: const char *scs; michael@0: const char *scLimit=strchr(s, ' '); michael@0: if(scLimit!=NULL) { michael@0: scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data(); michael@0: if(U_FAILURE(errorCode)) { return; } michael@0: } else { michael@0: scs=s; michael@0: } michael@0: int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); michael@0: if(script==UCHAR_INVALID_CODE) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", michael@0: scs, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: return; michael@0: } else if(scx.contains(script)) { michael@0: fprintf(stderr, michael@0: "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", michael@0: scs, (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: return; michael@0: } else { michael@0: scx.add(script); 
michael@0: } michael@0: /* Continue after the space, or stop after the last name. */ if(scLimit!=NULL) { michael@0: s=scLimit+1; michael@0: } else { michael@0: break; michael@0: } michael@0: } michael@0: /* NOTE(review): the loop above only exits through break after an add() in the same iteration, so this emptiness check looks unreachable; confirm before relying on it. */ if(scx.isEmpty()) { michael@0: fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); michael@0: errorCode=U_PARSE_ERROR; michael@0: } michael@0: } michael@0: michael@0: U_NAMESPACE_END