1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/gennorm2/n2builder.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1231 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2009-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: n2builder.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2009nov25 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* Builds Normalizer2 data and writes a binary .nrm file. 1.20 +* For the file format see source/common/normalizer2impl.h. 1.21 +*/ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 +#include "n2builder.h" 1.25 + 1.26 +#include <stdio.h> 1.27 +#include <stdlib.h> 1.28 +#include <string.h> 1.29 +#if U_HAVE_STD_STRING 1.30 +#include <vector> 1.31 +#endif 1.32 +#include "unicode/errorcode.h" 1.33 +#include "unicode/localpointer.h" 1.34 +#include "unicode/putil.h" 1.35 +#include "unicode/udata.h" 1.36 +#include "unicode/uniset.h" 1.37 +#include "unicode/unistr.h" 1.38 +#include "unicode/ustring.h" 1.39 +#include "hash.h" 1.40 +#include "normalizer2impl.h" 1.41 +#include "toolutil.h" 1.42 +#include "unewdata.h" 1.43 +#include "utrie2.h" 1.44 +#include "uvectr32.h" 1.45 + 1.46 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.47 + 1.48 +#if !UCONFIG_NO_NORMALIZATION 1.49 + 1.50 +/* UDataInfo cf. udata.h */ 1.51 +static UDataInfo dataInfo={ 1.52 + sizeof(UDataInfo), 1.53 + 0, 1.54 + 1.55 + U_IS_BIG_ENDIAN, 1.56 + U_CHARSET_FAMILY, 1.57 + U_SIZEOF_UCHAR, 1.58 + 0, 1.59 + 1.60 + { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 1.61 + { 2, 0, 0, 0 }, /* formatVersion */ 1.62 + { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ 1.63 +}; 1.64 + 1.65 +U_NAMESPACE_BEGIN 1.66 + 1.67 +class HangulIterator { 1.68 +public: 1.69 + struct Range { 1.70 + UChar32 start, limit; 1.71 + uint16_t norm16; 1.72 + }; 1.73 + 1.74 + HangulIterator() : rangeIndex(0) {} 1.75 + const Range *nextRange() { 1.76 + if(rangeIndex<LENGTHOF(ranges)) { 1.77 + return ranges+rangeIndex++; 1.78 + } else { 1.79 + return NULL; 1.80 + } 1.81 + } 1.82 + void reset() { rangeIndex=0; } 1.83 +private: 1.84 + static const Range ranges[4]; 1.85 + int32_t rangeIndex; 1.86 +}; 1.87 + 1.88 +const HangulIterator::Range HangulIterator::ranges[4]={ 1.89 + { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 }, 1.90 + { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT }, 1.91 + // JAMO_T_BASE+1: not U+11A7 1.92 + { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT }, 1.93 + { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo 1.94 +}; 1.95 + 1.96 +struct CompositionPair { 1.97 + CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 1.98 + UChar32 trail, composite; 1.99 +}; 1.100 + 1.101 +struct Norm { 1.102 + enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; 1.103 + 1.104 + UBool hasMapping() const { return mappingType>REMOVED; } 1.105 + 1.106 + // Requires hasMapping() and well-formed mapping. 1.107 + void setMappingCP() { 1.108 + UChar32 c; 1.109 + if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 1.110 + mappingCP=c; 1.111 + } else { 1.112 + mappingCP=U_SENTINEL; 1.113 + } 1.114 + } 1.115 + 1.116 + const CompositionPair *getCompositionPairs(int32_t &length) const { 1.117 + if(compositions==NULL) { 1.118 + length=0; 1.119 + return NULL; 1.120 + } else { 1.121 + length=compositions->size()/2; 1.122 + return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); 1.123 + } 1.124 + } 1.125 + 1.126 + UnicodeString *mapping; 1.127 + UnicodeString *rawMapping; // non-NULL if the mapping is further decomposed 1.128 + UChar32 mappingCP; // >=0 if mapping to 1 code point 1.129 + int32_t mappingPhase; 1.130 + MappingType mappingType; 1.131 + 1.132 + UVector32 *compositions; // (trail, composite) pairs 1.133 + uint8_t cc; 1.134 + UBool combinesBack; 1.135 + UBool hasNoCompBoundaryAfter; 1.136 + 1.137 + enum OffsetType { 1.138 + OFFSET_NONE, 1.139 + // Composition for back-combining character. Allowed, but not normally used. 1.140 + OFFSET_MAYBE_YES, 1.141 + // Composition for a starter that does not have a decomposition mapping. 1.142 + OFFSET_YES_YES, 1.143 + // Round-trip mapping & composition for a starter. 1.144 + OFFSET_YES_NO_MAPPING_AND_COMPOSITION, 1.145 + // Round-trip mapping for a starter that itself does not combine-forward. 1.146 + OFFSET_YES_NO_MAPPING_ONLY, 1.147 + // One-way mapping. 1.148 + OFFSET_NO_NO, 1.149 + // Delta for an algorithmic one-way mapping. 1.150 + OFFSET_DELTA 1.151 + }; 1.152 + enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 }; 1.153 + int32_t offset; 1.154 +}; 1.155 + 1.156 +class Normalizer2DBEnumerator { 1.157 +public: 1.158 + Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {} 1.159 + virtual ~Normalizer2DBEnumerator() {} 1.160 + virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0; 1.161 + Normalizer2DBEnumerator *ptr() { return this; } 1.162 +protected: 1.163 + Normalizer2DataBuilder &builder; 1.164 +}; 1.165 + 1.166 +U_CDECL_BEGIN 1.167 + 1.168 +static UBool U_CALLCONV 1.169 +enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1.170 + return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value); 1.171 +} 1.172 + 1.173 +U_CDECL_END 1.174 + 1.175 +Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 1.176 + phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) { 1.177 + memset(unicodeVersion, 0, sizeof(unicodeVersion)); 1.178 + normTrie=utrie2_open(0, 0, &errorCode); 1.179 + normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); 1.180 + norms=allocNorm(); // unused Norm struct at index 0 1.181 + memset(indexes, 0, sizeof(indexes)); 1.182 + memset(smallFCD, 0, sizeof(smallFCD)); 1.183 +} 1.184 + 1.185 +Normalizer2DataBuilder::~Normalizer2DataBuilder() { 1.186 + utrie2_close(normTrie); 1.187 + int32_t normsLength=utm_countItems(normMem); 1.188 + for(int32_t i=1; i<normsLength; ++i) { 1.189 + delete norms[i].mapping; 1.190 + delete norms[i].rawMapping; 1.191 + delete norms[i].compositions; 1.192 + } 1.193 + utm_close(normMem); 1.194 + utrie2_close(norm16Trie); 1.195 +} 1.196 + 1.197 +void 1.198 +Normalizer2DataBuilder::setUnicodeVersion(const char *v) { 1.199 + UVersionInfo nullVersion={ 0, 0, 0, 0 }; 1.200 + UVersionInfo version; 1.201 + u_versionFromString(version, v); 1.202 + if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && 1.203 + 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) 1.204 + ) { 1.205 + char buffer[U_MAX_VERSION_STRING_LENGTH]; 1.206 + u_versionToString(unicodeVersion, buffer); 1.207 + fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n", 1.208 + buffer, v); 1.209 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.210 + } 1.211 + memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); 1.212 +} 1.213 + 1.214 +Norm *Normalizer2DataBuilder::allocNorm() { 1.215 + Norm *p=(Norm *)utm_alloc(normMem); 1.216 + norms=(Norm *)utm_getStart(normMem); // in case it got reallocated 1.217 + return p; 1.218 +} 1.219 + 1.220 +/* get an existing Norm unit */ 1.221 +Norm *Normalizer2DataBuilder::getNorm(UChar32 c) { 1.222 + uint32_t i=utrie2_get32(normTrie, c); 1.223 + if(i==0) { 1.224 + return NULL; 1.225 + } 1.226 + return norms+i; 1.227 +} 1.228 + 1.229 +const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const { 1.230 + return norms[utrie2_get32(normTrie, c)]; 1.231 +} 1.232 + 1.233 +/* 1.234 + * get or create a Norm unit; 1.235 + * get or create the intermediate trie entries for it as well 1.236 + */ 1.237 +Norm *Normalizer2DataBuilder::createNorm(UChar32 c) { 1.238 + uint32_t i=utrie2_get32(normTrie, c); 1.239 + if(i!=0) { 1.240 + return norms+i; 1.241 + } else { 1.242 + /* allocate Norm */ 1.243 + Norm *p=allocNorm(); 1.244 + IcuToolErrorCode errorCode("gennorm2/createNorm()"); 1.245 + utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); 1.246 + return p; 1.247 + } 1.248 +} 1.249 + 1.250 +Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 1.251 + if(p!=NULL) { 1.252 + if(p->mappingType!=Norm::NONE) { 1.253 + if( overrideHandling==OVERRIDE_NONE || 1.254 + (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 1.255 + ) { 1.256 + fprintf(stderr, 1.257 + "error in gennorm2 phase %d: " 1.258 + "not permitted to override mapping for U+%04lX from phase %d\n", 1.259 + (int)phase, (long)c, (int)p->mappingPhase); 1.260 + exit(U_INVALID_FORMAT_ERROR); 1.261 + } 1.262 + delete p->mapping; 1.263 + p->mapping=NULL; 1.264 + } 1.265 + p->mappingPhase=phase; 1.266 + } 1.267 + return p; 1.268 +} 1.269 + 1.270 +void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 1.271 + overrideHandling=oh; 1.272 + ++phase; 1.273 +} 1.274 + 1.275 +void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 1.276 + createNorm(c)->cc=cc; 1.277 +} 1.278 + 1.279 +uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const { 1.280 + return getNormRef(c).cc; 1.281 +} 1.282 + 1.283 +static UBool isWellFormed(const UnicodeString &s) { 1.284 + UErrorCode errorCode=U_ZERO_ERROR; 1.285 + u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); 1.286 + return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 1.287 +} 1.288 + 1.289 +void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 1.290 + if(!isWellFormed(m)) { 1.291 + fprintf(stderr, 1.292 + "error in gennorm2 phase %d: " 1.293 + "illegal one-way mapping from U+%04lX to malformed string\n", 1.294 + (int)phase, (long)c); 1.295 + exit(U_INVALID_FORMAT_ERROR); 1.296 + } 1.297 + Norm *p=checkNormForMapping(createNorm(c), c); 1.298 + p->mapping=new UnicodeString(m); 1.299 + p->mappingType=Norm::ONE_WAY; 1.300 + p->setMappingCP(); 1.301 +} 1.302 + 1.303 +void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 1.304 + if(U_IS_SURROGATE(c)) { 1.305 + fprintf(stderr, 1.306 + "error in gennorm2 phase %d: " 1.307 + "illegal round-trip mapping from surrogate code point U+%04lX\n", 1.308 + (int)phase, (long)c); 1.309 + exit(U_INVALID_FORMAT_ERROR); 1.310 + } 1.311 + if(!isWellFormed(m)) { 1.312 + fprintf(stderr, 1.313 + "error in gennorm2 phase %d: " 1.314 + "illegal round-trip mapping from U+%04lX to malformed string\n", 1.315 + (int)phase, (long)c); 1.316 + exit(U_INVALID_FORMAT_ERROR); 1.317 + } 1.318 + int32_t numCP=u_countChar32(m.getBuffer(), m.length()); 1.319 + if(numCP!=2) { 1.320 + fprintf(stderr, 1.321 + "error in gennorm2 phase %d: " 1.322 + "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 1.323 + (int)phase, (long)c, (int)numCP); 1.324 + exit(U_INVALID_FORMAT_ERROR); 1.325 + } 1.326 + Norm *p=checkNormForMapping(createNorm(c), c); 1.327 + p->mapping=new UnicodeString(m); 1.328 + p->mappingType=Norm::ROUND_TRIP; 1.329 + p->mappingCP=U_SENTINEL; 1.330 +} 1.331 + 1.332 +void Normalizer2DataBuilder::removeMapping(UChar32 c) { 1.333 + Norm *p=checkNormForMapping(getNorm(c), c); 1.334 + if(p!=NULL) { 1.335 + p->mappingType=Norm::REMOVED; 1.336 + } 1.337 +} 1.338 + 1.339 +class CompositionBuilder : public Normalizer2DBEnumerator { 1.340 +public: 1.341 + CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 1.342 + virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1.343 + builder.addComposition(start, end, value); 1.344 + return TRUE; 1.345 + } 1.346 +}; 1.347 + 1.348 +void 1.349 +Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) { 1.350 + if(norms[value].mappingType==Norm::ROUND_TRIP) { 1.351 + if(start!=end) { 1.352 + fprintf(stderr, 1.353 + "gennorm2 error: same round-trip mapping for " 1.354 + "more than 1 code point U+%04lX..U+%04lX\n", 1.355 + (long)start, (long)end); 1.356 + exit(U_INVALID_FORMAT_ERROR); 1.357 + } 1.358 + if(norms[value].cc!=0) { 1.359 + fprintf(stderr, 1.360 + "gennorm2 error: " 1.361 + "U+%04lX has a round-trip mapping and ccc!=0, " 1.362 + "not possible in Unicode normalization\n", 1.363 + (long)start); 1.364 + exit(U_INVALID_FORMAT_ERROR); 1.365 + } 1.366 + // setRoundTripMapping() ensured that there are exactly two code points. 1.367 + const UnicodeString &m=*norms[value].mapping; 1.368 + UChar32 lead=m.char32At(0); 1.369 + UChar32 trail=m.char32At(m.length()-1); 1.370 + if(getCC(lead)!=0) { 1.371 + fprintf(stderr, 1.372 + "gennorm2 error: " 1.373 + "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 1.374 + "not possible in Unicode normalization\n", 1.375 + (long)start, (long)lead); 1.376 + exit(U_INVALID_FORMAT_ERROR); 1.377 + } 1.378 + // Flag for trailing character. 1.379 + createNorm(trail)->combinesBack=TRUE; 1.380 + // Insert (trail, composite) pair into compositions list for the lead character. 1.381 + IcuToolErrorCode errorCode("gennorm2/addComposition()"); 1.382 + Norm *leadNorm=createNorm(lead); 1.383 + UVector32 *compositions=leadNorm->compositions; 1.384 + int32_t i; 1.385 + if(compositions==NULL) { 1.386 + compositions=leadNorm->compositions=new UVector32(errorCode); 1.387 + i=0; // "insert" the first pair at index 0 1.388 + } else { 1.389 + // Insertion sort, and check for duplicate trail characters. 1.390 + int32_t length; 1.391 + const CompositionPair *pairs=leadNorm->getCompositionPairs(length); 1.392 + for(i=0; i<length; ++i) { 1.393 + if(trail==pairs[i].trail) { 1.394 + fprintf(stderr, 1.395 + "gennorm2 error: same round-trip mapping for " 1.396 + "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 1.397 + (long)start, (long)lead, (long)trail); 1.398 + exit(U_INVALID_FORMAT_ERROR); 1.399 + } 1.400 + if(trail<pairs[i].trail) { 1.401 + break; 1.402 + } 1.403 + } 1.404 + } 1.405 + compositions->insertElementAt(trail, 2*i, errorCode); 1.406 + compositions->insertElementAt(start, 2*i+1, errorCode); 1.407 + } 1.408 +} 1.409 + 1.410 +UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, 1.411 + uint8_t lowCC, uint8_t highCC) const { 1.412 + if((highCC-lowCC)>=2) { 1.413 + int32_t length; 1.414 + const CompositionPair *pairs=norm.getCompositionPairs(length); 1.415 + for(int32_t i=0; i<length; ++i) { 1.416 + uint8_t trailCC=getCC(pairs[i].trail); 1.417 + if(lowCC<trailCC && trailCC<highCC) { 1.418 + return TRUE; 1.419 + } 1.420 + } 1.421 + } 1.422 + return FALSE; 1.423 +} 1.424 + 1.425 +UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const { 1.426 + int32_t length; 1.427 + const CompositionPair *pairs=norm.getCompositionPairs(length); 1.428 + for(int32_t i=0; i<length; ++i) { 1.429 + if(trail==pairs[i].trail) { 1.430 + return pairs[i].composite; 1.431 + } 1.432 + if(trail<pairs[i].trail) { 1.433 + break; 1.434 + } 1.435 + } 1.436 + return U_SENTINEL; 1.437 +} 1.438 + 1.439 +class Decomposer : public Normalizer2DBEnumerator { 1.440 +public: 1.441 + Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {} 1.442 + virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1.443 + didDecompose|=builder.decompose(start, end, value); 1.444 + return TRUE; 1.445 + } 1.446 + UBool didDecompose; 1.447 +}; 1.448 + 1.449 +UBool 1.450 +Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) { 1.451 + if(norms[value].hasMapping()) { 1.452 + Norm &norm=norms[value]; 1.453 + const UnicodeString &m=*norm.mapping; 1.454 + UnicodeString *decomposed=NULL; 1.455 + const UChar *s=m.getBuffer(); 1.456 + int32_t length=m.length(); 1.457 + int32_t prev, i=0; 1.458 + UChar32 c; 1.459 + while(i<length) { 1.460 + prev=i; 1.461 + U16_NEXT(s, i, length, c); 1.462 + if(start<=c && c<=end) { 1.463 + fprintf(stderr, 1.464 + "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", 1.465 + (long)c); 1.466 + exit(U_INVALID_FORMAT_ERROR); 1.467 + } 1.468 + const Norm &cNorm=getNormRef(c); 1.469 + if(cNorm.hasMapping()) { 1.470 + if(norm.mappingType==Norm::ROUND_TRIP) { 1.471 + if(prev==0) { 1.472 + if(cNorm.mappingType!=Norm::ROUND_TRIP) { 1.473 + fprintf(stderr, 1.474 + "gennorm2 error: " 1.475 + "U+%04lX's round-trip mapping's starter " 1.476 + "U+%04lX one-way-decomposes, " 1.477 + "not possible in Unicode normalization\n", 1.478 + (long)start, (long)c); 1.479 + exit(U_INVALID_FORMAT_ERROR); 1.480 + } 1.481 + uint8_t myTrailCC=getCC(m.char32At(i)); 1.482 + UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); 1.483 + uint8_t cTrailCC=getCC(cTrailChar); 1.484 + if(cTrailCC>myTrailCC) { 1.485 + fprintf(stderr, 1.486 + "gennorm2 error: " 1.487 + "U+%04lX's round-trip mapping's starter " 1.488 + "U+%04lX decomposes and the " 1.489 + "inner/earlier tccc=%hu > outer/following tccc=%hu, " 1.490 + "not possible in Unicode normalization\n", 1.491 + (long)start, (long)c, 1.492 + (short)cTrailCC, (short)myTrailCC); 1.493 + exit(U_INVALID_FORMAT_ERROR); 1.494 + } 1.495 + } else { 1.496 + fprintf(stderr, 1.497 + "gennorm2 error: " 1.498 + "U+%04lX's round-trip mapping's non-starter " 1.499 + "U+%04lX decomposes, " 1.500 + "not possible in Unicode normalization\n", 1.501 + (long)start, (long)c); 1.502 + exit(U_INVALID_FORMAT_ERROR); 1.503 + } 1.504 + } 1.505 + if(decomposed==NULL) { 1.506 + decomposed=new UnicodeString(m, 0, prev); 1.507 + } 1.508 + decomposed->append(*cNorm.mapping); 1.509 + } else if(Hangul::isHangul(c)) { 1.510 + UChar buffer[3]; 1.511 + int32_t hangulLength=Hangul::decompose(c, buffer); 1.512 + if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { 1.513 + fprintf(stderr, 1.514 + "gennorm2 error: " 1.515 + "U+%04lX's round-trip mapping's non-starter " 1.516 + "U+%04lX decomposes, " 1.517 + "not possible in Unicode normalization\n", 1.518 + (long)start, (long)c); 1.519 + exit(U_INVALID_FORMAT_ERROR); 1.520 + } 1.521 + if(decomposed==NULL) { 1.522 + decomposed=new UnicodeString(m, 0, prev); 1.523 + } 1.524 + decomposed->append(buffer, hangulLength); 1.525 + } else if(decomposed!=NULL) { 1.526 + decomposed->append(m, prev, i-prev); 1.527 + } 1.528 + } 1.529 + if(decomposed!=NULL) { 1.530 + if(norm.rawMapping==NULL) { 1.531 + // Remember the original mapping when decomposing recursively. 1.532 + norm.rawMapping=norm.mapping; 1.533 + } else { 1.534 + delete norm.mapping; 1.535 + } 1.536 + norm.mapping=decomposed; 1.537 + // Not norm.setMappingCP(); because the original mapping 1.538 + // is most likely to be encodable as a delta. 1.539 + return TRUE; 1.540 + } 1.541 + } 1.542 + return FALSE; 1.543 +} 1.544 + 1.545 +class BuilderReorderingBuffer { 1.546 +public: 1.547 + BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {} 1.548 + void reset() { 1.549 + fLength=0; 1.550 + fLastStarterIndex=-1; 1.551 + fDidReorder=FALSE; 1.552 + } 1.553 + int32_t length() const { return fLength; } 1.554 + UBool isEmpty() const { return fLength==0; } 1.555 + int32_t lastStarterIndex() const { return fLastStarterIndex; } 1.556 + UChar32 charAt(int32_t i) const { return fArray[i]>>8; } 1.557 + uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } 1.558 + UBool didReorder() const { return fDidReorder; } 1.559 + void append(UChar32 c, uint8_t cc) { 1.560 + if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { 1.561 + if(cc==0) { 1.562 + fLastStarterIndex=fLength; 1.563 + } 1.564 + fArray[fLength++]=(c<<8)|cc; 1.565 + return; 1.566 + } 1.567 + // Let this character bubble back to its canonical order. 1.568 + int32_t i=fLength-1; 1.569 + while(i>fLastStarterIndex && ccAt(i)>cc) { 1.570 + --i; 1.571 + } 1.572 + ++i; // after the last starter or prevCC<=cc 1.573 + // Move this and the following characters forward one to make space. 1.574 + for(int32_t j=fLength; i<j; --j) { 1.575 + fArray[j]=fArray[j-1]; 1.576 + } 1.577 + fArray[i]=(c<<8)|cc; 1.578 + ++fLength; 1.579 + fDidReorder=TRUE; 1.580 + } 1.581 + void toString(UnicodeString &dest) { 1.582 + dest.remove(); 1.583 + for(int32_t i=0; i<fLength; ++i) { 1.584 + dest.append(charAt(i)); 1.585 + } 1.586 + } 1.587 + void setComposite(UChar32 composite, int32_t combMarkIndex) { 1.588 + fArray[fLastStarterIndex]=composite<<8; 1.589 + // Remove the combining mark that contributed to the composite. 1.590 + --fLength; 1.591 + while(combMarkIndex<fLength) { 1.592 + fArray[combMarkIndex]=fArray[combMarkIndex+1]; 1.593 + ++combMarkIndex; 1.594 + } 1.595 + } 1.596 +private: 1.597 + int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 1.598 + int32_t fLength; 1.599 + int32_t fLastStarterIndex; 1.600 + UBool fDidReorder; 1.601 +}; 1.602 + 1.603 +void 1.604 +Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) { 1.605 + UnicodeString &m=*p->mapping; 1.606 + int32_t length=m.length(); 1.607 + if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 1.608 + return; // writeMapping() will complain about it and print the code point. 1.609 + } 1.610 + const UChar *s=m.getBuffer(); 1.611 + int32_t i=0; 1.612 + UChar32 c; 1.613 + while(i<length) { 1.614 + U16_NEXT(s, i, length, c); 1.615 + buffer.append(c, getCC(c)); 1.616 + } 1.617 + if(buffer.didReorder()) { 1.618 + buffer.toString(m); 1.619 + } 1.620 +} 1.621 + 1.622 +/* 1.623 + * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter(). 1.624 + * A starter character with a mapping does not have a composition boundary after it 1.625 + * if the character itself combines-forward (which is tested by the caller of this function), 1.626 + * or it is deleted (mapped to the empty string), 1.627 + * or its mapping contains no starter, 1.628 + * or the last starter combines-forward. 1.629 + */ 1.630 +UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) { 1.631 + if(buffer.isEmpty()) { 1.632 + return TRUE; // maps-to-empty-string is no boundary of any kind 1.633 + } 1.634 + int32_t lastStarterIndex=buffer.lastStarterIndex(); 1.635 + if(lastStarterIndex<0) { 1.636 + return TRUE; // no starter 1.637 + } 1.638 + UChar32 starter=buffer.charAt(lastStarterIndex); 1.639 + if( Hangul::isJamoL(starter) || 1.640 + (Hangul::isJamoV(starter) && 1.641 + 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1))) 1.642 + ) { 1.643 + // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 1.644 + // otherwise it is blocked. 1.645 + return lastStarterIndex==buffer.length()-1; 1.646 + } 1.647 + // Note: There can be no Hangul syllable in the fully decomposed mapping. 1.648 + const Norm *starterNorm=&getNormRef(starter); 1.649 + if(starterNorm->compositions==NULL) { 1.650 + return FALSE; // the last starter does not combine forward 1.651 + } 1.652 + // Compose as far as possible, and see if further compositions are possible. 1.653 + uint8_t prevCC=0; 1.654 + for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) { 1.655 + uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter 1.656 + if(combinesWithCCBetween(*starterNorm, prevCC, cc)) { 1.657 + return TRUE; 1.658 + } 1.659 + if( prevCC<cc && 1.660 + (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0 1.661 + ) { 1.662 + buffer.setComposite(starter, combMarkIndex); 1.663 + starterNorm=&getNormRef(starter); 1.664 + if(starterNorm->compositions==NULL) { 1.665 + return FALSE; // the composite does not combine further 1.666 + } 1.667 + } else { 1.668 + prevCC=cc; 1.669 + ++combMarkIndex; 1.670 + } 1.671 + } 1.672 + // TRUE if the final, forward-combining starter is at the end. 1.673 + return prevCC==0; 1.674 +} 1.675 + 1.676 +// Requires p->hasMapping(). 1.677 +// Returns the offset of the "first unit" from the beginning of the extraData for c. 1.678 +// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word. 1.679 +int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) { 1.680 + UnicodeString &m=*p->mapping; 1.681 + int32_t length=m.length(); 1.682 + if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 1.683 + fprintf(stderr, 1.684 + "gennorm2 error: " 1.685 + "mapping for U+%04lX longer than maximum of %d\n", 1.686 + (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 1.687 + exit(U_INVALID_FORMAT_ERROR); 1.688 + } 1.689 + int32_t leadCC, trailCC; 1.690 + if(length==0) { 1.691 + leadCC=trailCC=0; 1.692 + } else { 1.693 + leadCC=getCC(m.char32At(0)); 1.694 + trailCC=getCC(m.char32At(length-1)); 1.695 + } 1.696 + if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) { 1.697 + fprintf(stderr, 1.698 + "gennorm2 error: " 1.699 + "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n", 1.700 + (long)c); 1.701 + exit(U_INVALID_FORMAT_ERROR); 1.702 + } 1.703 + // Write small-FCD data. 1.704 + if((leadCC|trailCC)!=0) { 1.705 + UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 1.706 + smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 1.707 + } 1.708 + // Write the mapping & raw mapping extraData. 1.709 + int32_t firstUnit=length|(trailCC<<8); 1.710 + int32_t preMappingLength=0; 1.711 + if(p->rawMapping!=NULL) { 1.712 + UnicodeString &rm=*p->rawMapping; 1.713 + int32_t rmLength=rm.length(); 1.714 + if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { 1.715 + fprintf(stderr, 1.716 + "gennorm2 error: " 1.717 + "raw mapping for U+%04lX longer than maximum of %d\n", 1.718 + (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 1.719 + exit(U_INVALID_FORMAT_ERROR); 1.720 + } 1.721 + UChar rm0=rm.charAt(0); 1.722 + if( rmLength==length-1 && 1.723 + // 99: overlong substring lengths get pinned to remainder lengths anyway 1.724 + 0==rm.compare(1, 99, m, 2, 99) && 1.725 + rm0>Normalizer2Impl::MAPPING_LENGTH_MASK 1.726 + ) { 1.727 + // Compression: 1.728 + // rawMapping=rm0+mapping.substring(2) -> store only rm0 1.729 + // 1.730 + // The raw mapping is the same as the final mapping after replacing 1.731 + // the final mapping's first two code units with the raw mapping's first one. 1.732 + // In this case, we store only that first unit, rm0. 1.733 + // This helps with a few hundred mappings. 1.734 + dataString.append(rm0); 1.735 + preMappingLength=1; 1.736 + } else { 1.737 + // Store the raw mapping with its length. 1.738 + dataString.append(rm); 1.739 + dataString.append((UChar)rmLength); 1.740 + preMappingLength=rmLength+1; 1.741 + } 1.742 + firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; 1.743 + } 1.744 + int32_t cccLccc=p->cc|(leadCC<<8); 1.745 + if(cccLccc!=0) { 1.746 + dataString.append((UChar)cccLccc); 1.747 + ++preMappingLength; 1.748 + firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; 1.749 + } 1.750 + if(p->hasNoCompBoundaryAfter) { 1.751 + firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER; 1.752 + } 1.753 + dataString.append((UChar)firstUnit); 1.754 + dataString.append(m); 1.755 + return preMappingLength; 1.756 +} 1.757 + 1.758 +// Requires p->compositions!=NULL. 1.759 +void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) { 1.760 + if(p->cc!=0) { 1.761 + fprintf(stderr, 1.762 + "gennorm2 error: " 1.763 + "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", 1.764 + (long)c); 1.765 + exit(U_INVALID_FORMAT_ERROR); 1.766 + } 1.767 + int32_t length; 1.768 + const CompositionPair *pairs=p->getCompositionPairs(length); 1.769 + for(int32_t i=0; i<length; ++i) { 1.770 + const CompositionPair &pair=pairs[i]; 1.771 + // 22 bits for the composite character and whether it combines forward. 1.772 + UChar32 compositeAndFwd=pair.composite<<1; 1.773 + if(getNormRef(pair.composite).compositions!=NULL) { 1.774 + compositeAndFwd|=1; // The composite character also combines-forward. 1.775 + } 1.776 + // Encode most pairs in two units and some in three. 1.777 + int32_t firstUnit, secondUnit, thirdUnit; 1.778 + if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { 1.779 + if(compositeAndFwd<=0xffff) { 1.780 + firstUnit=pair.trail<<1; 1.781 + secondUnit=compositeAndFwd; 1.782 + thirdUnit=-1; 1.783 + } else { 1.784 + firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; 1.785 + secondUnit=compositeAndFwd>>16; 1.786 + thirdUnit=compositeAndFwd; 1.787 + } 1.788 + } else { 1.789 + firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ 1.790 + (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| 1.791 + Normalizer2Impl::COMP_1_TRIPLE; 1.792 + secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| 1.793 + (compositeAndFwd>>16); 1.794 + thirdUnit=compositeAndFwd; 1.795 + } 1.796 + // Set the high bit of the first unit if this is the last composition pair. 1.797 + if(i==(length-1)) { 1.798 + firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; 1.799 + } 1.800 + dataString.append((UChar)firstUnit).append((UChar)secondUnit); 1.801 + if(thirdUnit>=0) { 1.802 + dataString.append((UChar)thirdUnit); 1.803 + } 1.804 + } 1.805 +} 1.806 + 1.807 +class ExtraDataWriter : public Normalizer2DBEnumerator { 1.808 +public: 1.809 + ExtraDataWriter(Normalizer2DataBuilder &b) : 1.810 + Normalizer2DBEnumerator(b), 1.811 + yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions 1.812 + yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data 1.813 + virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1.814 + if(value!=0) { 1.815 + if(start!=end) { 1.816 + fprintf(stderr, 1.817 + "gennorm2 error: unexpected shared data for " 1.818 + "multiple code points U+%04lX..U+%04lX\n", 1.819 + (long)start, (long)end); 1.820 + exit(U_INTERNAL_PROGRAM_ERROR); 1.821 + } 1.822 + builder.writeExtraData(start, value, *this); 1.823 + } 1.824 + return TRUE; 1.825 + } 1.826 + UnicodeString maybeYesCompositions; 1.827 + UnicodeString yesYesCompositions; 1.828 + UnicodeString yesNoMappingsAndCompositions; 1.829 + UnicodeString yesNoMappingsOnly; 1.830 + UnicodeString noNoMappings; 1.831 + Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode. 1.832 +}; 1.833 + 1.834 +void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) { 1.835 + Norm *p=norms+value; 1.836 + if(!p->hasMapping()) { 1.837 + // Write small-FCD data. 1.838 + // There is similar code in writeMapping() for characters that do have a mapping. 1.839 + if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) { 1.840 + fprintf(stderr, 1.841 + "gennorm2 error: " 1.842 + "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n", 1.843 + (long)c); 1.844 + exit(U_INVALID_FORMAT_ERROR); 1.845 + } 1.846 + if(p->cc!=0) { 1.847 + UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 1.848 + smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 1.849 + } 1.850 + } 1.851 + if(p->combinesBack) { 1.852 + if(p->hasMapping()) { 1.853 + fprintf(stderr, 1.854 + "gennorm2 error: " 1.855 + "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n", 1.856 + (long)c); 1.857 + exit(U_INVALID_FORMAT_ERROR); 1.858 + } 1.859 + if(p->compositions!=NULL) { 1.860 + p->offset= 1.861 + (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)| 1.862 + Norm::OFFSET_MAYBE_YES; 1.863 + writeCompositions(c, p, writer.maybeYesCompositions); 1.864 + } 1.865 + } else if(!p->hasMapping()) { 1.866 + if(p->compositions!=NULL) { 1.867 + p->offset= 1.868 + (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)| 1.869 + Norm::OFFSET_YES_YES; 1.870 + writeCompositions(c, p, writer.yesYesCompositions); 1.871 + } 1.872 + } else if(p->mappingType==Norm::ROUND_TRIP) { 1.873 + if(p->compositions!=NULL) { 1.874 + int32_t offset=writer.yesNoMappingsAndCompositions.length()+ 1.875 + writeMapping(c, p, writer.yesNoMappingsAndCompositions); 1.876 + p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION; 1.877 + writeCompositions(c, p, writer.yesNoMappingsAndCompositions); 1.878 + } else { 1.879 + int32_t offset=writer.yesNoMappingsOnly.length()+ 1.880 + writeMapping(c, p, writer.yesNoMappingsOnly); 1.881 + p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY; 1.882 + } 1.883 + } else /* one-way */ { 1.884 + if(p->compositions!=NULL) { 1.885 + fprintf(stderr, 1.886 + "gennorm2 error: " 1.887 + "U+%04lX combines-forward and has a one-way mapping, " 1.888 + "not possible in Unicode normalization\n", 1.889 + (long)c); 1.890 + exit(U_INVALID_FORMAT_ERROR); 1.891 + } 1.892 + if(p->cc==0 && optimization!=OPTIMIZE_FAST) { 1.893 + // Try a compact, algorithmic encoding. 1.894 + // Only for ccc=0, because we can't store additional information 1.895 + // and we do not recursively follow an algorithmic encoding for access to the ccc. 1.896 + // 1.897 + // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding 1.898 + // if the mappingCP decomposes further, to ensure that there is a place to store it. 1.899 + // We want to see that the final mapping does not have exactly 1 code point, 1.900 + // or else we would have to recursively ensure that the final mapping is stored 1.901 + // in normal extraData. 1.902 + if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) { 1.903 + int32_t delta=p->mappingCP-c; 1.904 + if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { 1.905 + p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA; 1.906 + } 1.907 + } 1.908 + } 1.909 + if(p->offset==0) { 1.910 + int32_t oldNoNoLength=writer.noNoMappings.length(); 1.911 + int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings); 1.912 + UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength); 1.913 + int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping); 1.914 + if(previousOffset!=0) { 1.915 + // Duplicate, remove the new units and point to the old ones. 1.916 + writer.noNoMappings.truncate(oldNoNoLength); 1.917 + p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO; 1.918 + } else { 1.919 + // Enter this new mapping into the hashtable, avoiding value 0 which is "not found". 1.920 + IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); 1.921 + writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode); 1.922 + p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO; 1.923 + } 1.924 + } 1.925 + } 1.926 +} 1.927 + 1.928 +class Norm16Writer : public Normalizer2DBEnumerator { 1.929 +public: 1.930 + Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 1.931 + virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1.932 + builder.writeNorm16(start, end, value); 1.933 + return TRUE; 1.934 + } 1.935 +}; 1.936 + 1.937 +void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) { 1.938 + if(value!=0) { 1.939 + const Norm *p=norms+value; 1.940 + int32_t offset=p->offset>>Norm::OFFSET_SHIFT; 1.941 + int32_t norm16=0; 1.942 + UBool isDecompNo=FALSE; 1.943 + UBool isCompNoMaybe=FALSE; 1.944 + switch(p->offset&Norm::OFFSET_MASK) { 1.945 + case Norm::OFFSET_NONE: 1.946 + // No mapping, no compositions list. 1.947 + if(p->combinesBack) { 1.948 + norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc; 1.949 + isDecompNo=(UBool)(p->cc!=0); 1.950 + isCompNoMaybe=TRUE; 1.951 + } else if(p->cc!=0) { 1.952 + norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc; 1.953 + isDecompNo=isCompNoMaybe=TRUE; 1.954 + } 1.955 + break; 1.956 + case Norm::OFFSET_MAYBE_YES: 1.957 + norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset; 1.958 + isCompNoMaybe=TRUE; 1.959 + break; 1.960 + case Norm::OFFSET_YES_YES: 1.961 + norm16=offset; 1.962 + break; 1.963 + case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION: 1.964 + norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset; 1.965 + isDecompNo=TRUE; 1.966 + break; 1.967 + case Norm::OFFSET_YES_NO_MAPPING_ONLY: 1.968 + norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset; 1.969 + isDecompNo=TRUE; 1.970 + break; 1.971 + case Norm::OFFSET_NO_NO: 1.972 + norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset; 1.973 + isDecompNo=isCompNoMaybe=TRUE; 1.974 + break; 1.975 + case Norm::OFFSET_DELTA: 1.976 + norm16=getCenterNoNoDelta()+offset; 1.977 + isDecompNo=isCompNoMaybe=TRUE; 1.978 + break; 1.979 + default: // Should not occur. 1.980 + exit(U_INTERNAL_PROGRAM_ERROR); 1.981 + } 1.982 + IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); 1.983 + utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode); 1.984 + if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 1.985 + indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; 1.986 + } 1.987 + if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 1.988 + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; 1.989 + } 1.990 + } 1.991 +} 1.992 + 1.993 +void Normalizer2DataBuilder::setHangulData() { 1.994 + HangulIterator hi; 1.995 + const HangulIterator::Range *range; 1.996 + // Check that none of the Hangul/Jamo code points have data. 1.997 + while((range=hi.nextRange())!=NULL) { 1.998 + for(UChar32 c=range->start; c<range->limit; ++c) { 1.999 + if(utrie2_get32(norm16Trie, c)!=0) { 1.1000 + fprintf(stderr, 1.1001 + "gennorm2 error: " 1.1002 + "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", 1.1003 + (long)c); 1.1004 + exit(U_INVALID_FORMAT_ERROR); 1.1005 + } 1.1006 + } 1.1007 + } 1.1008 + // Set data for algorithmic runtime handling. 1.1009 + IcuToolErrorCode errorCode("gennorm2/setHangulData()"); 1.1010 + hi.reset(); 1.1011 + while((range=hi.nextRange())!=NULL) { 1.1012 + uint16_t norm16=range->norm16; 1.1013 + if(norm16==0) { 1.1014 + norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo 1.1015 + if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 1.1016 + indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start; 1.1017 + } 1.1018 + } else { 1.1019 + if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { // Jamo V/T are maybeYes 1.1020 + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start; 1.1021 + } 1.1022 + } 1.1023 + utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode); 1.1024 + errorCode.assertSuccess(); 1.1025 + } 1.1026 +} 1.1027 + 1.1028 +U_CDECL_BEGIN 1.1029 + 1.1030 +static UBool U_CALLCONV 1.1031 +enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 1.1032 + uint32_t *pMaxValue=(uint32_t *)context; 1.1033 + if(value>*pMaxValue) { 1.1034 + *pMaxValue=value; 1.1035 + } 1.1036 + return TRUE; 1.1037 +} 1.1038 + 1.1039 +U_CDECL_END 1.1040 + 1.1041 +void Normalizer2DataBuilder::processData() { 1.1042 + IcuToolErrorCode errorCode("gennorm2/processData()"); 1.1043 + norm16Trie=utrie2_open(0, 0, errorCode); 1.1044 + errorCode.assertSuccess(); 1.1045 + 1.1046 + utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr()); 1.1047 + 1.1048 + Decomposer decomposer(*this); 1.1049 + do { 1.1050 + decomposer.didDecompose=FALSE; 1.1051 + utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer); 1.1052 + } while(decomposer.didDecompose); 1.1053 + 1.1054 + BuilderReorderingBuffer buffer; 1.1055 + int32_t normsLength=utm_countItems(normMem); 1.1056 + for(int32_t i=1; i<normsLength; ++i) { 1.1057 + // Set the hasNoCompBoundaryAfter flag for use by the last code branch 1.1058 + // in Normalizer2Impl::hasCompBoundaryAfter(). 1.1059 + // For details see the comments on hasNoCompBoundaryAfter(buffer). 1.1060 + const Norm &norm=norms[i]; 1.1061 + if(norm.hasMapping()) { 1.1062 + if(norm.compositions!=NULL) { 1.1063 + norms[i].hasNoCompBoundaryAfter=TRUE; 1.1064 + } else { 1.1065 + buffer.reset(); 1.1066 + reorder(norms+i, buffer); 1.1067 + norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); 1.1068 + } 1.1069 + } 1.1070 + } 1.1071 + 1.1072 + indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 1.1073 + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 1.1074 + 1.1075 + ExtraDataWriter extraDataWriter(*this); 1.1076 + utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter); 1.1077 + 1.1078 + extraData=extraDataWriter.maybeYesCompositions; 1.1079 + extraData.append(extraDataWriter.yesYesCompositions). 1.1080 + append(extraDataWriter.yesNoMappingsAndCompositions). 1.1081 + append(extraDataWriter.yesNoMappingsOnly). 1.1082 + append(extraDataWriter.noNoMappings); 1.1083 + // Pad to even length for 4-byte alignment of following data. 1.1084 + if(extraData.length()&1) { 1.1085 + extraData.append((UChar)0); 1.1086 + } 1.1087 + 1.1088 + indexes[Normalizer2Impl::IX_MIN_YES_NO]= 1.1089 + extraDataWriter.yesYesCompositions.length(); 1.1090 + indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]= 1.1091 + indexes[Normalizer2Impl::IX_MIN_YES_NO]+ 1.1092 + extraDataWriter.yesNoMappingsAndCompositions.length(); 1.1093 + indexes[Normalizer2Impl::IX_MIN_NO_NO]= 1.1094 + indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+ 1.1095 + extraDataWriter.yesNoMappingsOnly.length(); 1.1096 + indexes[Normalizer2Impl::IX_LIMIT_NO_NO]= 1.1097 + indexes[Normalizer2Impl::IX_MIN_NO_NO]+ 1.1098 + extraDataWriter.noNoMappings.length(); 1.1099 + indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= 1.1100 + Normalizer2Impl::MIN_NORMAL_MAYBE_YES- 1.1101 + extraDataWriter.maybeYesCompositions.length(); 1.1102 + 1.1103 + int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA; 1.1104 + if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 1.1105 + fprintf(stderr, 1.1106 + "gennorm2 error: " 1.1107 + "data structure overflow, too much mapping composition data\n"); 1.1108 + exit(U_BUFFER_OVERFLOW_ERROR); 1.1109 + } 1.1110 + 1.1111 + utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr()); 1.1112 + 1.1113 + setHangulData(); 1.1114 + 1.1115 + // Look for the "worst" norm16 value of any supplementary code point 1.1116 + // corresponding to a lead surrogate, and set it as that surrogate's value. 1.1117 + // Enables quick check inner loops to look at only code units. 1.1118 + // 1.1119 + // We could be more sophisticated: 1.1120 + // We could collect a bit set for whether there are values in the different 1.1121 + // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 1.1122 + // and select the best value that only breaks the composition and/or decomposition 1.1123 + // inner loops if necessary. 1.1124 + // However, that seems like overkill for an optimization for supplementary characters. 1.1125 + for(UChar lead=0xd800; lead<0xdc00; ++lead) { 1.1126 + uint32_t maxValue=utrie2_get32(norm16Trie, lead); 1.1127 + utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue); 1.1128 + if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && 1.1129 + maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] 1.1130 + ) { 1.1131 + // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 1.1132 + // Otherwise it might end up at something like JAMO_VT which stays in 1.1133 + // the inner decomposition quick check loop. 1.1134 + maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; 1.1135 + } 1.1136 + utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode); 1.1137 + } 1.1138 + 1.1139 + // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 1.1140 + // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 1.1141 + // which is harmless. 1.1142 + // As a result, the minimum code points are always BMP code points. 1.1143 + int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 1.1144 + if(minCP>=0x10000) { 1.1145 + indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 1.1146 + } 1.1147 + minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 1.1148 + if(minCP>=0x10000) { 1.1149 + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 1.1150 + } 1.1151 +} 1.1152 + 1.1153 +void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 1.1154 + processData(); 1.1155 + 1.1156 + IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 1.1157 + utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); 1.1158 + int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); 1.1159 + if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 1.1160 + fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", 1.1161 + errorCode.errorName()); 1.1162 + exit(errorCode.reset()); 1.1163 + } 1.1164 + errorCode.reset(); 1.1165 + LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1.1166 + utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 1.1167 + errorCode.assertSuccess(); 1.1168 + 1.1169 + int32_t offset=(int32_t)sizeof(indexes); 1.1170 + indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 1.1171 + offset+=norm16TrieLength; 1.1172 + indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 1.1173 + offset+=extraData.length()*2; 1.1174 + indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; 1.1175 + offset+=sizeof(smallFCD); 1.1176 + int32_t totalSize=offset; 1.1177 + for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 1.1178 + indexes[i]=totalSize; 1.1179 + } 1.1180 + 1.1181 + if(beVerbose) { 1.1182 + printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); 1.1183 + printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); 1.1184 + printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); 1.1185 + printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); 1.1186 + printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); 1.1187 + printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); 1.1188 + printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); 1.1189 + printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); 1.1190 + printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); 1.1191 + printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 1.1192 + printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); 1.1193 + } 1.1194 + 1.1195 + UVersionInfo nullVersion={ 0, 0, 0, 0 }; 1.1196 + if(0==memcmp(nullVersion, unicodeVersion, 4)) { 1.1197 + u_versionFromString(unicodeVersion, U_UNICODE_VERSION); 1.1198 + } 1.1199 + memcpy(dataInfo.dataVersion, unicodeVersion, 4); 1.1200 + UNewDataMemory *pData= 1.1201 + udata_create(NULL, NULL, filename, &dataInfo, 1.1202 + haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); 1.1203 + if(errorCode.isFailure()) { 1.1204 + fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 1.1205 + filename, errorCode.errorName()); 1.1206 + exit(errorCode.reset()); 1.1207 + } 1.1208 + udata_writeBlock(pData, indexes, sizeof(indexes)); 1.1209 + udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); 1.1210 + udata_writeUString(pData, extraData.getBuffer(), extraData.length()); 1.1211 + udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); 1.1212 + int32_t writtenSize=udata_finish(pData, errorCode); 1.1213 + if(errorCode.isFailure()) { 1.1214 + fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 1.1215 + exit(errorCode.reset()); 1.1216 + } 1.1217 + if(writtenSize!=totalSize) { 1.1218 + fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 1.1219 + (long)writtenSize, (long)totalSize); 1.1220 + exit(U_INTERNAL_PROGRAM_ERROR); 1.1221 + } 1.1222 +} 1.1223 + 1.1224 +U_NAMESPACE_END 1.1225 + 1.1226 +#endif /* #if !UCONFIG_NO_NORMALIZATION */ 1.1227 + 1.1228 +/* 1.1229 + * Hey, Emacs, please set the following: 1.1230 + * 1.1231 + * Local Variables: 1.1232 + * indent-tabs-mode: nil 1.1233 + * End: 1.1234 + */