Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2002-2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * |
michael@0 | 7 | * File gendict.cpp |
michael@0 | 8 | */ |
michael@0 | 9 | |
michael@0 | 10 | #include "unicode/utypes.h" |
michael@0 | 11 | #include "unicode/uchar.h" |
michael@0 | 12 | #include "unicode/ucnv.h" |
michael@0 | 13 | #include "unicode/uniset.h" |
michael@0 | 14 | #include "unicode/unistr.h" |
michael@0 | 15 | #include "unicode/uclean.h" |
michael@0 | 16 | #include "unicode/udata.h" |
michael@0 | 17 | #include "unicode/putil.h" |
michael@0 | 18 | #include "unicode/ucharstriebuilder.h" |
michael@0 | 19 | #include "unicode/bytestriebuilder.h" |
michael@0 | 20 | #include "unicode/ucharstrie.h" |
michael@0 | 21 | #include "unicode/bytestrie.h" |
michael@0 | 22 | #include "unicode/ucnv.h" |
michael@0 | 23 | #include "unicode/utf16.h" |
michael@0 | 24 | |
michael@0 | 25 | #include "charstr.h" |
michael@0 | 26 | #include "dictionarydata.h" |
michael@0 | 27 | #include "uoptions.h" |
michael@0 | 28 | #include "unewdata.h" |
michael@0 | 29 | #include "cmemory.h" |
michael@0 | 30 | #include "uassert.h" |
michael@0 | 31 | #include "ucbuf.h" |
michael@0 | 32 | #include "toolutil.h" |
michael@0 | 33 | #include "cstring.h" |
michael@0 | 34 | |
michael@0 | 35 | #include <stdio.h> |
michael@0 | 36 | #include <stdlib.h> |
michael@0 | 37 | #include <string.h> |
michael@0 | 38 | |
michael@0 | 39 | #include "putilimp.h" |
michael@0 | 40 | UDate startTime; /* start of the run; assigned in main() from uprv_getRawUTCtime() */ |
michael@0 | 41 | /* Returns the whole seconds elapsed since startTime: the raw-UTC difference is divided by 1000.0 and floored. */ |
michael@0 | 42 | static int elapsedTime() { |
michael@0 | 43 | return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0); |
michael@0 | 44 | } |
michael@0 | 45 | |
michael@0 | 46 | #if U_PLATFORM_IMPLEMENTS_POSIX && !U_PLATFORM_HAS_WIN32_API |
michael@0 | 47 | |
michael@0 | 48 | #include <signal.h> |
michael@0 | 49 | #include <unistd.h> |
michael@0 | 50 | |
michael@0 | 51 | const char *wToolname="gendict"; /* tool name printed in watchdog notices; replaced by install_watchdog() */ |
michael@0 | 52 | const char *wOutname="(some file)"; /* output file name printed in watchdog notices; replaced by install_watchdog() */ |
michael@0 | 53 | |
michael@0 | 54 | const int firstSeconds = 5; /* seconds before the first notice */ |
michael@0 | 55 | const int nextSeconds = 15; /* seconds between subsequent notices */ |
michael@0 | 56 | /* SIGALRM handler: prints a still-writing progress notice, then re-installs itself and re-arms the alarm for nextSeconds. */ |
michael@0 | 57 | static void alarm_fn(int /*n*/) { |
michael@0 | 58 | printf("%s: still writing\t%s (%ds)\t...\n", wToolname, wOutname, elapsedTime()); |
michael@0 | 59 | /* NOTE(review): printf is not on the POSIX async-signal-safe list; presumably tolerated as a best-effort progress message -- confirm. */ |
michael@0 | 60 | signal(SIGALRM, &alarm_fn); |
michael@0 | 61 | alarm(nextSeconds); // reset the alarm |
michael@0 | 62 | } |
michael@0 | 63 | /* Records the names to report, installs alarm_fn for SIGALRM and arms the first alarm after firstSeconds seconds. */ |
michael@0 | 64 | static void install_watchdog(const char *toolName, const char *outFileName) { |
michael@0 | 65 | wToolname=toolName; |
michael@0 | 66 | wOutname=outFileName; |
michael@0 | 67 | /* the pointers are stored, not copied, so both strings must stay valid while the watchdog can fire */ |
michael@0 | 68 | signal(SIGALRM, &alarm_fn); |
michael@0 | 69 | |
michael@0 | 70 | alarm(firstSeconds); // set the alarm |
michael@0 | 71 | } |
michael@0 | 72 | |
michael@0 | 73 | #else |
michael@0 | 74 | static void install_watchdog(const char*, const char*) { /* fallback used when the POSIX signal-based branch is compiled out */ |
michael@0 | 75 | // not implemented: both arguments are ignored and no progress notices are printed |
michael@0 | 76 | } |
michael@0 | 77 | #endif |
michael@0 | 78 | |
michael@0 | 79 | |
michael@0 | 80 | |
michael@0 | 81 | |
michael@0 | 82 | U_NAMESPACE_USE |
michael@0 | 83 | /* Command-line state: progName is set to argv[0] in main(); options[] is indexed by enum arguments, so both must keep the same order. */ |
michael@0 | 84 | static char *progName; |
michael@0 | 85 | static UOption options[]={ |
michael@0 | 86 | UOPTION_HELP_H, /* 0 */ |
michael@0 | 87 | UOPTION_HELP_QUESTION_MARK, /* 1 */ |
michael@0 | 88 | UOPTION_VERBOSE, /* 2 */ |
michael@0 | 89 | UOPTION_ICUDATADIR, /* 3 */ |
michael@0 | 90 | UOPTION_COPYRIGHT, /* 4 */ |
michael@0 | 91 | { "uchars", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 5 */ |
michael@0 | 92 | { "bytes", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0}, /* 6 */ |
michael@0 | 93 | { "transform", NULL, NULL, NULL, '\1', UOPT_REQUIRES_ARG, 0}, /* 7 */ |
michael@0 | 94 | }; |
michael@0 | 95 | /* Indexes into options[]; the enumerator values 0..7 match the position comments in the table. */ |
michael@0 | 96 | enum arguments { |
michael@0 | 97 | ARG_HELP = 0, |
michael@0 | 98 | ARG_QMARK, |
michael@0 | 99 | ARG_VERBOSE, |
michael@0 | 100 | ARG_ICUDATADIR, |
michael@0 | 101 | ARG_COPYRIGHT, |
michael@0 | 102 | ARG_UCHARS, |
michael@0 | 103 | ARG_BYTES, |
michael@0 | 104 | ARG_TRANSFORM |
michael@0 | 105 | }; |
michael@0 | 106 | |
michael@0 | 107 | // prints the usage text describing the command line arguments (to stdout when retCode is a |
michael@0 | 108 | // success code, to stderr otherwise), then exits the process with retCode as the exit status |
michael@0 | 109 | static void usageAndDie(UErrorCode retCode) { |
michael@0 | 110 | fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName); |
michael@0 | 111 | fprintf((U_SUCCESS(retCode) ? stdout : stderr), |
michael@0 | 112 | "\tRead in a word list and write out a string trie dictionary\n" |
michael@0 | 113 | "options:\n" |
michael@0 | 114 | "\t-h or -? or --help this usage text\n" |
michael@0 | 115 | /* no -V/--version entry: options[] does not define that option, so it must not be advertised */ |
michael@0 | 116 | "\t-c or --copyright include a copyright notice\n" |
michael@0 | 117 | "\t-v or --verbose turn on verbose output\n" |
michael@0 | 118 | "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option |
michael@0 | 119 | "\t followed by path, defaults to %s\n" |
michael@0 | 120 | "\t--uchars output a UCharsTrie (mutually exclusive with --bytes!)\n" |
michael@0 | 121 | "\t--bytes output a BytesTrie (mutually exclusive with --uchars!)\n" |
michael@0 | 122 | "\t--transform the kind of transform to use (eg --transform offset-40A3,\n" |
michael@0 | 123 | "\t which specifies an offset transform with constant 0x40A3)\n", |
michael@0 | 124 | u_getDataDirectory()); |
michael@0 | 125 | exit(retCode); |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | |
michael@0 | 129 | /* UDataInfo cf. udata.h -- header describing the generated data file; passed to udata_create() in main() */ |
michael@0 | 130 | static UDataInfo dataInfo = { |
michael@0 | 131 | sizeof(UDataInfo), /* size */ |
michael@0 | 132 | 0, /* reservedWord */ |
michael@0 | 133 | |
michael@0 | 134 | U_IS_BIG_ENDIAN, /* isBigEndian */ |
michael@0 | 135 | U_CHARSET_FAMILY, /* charsetFamily */ |
michael@0 | 136 | U_SIZEOF_UCHAR, /* sizeofUChar */ |
michael@0 | 137 | 0, /* reservedByte */ |
michael@0 | 138 | |
michael@0 | 139 | { 0x44, 0x69, 0x63, 0x74 }, /* "Dict" */ |
michael@0 | 140 | { 1, 0, 0, 0 }, /* format version */ |
michael@0 | 141 | { 0, 0, 0, 0 } /* data version */ |
michael@0 | 142 | }; |
michael@0 | 143 | |
michael@0 | 144 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 145 | |
michael@0 | 146 | // A wrapper for both BytesTrieBuilder and UCharsTrieBuilder; exactly one of the two |
michael@0 | 147 | // is created per instance. may want to put this somewhere in ICU, as it could be |
michael@0 | 148 | // useful outside of this tool? |
michael@0 | 149 | class DataDict { |
michael@0 | 150 | private: |
michael@0 | 151 | BytesTrieBuilder *bt; /* owned; set only when constructed with isBytesTrie != 0 */ |
michael@0 | 152 | UCharsTrieBuilder *ut; /* owned; set only when constructed with isBytesTrie == 0 */ |
michael@0 | 153 | UChar32 transformConstant; /* base code point subtracted by the offset transform */ |
michael@0 | 154 | int32_t transformType; /* a DictionaryData::TRANSFORM_* value; TRANSFORM_NONE until setTransform() succeeds */ |
michael@0 | 155 | public: |
michael@0 | 156 | // constructs a new data dictionary. if there is an error, |
michael@0 | 157 | // it will be returned in status |
michael@0 | 158 | // isBytesTrie != 0 will produce a BytesTrieBuilder, |
michael@0 | 159 | // isBytesTrie == 0 will produce a UCharsTrieBuilder |
michael@0 | 160 | DataDict(UBool isBytesTrie, UErrorCode &status) : bt(NULL), ut(NULL), |
michael@0 | 161 | transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) { |
michael@0 | 162 | if (isBytesTrie) { |
michael@0 | 163 | bt = new BytesTrieBuilder(status); |
michael@0 | 164 | } else { |
michael@0 | 165 | ut = new UCharsTrieBuilder(status); |
michael@0 | 166 | } |
michael@0 | 167 | } |
michael@0 | 168 | /* NOTE(review): the result of new is not checked for NULL above, and no copy constructor/assignment is declared although bt/ut are owned -- confirm neither matters for callers. */ |
michael@0 | 169 | ~DataDict() { /* releases whichever builder was created; the other pointer is still NULL */ |
michael@0 | 170 | delete bt; |
michael@0 | 171 | delete ut; |
michael@0 | 172 | } |
michael@0 | 173 | /* Maps one code point to a single trie byte; only the offset transform is implemented. */ |
michael@0 | 174 | private: |
michael@0 | 175 | char transform(UChar32 c, UErrorCode &status) { |
michael@0 | 176 | if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) { |
michael@0 | 177 | if (c == 0x200D) { return (char)0xFF; } /* U+200D is given the reserved byte 0xFF */ |
michael@0 | 178 | else if (c == 0x200C) { return (char)0xFE; } /* U+200C is given the reserved byte 0xFE */ |
michael@0 | 179 | int32_t delta = c - transformConstant; |
michael@0 | 180 | if (delta < 0 || 0xFD < delta) { /* 0xFE and 0xFF are taken above, so only offsets 0..0xFD are accepted */ |
michael@0 | 181 | fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n", |
michael@0 | 182 | (long)c, (long)transformConstant); |
michael@0 | 183 | exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number |
michael@0 | 184 | } |
michael@0 | 185 | return (char)delta; |
michael@0 | 186 | } else { // no such transform type |
michael@0 | 187 | status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 188 | return (char)c; // it should be noted this transform type will not generally work |
michael@0 | 189 | } |
michael@0 | 190 | } |
michael@0 | 191 | /* Appends the transformed byte of every code point of word to buf, advancing by each code point's UTF-16 length. */ |
michael@0 | 192 | void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) { |
michael@0 | 193 | UChar32 c = 0; |
michael@0 | 194 | int32_t len = word.length(); |
michael@0 | 195 | for (int32_t i = 0; i < len; i += U16_LENGTH(c)) { |
michael@0 | 196 | c = word.char32At(i); |
michael@0 | 197 | buf.append(transform(c, errorCode), errorCode); |
michael@0 | 198 | } |
michael@0 | 199 | } |
michael@0 | 200 | |
michael@0 | 201 | public: |
michael@0 | 202 | // sets the desired transformation data. |
michael@0 | 203 | // should be populated from a command line argument |
michael@0 | 204 | // so far the only acceptable format is offset-<hex constant> |
michael@0 | 205 | // eventually others (mask-<hex constant>?) may be enabled |
michael@0 | 206 | // more complex functions may be more difficult; bad input ends the program via usageAndDie() |
michael@0 | 207 | void setTransform(const char *t) { |
michael@0 | 208 | if (strncmp(t, "offset-", 7) == 0) { |
michael@0 | 209 | char *end; |
michael@0 | 210 | unsigned long base = uprv_strtoul(t + 7, &end, 16); |
michael@0 | 211 | if (end == (t + 7) || *end != 0 || base > 0x10FF80) { /* rejects an empty number, trailing characters, or a constant above 0x10FF80 */ |
michael@0 | 212 | fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7); |
michael@0 | 213 | usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); |
michael@0 | 214 | } |
michael@0 | 215 | transformType = DictionaryData::TRANSFORM_TYPE_OFFSET; |
michael@0 | 216 | transformConstant = (UChar32)base; |
michael@0 | 217 | } |
michael@0 | 218 | else { |
michael@0 | 219 | fprintf(stderr, "Invalid transform specified: %s\n", t); |
michael@0 | 220 | usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); |
michael@0 | 221 | } |
michael@0 | 222 | } |
michael@0 | 223 | |
michael@0 | 224 | // add a word to the trie; for a bytes trie the word is first converted with transform() |
michael@0 | 225 | void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) { |
michael@0 | 226 | if (bt) { |
michael@0 | 227 | CharString buf; |
michael@0 | 228 | transform(word, buf, status); |
michael@0 | 229 | bt->add(buf.toStringPiece(), value, status); |
michael@0 | 230 | } |
michael@0 | 231 | if (ut) { ut->add(word, value, status); } |
michael@0 | 232 | } |
michael@0 | 233 | |
michael@0 | 234 | // if we are a bytestrie, give back the StringPiece representing the serialized version of us |
michael@0 | 235 | StringPiece serializeBytes(UErrorCode &status) { /* dereferences bt unconditionally: call only on a bytes-trie instance */ |
michael@0 | 236 | return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status); |
michael@0 | 237 | } |
michael@0 | 238 | |
michael@0 | 239 | // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us |
michael@0 | 240 | void serializeUChars(UnicodeString &s, UErrorCode &status) { /* dereferences ut unconditionally: call only on a uchars-trie instance */ |
michael@0 | 241 | ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status); |
michael@0 | 242 | } |
michael@0 | 243 | /* Returns the transform type and constant combined by bitwise OR into one int32_t; main() stores it in the IX_TRANSFORM index. */ |
michael@0 | 244 | int32_t getTransform() { |
michael@0 | 245 | return (int32_t)(transformType | transformConstant); |
michael@0 | 246 | } |
michael@0 | 247 | }; |
michael@0 | 248 | #endif |
michael@0 | 249 | |
michael@0 | 250 | static const UChar LINEFEED_CHARACTER = 0x000A; |
michael@0 | 251 | static const UChar CARRIAGE_RETURN_CHARACTER = 0x000D; |
michael@0 | 252 | /* Reads the next line from f into fileLine, cut at the first number sign and stripped of line ending and trailing whitespace. Returns FALSE when ucbuf_readline() yields no line or errorCode is a failure. */ |
michael@0 | 253 | static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) { |
michael@0 | 254 | int32_t lineLength; |
michael@0 | 255 | const UChar *line = ucbuf_readline(f, &lineLength, errorCode); |
michael@0 | 256 | if(line == NULL || errorCode.isFailure()) { return FALSE; } |
michael@0 | 257 | // Strip trailing CR/LF, comments, and spaces. |
michael@0 | 258 | const UChar *comment = u_memchr(line, 0x23, lineLength); // '#' |
michael@0 | 259 | if(comment != NULL) { |
michael@0 | 260 | lineLength = (int32_t)(comment - line); /* keep only the text before the comment marker; this also drops the line ending */ |
michael@0 | 261 | } else { |
michael@0 | 262 | while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER || line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; } |
michael@0 | 263 | } |
michael@0 | 264 | while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; } |
michael@0 | 265 | fileLine.setTo(FALSE, line, lineLength); /* NOTE(review): looks like the aliasing setTo overload, so fileLine would borrow the reader's buffer -- confirm lifetime */ |
michael@0 | 266 | return TRUE; |
michael@0 | 267 | } |
michael@0 | 268 | |
michael@0 | 269 | //---------------------------------------------------------------------------- |
michael@0 | 270 | // |
michael@0 | 271 | // main for gendict: parses the options, reads the word list, builds the trie |
michael@0 | 272 | // and writes it to the output data file; returns 0 on success |
michael@0 | 273 | //---------------------------------------------------------------------------- |
michael@0 | 274 | int main(int argc, char **argv) { |
michael@0 | 275 | // |
michael@0 | 276 | // Pick up and check the command line arguments, |
michael@0 | 277 | // using the standard ICU tool utils option handling. |
michael@0 | 278 | // |
michael@0 | 279 | U_MAIN_INIT_ARGS(argc, argv); |
michael@0 | 280 | progName = argv[0]; |
michael@0 | 281 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); |
michael@0 | 282 | if(argc<0) { |
michael@0 | 283 | // Unrecognized option |
michael@0 | 284 | fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); |
michael@0 | 285 | usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); |
michael@0 | 286 | } |
michael@0 | 287 | |
michael@0 | 288 | if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) { |
michael@0 | 289 | // -? or -h for help. |
michael@0 | 290 | usageAndDie(U_ZERO_ERROR); |
michael@0 | 291 | } |
michael@0 | 292 | |
michael@0 | 293 | UBool verbose = options[ARG_VERBOSE].doesOccur; |
michael@0 | 294 | |
michael@0 | 295 | if (argc < 3) { |
michael@0 | 296 | fprintf(stderr, "input and output file must both be specified.\n"); |
michael@0 | 297 | usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); |
michael@0 | 298 | } |
michael@0 | 299 | const char *outFileName = argv[2]; |
michael@0 | 300 | const char *wordFileName = argv[1]; |
michael@0 | 301 | |
michael@0 | 302 | startTime = uprv_getRawUTCtime(); // initialize start timer |
michael@0 | 303 | // set up the watchdog |
michael@0 | 304 | install_watchdog(progName, outFileName); |
michael@0 | 305 | |
michael@0 | 306 | if (options[ARG_ICUDATADIR].doesOccur) { |
michael@0 | 307 | u_setDataDirectory(options[ARG_ICUDATADIR].value); |
michael@0 | 308 | } |
michael@0 | 309 | |
michael@0 | 310 | const char *copyright = NULL; |
michael@0 | 311 | if (options[ARG_COPYRIGHT].doesOccur) { |
michael@0 | 312 | copyright = U_COPYRIGHT_STRING; |
michael@0 | 313 | } |
michael@0 | 314 | /* exactly one of --uchars / --bytes must be given, and --transform goes with --bytes only */ |
michael@0 | 315 | if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) { |
michael@0 | 316 | fprintf(stderr, "you must specify exactly one type of trie to output!\n"); |
michael@0 | 317 | usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); |
michael@0 | 318 | } |
michael@0 | 319 | UBool isBytesTrie = options[ARG_BYTES].doesOccur; |
michael@0 | 320 | if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) { |
michael@0 | 321 | fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n"); |
michael@0 | 322 | usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); |
michael@0 | 323 | } |
michael@0 | 324 | |
michael@0 | 325 | IcuToolErrorCode status("gendict/main()"); |
michael@0 | 326 | |
michael@0 | 327 | #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO |
michael@0 | 328 | const char* outDir=NULL; |
michael@0 | 329 | |
michael@0 | 330 | UNewDataMemory *pData; |
michael@0 | 331 | char msg[1024]; |
michael@0 | 332 | UErrorCode tempstatus = U_ZERO_ERROR; |
michael@0 | 333 | |
michael@0 | 334 | /* write message with just the name; %.900s caps the name so the fixed text (about 100 chars) plus the name always fits in msg[1024] */ |
michael@0 | 335 | sprintf(msg, "gendict writes dummy %.900s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); |
michael@0 | 336 | fprintf(stderr, "%s\n", msg); |
michael@0 | 337 | |
michael@0 | 338 | /* write the dummy data file */ |
michael@0 | 339 | pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus); |
michael@0 | 340 | udata_writeBlock(pData, msg, strlen(msg)); |
michael@0 | 341 | udata_finish(pData, &tempstatus); |
michael@0 | 342 | return (int)tempstatus; |
michael@0 | 343 | |
michael@0 | 344 | #else |
michael@0 | 345 | // Read in the dictionary source file |
michael@0 | 346 | if (verbose) { printf("Opening file %s...\n", wordFileName); } |
michael@0 | 347 | const char *codepage = "UTF-8"; |
michael@0 | 348 | UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status); |
michael@0 | 349 | if (status.isFailure()) { |
michael@0 | 350 | fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName()); |
michael@0 | 351 | exit(status.reset()); |
michael@0 | 352 | } |
michael@0 | 353 | if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); } |
michael@0 | 354 | DataDict dict(isBytesTrie, status); |
michael@0 | 355 | if (status.isFailure()) { |
michael@0 | 356 | fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName()); |
michael@0 | 357 | exit(status.reset()); |
michael@0 | 358 | } |
michael@0 | 359 | if (options[ARG_TRANSFORM].doesOccur) { |
michael@0 | 360 | dict.setTransform(options[ARG_TRANSFORM].value); |
michael@0 | 361 | } |
michael@0 | 362 | |
michael@0 | 363 | UnicodeString fileLine; |
michael@0 | 364 | if (verbose) { puts("Adding words to dictionary..."); } |
michael@0 | 365 | UBool hasValues = FALSE; |
michael@0 | 366 | UBool hasValuelessContents = FALSE; |
michael@0 | 367 | int lineCount = 0; |
michael@0 | 368 | int wordCount = 0; |
michael@0 | 369 | int minlen = 255; |
michael@0 | 370 | int maxlen = 0; |
michael@0 | 371 | UBool isOk = TRUE; /* cleared on any malformed line; parsing continues so that every bad line is reported */ |
michael@0 | 372 | while (readLine(f, fileLine, status)) { |
michael@0 | 373 | lineCount++; |
michael@0 | 374 | if (fileLine.isEmpty()) continue; |
michael@0 | 375 | |
michael@0 | 376 | // Parse word [spaces value]. |
michael@0 | 377 | int32_t keyLen; |
michael@0 | 378 | for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {} |
michael@0 | 379 | if (keyLen == 0) { |
michael@0 | 380 | fprintf(stderr, "Error: no word on line %i!\n", lineCount); |
michael@0 | 381 | isOk = FALSE; |
michael@0 | 382 | continue; |
michael@0 | 383 | } |
michael@0 | 384 | int32_t valueStart; |
michael@0 | 385 | for (valueStart = keyLen; |
michael@0 | 386 | valueStart < fileLine.length() && u_isspace(fileLine[valueStart]); |
michael@0 | 387 | ++valueStart) {} |
michael@0 | 388 | /* whitespace after the key means a value follows (trailing whitespace was already stripped by readLine) */ |
michael@0 | 389 | if (keyLen < valueStart) { |
michael@0 | 390 | int32_t valueLength = fileLine.length() - valueStart; |
michael@0 | 391 | if (valueLength > 15) { |
michael@0 | 392 | fprintf(stderr, "Error: value too long on line %i!\n", lineCount); |
michael@0 | 393 | isOk = FALSE; |
michael@0 | 394 | continue; |
michael@0 | 395 | } |
michael@0 | 396 | char s[16]; |
michael@0 | 397 | fileLine.extract(valueStart, valueLength, s, 16, US_INV); |
michael@0 | 398 | char *end; |
michael@0 | 399 | unsigned long value = uprv_strtoul(s, &end, 0); |
michael@0 | 400 | if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) { |
michael@0 | 401 | fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount); |
michael@0 | 402 | isOk = FALSE; |
michael@0 | 403 | continue; |
michael@0 | 404 | } |
michael@0 | 405 | dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status); |
michael@0 | 406 | hasValues = TRUE; |
michael@0 | 407 | wordCount++; |
michael@0 | 408 | if (keyLen < minlen) minlen = keyLen; |
michael@0 | 409 | if (keyLen > maxlen) maxlen = keyLen; |
michael@0 | 410 | } else { |
michael@0 | 411 | dict.addWord(fileLine.tempSubString(0, keyLen), 0, status); |
michael@0 | 412 | hasValuelessContents = TRUE; |
michael@0 | 413 | wordCount++; |
michael@0 | 414 | if (keyLen < minlen) minlen = keyLen; |
michael@0 | 415 | if (keyLen > maxlen) maxlen = keyLen; |
michael@0 | 416 | } |
michael@0 | 417 | |
michael@0 | 418 | if (status.isFailure()) { |
michael@0 | 419 | fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n", |
michael@0 | 420 | status.errorName(), lineCount); |
michael@0 | 421 | exit(status.reset()); |
michael@0 | 422 | } |
michael@0 | 423 | } |
michael@0 | 424 | if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); } |
michael@0 | 425 | /* any malformed line turns into a failure status, which the serialization check below reports */ |
michael@0 | 426 | if (!isOk && status.isSuccess()) { |
michael@0 | 427 | status.set(U_ILLEGAL_ARGUMENT_ERROR); |
michael@0 | 428 | } |
michael@0 | 429 | if (hasValues && hasValuelessContents) { |
michael@0 | 430 | fprintf(stderr, "warning: file contained both valued and unvalued strings!\n"); |
michael@0 | 431 | } |
michael@0 | 432 | |
michael@0 | 433 | if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); } |
michael@0 | 434 | int32_t outDataSize; |
michael@0 | 435 | const void *outData; |
michael@0 | 436 | UnicodeString usp; |
michael@0 | 437 | if (isBytesTrie) { |
michael@0 | 438 | StringPiece sp = dict.serializeBytes(status); |
michael@0 | 439 | outDataSize = sp.size(); |
michael@0 | 440 | outData = sp.data(); |
michael@0 | 441 | } else { |
michael@0 | 442 | dict.serializeUChars(usp, status); |
michael@0 | 443 | outDataSize = usp.length() * U_SIZEOF_UCHAR; |
michael@0 | 444 | outData = usp.getBuffer(); |
michael@0 | 445 | } |
michael@0 | 446 | if (status.isFailure()) { |
michael@0 | 447 | fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName()); |
michael@0 | 448 | exit(status.reset()); |
michael@0 | 449 | } |
michael@0 | 450 | if (verbose) { puts("Opening output file..."); } |
michael@0 | 451 | UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status); |
michael@0 | 452 | if (status.isFailure()) { |
michael@0 | 453 | fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName()); |
michael@0 | 454 | exit(status.reset()); |
michael@0 | 455 | } |
michael@0 | 456 | |
michael@0 | 457 | if (verbose) { puts("Writing to output file..."); } |
michael@0 | 458 | int32_t indexes[DictionaryData::IX_COUNT] = { |
michael@0 | 459 | DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0 |
michael@0 | 460 | }; |
michael@0 | 461 | int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; |
michael@0 | 462 | indexes[DictionaryData::IX_RESERVED1_OFFSET] = size; |
michael@0 | 463 | indexes[DictionaryData::IX_RESERVED2_OFFSET] = size; |
michael@0 | 464 | indexes[DictionaryData::IX_TOTAL_SIZE] = size; |
michael@0 | 465 | |
michael@0 | 466 | indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS; |
michael@0 | 467 | if (hasValues) { |
michael@0 | 468 | indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES; |
michael@0 | 469 | } |
michael@0 | 470 | |
michael@0 | 471 | indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform(); |
michael@0 | 472 | udata_writeBlock(pData, indexes, sizeof(indexes)); |
michael@0 | 473 | udata_writeBlock(pData, outData, outDataSize); |
michael@0 | 474 | size_t bytesWritten = udata_finish(pData, status); |
michael@0 | 475 | if (status.isFailure()) { |
michael@0 | 476 | fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName()); |
michael@0 | 477 | exit(status.reset()); |
michael@0 | 478 | } |
michael@0 | 479 | /* size is the index block plus the serialized trie, i.e. everything written through udata_writeBlock() above */ |
michael@0 | 480 | if (bytesWritten != (size_t)size) { |
michael@0 | 481 | fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); |
michael@0 | 482 | exit(U_INTERNAL_PROGRAM_ERROR); |
michael@0 | 483 | } |
michael@0 | 484 | |
michael@0 | 485 | printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); |
michael@0 | 486 | |
michael@0 | 487 | #ifdef TEST_GENDICT |
michael@0 | 488 | if (isBytesTrie) { |
michael@0 | 489 | BytesTrie::Iterator it(outData, outDataSize, status); |
michael@0 | 490 | while (it.hasNext()) { |
michael@0 | 491 | it.next(status); |
michael@0 | 492 | const StringPiece s = it.getString(); |
michael@0 | 493 | int32_t val = it.getValue(); |
michael@0 | 494 | printf("%s -> %i\n", s.data(), val); |
michael@0 | 495 | } |
michael@0 | 496 | } else { |
michael@0 | 497 | UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status); |
michael@0 | 498 | while (it.hasNext()) { |
michael@0 | 499 | it.next(status); |
michael@0 | 500 | const UnicodeString s = it.getString(); |
michael@0 | 501 | int32_t val = it.getValue(); |
michael@0 | 502 | char tmp[1024]; |
michael@0 | 503 | s.extract(0, s.length(), tmp, 1024); |
michael@0 | 504 | printf("%s -> %i\n", tmp, val); |
michael@0 | 505 | } |
michael@0 | 506 | } |
michael@0 | 507 | #endif |
michael@0 | 508 | ucbuf_close(f); /* release the input reader obtained from ucbuf_open() before the normal return */ |
michael@0 | 509 | return 0; |
michael@0 | 510 | #endif /* #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO ... #else */ |
michael@0 | 511 | } |