|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2002-2009, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * |
|
7 * File genbrk.c |
|
8 */ |
|
9 |
|
10 //-------------------------------------------------------------------- |
|
11 // |
|
12 // Tool for generating RuleBasedBreakIterator data files (.brk files). |
|
13 // .brk files contain the precompiled rules for standard types |
|
14 // of iterators - word, line, sentence, etc. |
|
15 // |
|
16 // Usage: genbrk [options] -r rule-file.txt -o output-file.brk |
|
17 // |
|
18 // options: -v verbose |
|
19 // -? or -h help |
|
20 // |
|
21 // The input rule file is a plain text file containing break rules |
|
22 // in the input format accepted by RuleBasedBreakIterators. The |
|
23 // file can be encoded as utf-8, or utf-16 (either endian), or |
|
24 // in the default code page (platform dependent). UTF-encoded |
|
25 // files must include a BOM. |
|
26 // |
|
27 //-------------------------------------------------------------------- |
|
28 |
|
29 #include "unicode/utypes.h" |
|
30 #include "unicode/ucnv.h" |
|
31 #include "unicode/unistr.h" |
|
32 #include "unicode/rbbi.h" |
|
33 #include "unicode/uclean.h" |
|
34 #include "unicode/udata.h" |
|
35 #include "unicode/putil.h" |
|
36 |
|
37 #include "uoptions.h" |
|
38 #include "unewdata.h" |
|
39 #include "ucmndata.h" |
|
40 #include "rbbidata.h" |
|
41 #include "cmemory.h" |
|
42 |
|
43 #include <stdio.h> |
|
44 #include <stdlib.h> |
|
45 #include <string.h> |
|
46 |
|
47 U_NAMESPACE_USE |
|
48 |
|
49 static char *progName; |
|
50 static UOption options[]={ |
|
51 UOPTION_HELP_H, /* 0 */ |
|
52 UOPTION_HELP_QUESTION_MARK, /* 1 */ |
|
53 UOPTION_VERBOSE, /* 2 */ |
|
54 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ |
|
55 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */ |
|
56 UOPTION_ICUDATADIR, /* 5 */ |
|
57 UOPTION_DESTDIR, /* 6 */ |
|
58 UOPTION_COPYRIGHT, /* 7 */ |
|
59 }; |
|
60 |
|
61 void usageAndDie(int retCode) { |
|
62 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName); |
|
63 printf("\tRead in break iteration rules text and write out the binary data\n" |
|
64 "options:\n" |
|
65 "\t-h or -? or --help this usage text\n" |
|
66 "\t-V or --version show a version message\n" |
|
67 "\t-c or --copyright include a copyright notice\n" |
|
68 "\t-v or --verbose turn on verbose output\n" |
|
69 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" |
|
70 "\t followed by path, defaults to %s\n" |
|
71 "\t-d or --destdir destination directory, followed by the path\n", |
|
72 u_getDataDirectory()); |
|
73 exit (retCode); |
|
74 } |
|
75 |
|
76 |
|
77 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO |
|
78 |
|
79 /* dummy UDataInfo cf. udata.h */ |
|
80 static UDataInfo dummyDataInfo = { |
|
81 sizeof(UDataInfo), |
|
82 0, |
|
83 |
|
84 U_IS_BIG_ENDIAN, |
|
85 U_CHARSET_FAMILY, |
|
86 U_SIZEOF_UCHAR, |
|
87 0, |
|
88 |
|
89 { 0, 0, 0, 0 }, /* dummy dataFormat */ |
|
90 { 0, 0, 0, 0 }, /* dummy formatVersion */ |
|
91 { 0, 0, 0, 0 } /* dummy dataVersion */ |
|
92 }; |
|
93 |
|
94 #else |
|
95 |
|
96 // |
|
97 // Set up the ICU data header, defined in ucmndata.h |
|
98 // |
|
99 DataHeader dh ={ |
|
100 {sizeof(DataHeader), // Struct MappedData |
|
101 0xda, |
|
102 0x27}, |
|
103 |
|
104 { // struct UDataInfo |
|
105 sizeof(UDataInfo), // size |
|
106 0, // reserved |
|
107 U_IS_BIG_ENDIAN, |
|
108 U_CHARSET_FAMILY, |
|
109 U_SIZEOF_UCHAR, |
|
110 0, // reserved |
|
111 |
|
112 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk " |
|
113 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values |
|
114 // from the RBBI rule builder. The values declared |
|
115 // here should never appear in any real RBBI data. |
|
116 { 4, 1, 0, 0 } // dataVersion (Unicode version) |
|
117 }}; |
|
118 |
|
119 #endif |
|
120 |
|
121 //---------------------------------------------------------------------------- |
|
122 // |
|
123 // main for genbrk |
|
124 // |
|
125 //---------------------------------------------------------------------------- |
|
126 int main(int argc, char **argv) { |
|
127 UErrorCode status = U_ZERO_ERROR; |
|
128 const char *ruleFileName; |
|
129 const char *outFileName; |
|
130 const char *outDir = NULL; |
|
131 const char *copyright = NULL; |
|
132 |
|
133 // |
|
134 // Pick up and check the command line arguments, |
|
135 // using the standard ICU tool utils option handling. |
|
136 // |
|
137 U_MAIN_INIT_ARGS(argc, argv); |
|
138 progName = argv[0]; |
|
139 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); |
|
140 if(argc<0) { |
|
141 // Unrecognized option |
|
142 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); |
|
143 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); |
|
144 } |
|
145 |
|
146 if(options[0].doesOccur || options[1].doesOccur) { |
|
147 // -? or -h for help. |
|
148 usageAndDie(0); |
|
149 } |
|
150 |
|
151 if (!(options[3].doesOccur && options[4].doesOccur)) { |
|
152 fprintf(stderr, "rule file and output file must both be specified.\n"); |
|
153 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); |
|
154 } |
|
155 ruleFileName = options[3].value; |
|
156 outFileName = options[4].value; |
|
157 |
|
158 if (options[5].doesOccur) { |
|
159 u_setDataDirectory(options[5].value); |
|
160 } |
|
161 |
|
162 status = U_ZERO_ERROR; |
|
163 |
|
164 /* Combine the directory with the file name */ |
|
165 if(options[6].doesOccur) { |
|
166 outDir = options[6].value; |
|
167 } |
|
168 if (options[7].doesOccur) { |
|
169 copyright = U_COPYRIGHT_STRING; |
|
170 } |
|
171 |
|
172 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO |
|
173 |
|
174 UNewDataMemory *pData; |
|
175 char msg[1024]; |
|
176 |
|
177 /* write message with just the name */ |
|
178 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); |
|
179 fprintf(stderr, "%s\n", msg); |
|
180 |
|
181 /* write the dummy data file */ |
|
182 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); |
|
183 udata_writeBlock(pData, msg, strlen(msg)); |
|
184 udata_finish(pData, &status); |
|
185 return (int)status; |
|
186 |
|
187 #else |
|
188 /* Initialize ICU */ |
|
189 u_init(&status); |
|
190 if (U_FAILURE(status)) { |
|
191 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", |
|
192 argv[0], u_errorName(status)); |
|
193 exit(1); |
|
194 } |
|
195 status = U_ZERO_ERROR; |
|
196 |
|
197 // |
|
198 // Read in the rule source file |
|
199 // |
|
200 long result; |
|
201 long ruleFileSize; |
|
202 FILE *file; |
|
203 char *ruleBufferC; |
|
204 |
|
205 file = fopen(ruleFileName, "rb"); |
|
206 if( file == 0 ) { |
|
207 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName); |
|
208 exit(-1); |
|
209 } |
|
210 fseek(file, 0, SEEK_END); |
|
211 ruleFileSize = ftell(file); |
|
212 fseek(file, 0, SEEK_SET); |
|
213 ruleBufferC = new char[ruleFileSize+10]; |
|
214 |
|
215 result = (long)fread(ruleBufferC, 1, ruleFileSize, file); |
|
216 if (result != ruleFileSize) { |
|
217 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName); |
|
218 exit (-1); |
|
219 } |
|
220 ruleBufferC[ruleFileSize]=0; |
|
221 fclose(file); |
|
222 |
|
223 // |
|
224 // Look for a Unicode Signature (BOM) on the rule file |
|
225 // |
|
226 int32_t signatureLength; |
|
227 const char * ruleSourceC = ruleBufferC; |
|
228 const char* encoding = ucnv_detectUnicodeSignature( |
|
229 ruleSourceC, ruleFileSize, &signatureLength, &status); |
|
230 if (U_FAILURE(status)) { |
|
231 exit(status); |
|
232 } |
|
233 if(encoding!=NULL ){ |
|
234 ruleSourceC += signatureLength; |
|
235 ruleFileSize -= signatureLength; |
|
236 } |
|
237 |
|
238 // |
|
239 // Open a converter to take the rule file to UTF-16 |
|
240 // |
|
241 UConverter* conv; |
|
242 conv = ucnv_open(encoding, &status); |
|
243 if (U_FAILURE(status)) { |
|
244 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); |
|
245 exit(status); |
|
246 } |
|
247 |
|
248 // |
|
249 // Convert the rules to UChar. |
|
250 // Preflight first to determine required buffer size. |
|
251 // |
|
252 uint32_t destCap = ucnv_toUChars(conv, |
|
253 NULL, // dest, |
|
254 0, // destCapacity, |
|
255 ruleSourceC, |
|
256 ruleFileSize, |
|
257 &status); |
|
258 if (status != U_BUFFER_OVERFLOW_ERROR) { |
|
259 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
|
260 exit(status); |
|
261 }; |
|
262 |
|
263 status = U_ZERO_ERROR; |
|
264 UChar *ruleSourceU = new UChar[destCap+1]; |
|
265 ucnv_toUChars(conv, |
|
266 ruleSourceU, // dest, |
|
267 destCap+1, |
|
268 ruleSourceC, |
|
269 ruleFileSize, |
|
270 &status); |
|
271 if (U_FAILURE(status)) { |
|
272 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
|
273 exit(status); |
|
274 }; |
|
275 ucnv_close(conv); |
|
276 |
|
277 |
|
278 // |
|
279 // Put the source rules into a UnicodeString |
|
280 // |
|
281 UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap); |
|
282 |
|
283 // |
|
284 // Create the break iterator from the rules |
|
285 // This will compile the rules. |
|
286 // |
|
287 UParseError parseError; |
|
288 parseError.line = 0; |
|
289 parseError.offset = 0; |
|
290 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status); |
|
291 if (U_FAILURE(status)) { |
|
292 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", |
|
293 u_errorName(status), (int)parseError.line, (int)parseError.offset); |
|
294 exit(status); |
|
295 }; |
|
296 |
|
297 |
|
298 // |
|
299 // Get the compiled rule data from the break iterator. |
|
300 // |
|
301 uint32_t outDataSize; |
|
302 const uint8_t *outData; |
|
303 outData = bi->getBinaryRules(outDataSize); |
|
304 |
|
305 // Copy the data format version numbers from the RBBI data header into the UDataMemory header. |
|
306 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion)); |
|
307 |
|
308 // |
|
309 // Create the output file |
|
310 // |
|
311 size_t bytesWritten; |
|
312 UNewDataMemory *pData; |
|
313 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); |
|
314 if(U_FAILURE(status)) { |
|
315 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", |
|
316 outFileName, u_errorName(status)); |
|
317 exit(status); |
|
318 } |
|
319 |
|
320 |
|
321 // Write the data itself. |
|
322 udata_writeBlock(pData, outData, outDataSize); |
|
323 // finish up |
|
324 bytesWritten = udata_finish(pData, &status); |
|
325 if(U_FAILURE(status)) { |
|
326 fprintf(stderr, "genbrk: error %d writing the output file\n", status); |
|
327 exit(status); |
|
328 } |
|
329 |
|
330 if (bytesWritten != outDataSize) { |
|
331 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); |
|
332 exit(-1); |
|
333 } |
|
334 |
|
335 delete bi; |
|
336 delete[] ruleSourceU; |
|
337 delete[] ruleBufferC; |
|
338 u_cleanup(); |
|
339 |
|
340 |
|
341 printf("genbrk: tool completed successfully.\n"); |
|
342 return 0; |
|
343 |
|
344 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
|
345 } |
|
346 |