intl/unicharutil/src/ucdata.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 * Copyright 1996, 1997, 1998 Computing Research Labs,
michael@0 3 * New Mexico State University
michael@0 4 *
michael@0 5 * Permission is hereby granted, free of charge, to any person obtaining a
michael@0 6 * copy of this software and associated documentation files (the "Software"),
michael@0 7 * to deal in the Software without restriction, including without limitation
michael@0 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
michael@0 9 * and/or sell copies of the Software, and to permit persons to whom the
michael@0 10 * Software is furnished to do so, subject to the following conditions:
michael@0 11 *
michael@0 12 * The above copyright notice and this permission notice shall be included in
michael@0 13 * all copies or substantial portions of the Software.
michael@0 14 *
michael@0 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
michael@0 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
michael@0 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
michael@0 18 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
michael@0 19 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
michael@0 20 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
michael@0 21 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
michael@0 22 */
michael@0 23 #ifndef lint
michael@0 24 #ifdef __GNUC__
michael@0 25 static char rcsid[] __attribute__ ((unused)) = "$Id: ucdata.c,v 1.1 1999/01/08 00:19:11 ftang%netscape.com Exp $";
michael@0 26 #else
michael@0 27 static char rcsid[] = "$Id: ucdata.c,v 1.1 1999/01/08 00:19:11 ftang%netscape.com Exp $";
michael@0 28 #endif
michael@0 29 #endif
michael@0 30
michael@0 31 #include <stdio.h>
michael@0 32 #include <stdlib.h>
michael@0 33 #include <string.h>
michael@0 34 #ifndef WIN32
michael@0 35 #include <unistd.h>
michael@0 36 #endif
michael@0 37
michael@0 38 #include "ucdata.h"
michael@0 39
michael@0 40 /**************************************************************************
michael@0 41 *
michael@0 42 * Miscellaneous types, data, and support functions.
michael@0 43 *
michael@0 44 **************************************************************************/
michael@0 45
/*
 * Header read from the start of every data file by the loaders below.
 *
 *   bom  - byte order mark; the loaders treat 0xfffe as "the file has the
 *          opposite byte order" and swap every value they read.
 *   cnt  - element count whose exact meaning depends on the table.
 *   size - either the number of bytes the loaders allocate for the table
 *          (bytes), or, for the case mapping file, the two section lengths
 *          (len[0] and len[1]).
 */
typedef struct {
    unsigned short bom;
    unsigned short cnt;
    union {
        unsigned long bytes;
        unsigned short len[2];
    } size;
} _ucheader_t;

/*
 * A simple array of 32-bit masks for lookup: masks32[i] has only bit i set.
 * The table is read-only, so it is declared const.
 */
static const unsigned long masks32[32] = {
    0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
    0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
    0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000,
    0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,
    0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000,
    0x40000000, 0x80000000
};
michael@0 66
/*
 * Byte order swapping helpers for 16-bit and 32-bit quantities.
 *
 * Every extracted byte is masked with 0xff, including the one that ends up
 * in the lowest position.  Without that mask a negative (signed) short is
 * sign extended by the right shift and corrupts the high byte of the result,
 * and an unsigned long wider than 32 bits leaks its upper bits into the
 * result.
 *
 * Both macros evaluate their argument more than once; do not pass
 * expressions with side effects.
 */
#define endian_short(cc) ((((cc) >> 8) & 0xff) | (((cc) & 0xff) << 8))
#define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\
                         ((((cc) >> 16) & 0xff) << 8)|(((cc) >> 24) & 0xff))
michael@0 70
/*
 * Search a colon separated list of directories for a file and open it.
 *
 *   paths    - directories separated by ':'; a null or empty list means
 *              nothing is searched.
 *   filename - name appended to each directory after a '/'; a null or empty
 *              name makes the function return 0 immediately.
 *   mode     - mode string handed to fopen().
 *
 * Returns the first stream fopen() manages to open, or 0 if no directory
 * yields one.  A directory/filename combination that does not fit in the
 * local path buffer is skipped instead of being written past the end of the
 * buffer.
 */
static FILE *
#ifdef __STDC__
_ucopenfile(char *paths, char *filename, char *mode)
#else
_ucopenfile(paths, filename, mode)
char *paths, *filename, *mode;
#endif
{
    FILE *f;
    char *dp, *end, path[BUFSIZ];
    size_t dlen, flen;

    if (filename == 0 || *filename == 0)
      return 0;

    flen = strlen(filename);

    dp = paths;
    while (dp && *dp) {
        /*
         * Locate the end of the current directory entry.
         */
        for (end = dp; *end && *end != ':'; end++) ;
        dlen = (size_t) (end - dp);

        /*
         * Only build the path if directory + '/' + filename + terminator
         * fits in the buffer.
         */
        if (dlen <= sizeof(path) - 2 && flen <= sizeof(path) - 2 - dlen) {
            memcpy(path, dp, dlen);
            path[dlen] = '/';
            memcpy(path + dlen + 1, filename, flen + 1);

            if ((f = fopen(path, mode)) != 0)
              return f;
        }

        /*
         * Step over the separator to the next entry.
         */
        dp = end;
        if (*dp == ':')
          dp++;
    }

    return 0;
}
michael@0 106
michael@0 107 /**************************************************************************
michael@0 108 *
michael@0 109 * Support for the character properties.
michael@0 110 *
michael@0 111 **************************************************************************/
michael@0 112
michael@0 113 static unsigned long _ucprop_size;
michael@0 114 static unsigned short *_ucprop_offsets;
michael@0 115 static unsigned long *_ucprop_ranges;
michael@0 116
michael@0 117 static void
michael@0 118 #ifdef __STDC__
michael@0 119 _ucprop_load(char *paths, int reload)
michael@0 120 #else
michael@0 121 _ucprop_load(paths, reload)
michael@0 122 char *paths;
michael@0 123 int reload;
michael@0 124 #endif
michael@0 125 {
michael@0 126 FILE *in;
michael@0 127 unsigned long size, i;
michael@0 128 _ucheader_t hdr;
michael@0 129
michael@0 130 if (_ucprop_size > 0) {
michael@0 131 if (!reload)
michael@0 132 /*
michael@0 133 * The character properties have already been loaded.
michael@0 134 */
michael@0 135 return;
michael@0 136
michael@0 137 /*
michael@0 138 * Unload the current character property data in preparation for
michael@0 139 * loading a new copy. Only the first array has to be deallocated
michael@0 140 * because all the memory for the arrays is allocated as a single
michael@0 141 * block.
michael@0 142 */
michael@0 143 free((char *) _ucprop_offsets);
michael@0 144 _ucprop_size = 0;
michael@0 145 }
michael@0 146
michael@0 147 if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0)
michael@0 148 return;
michael@0 149
michael@0 150 /*
michael@0 151 * Load the header.
michael@0 152 */
michael@0 153 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
michael@0 154
michael@0 155 if (hdr.bom == 0xfffe) {
michael@0 156 hdr.cnt = endian_short(hdr.cnt);
michael@0 157 hdr.size.bytes = endian_long(hdr.size.bytes);
michael@0 158 }
michael@0 159
michael@0 160 if ((_ucprop_size = hdr.cnt) == 0) {
michael@0 161 fclose(in);
michael@0 162 return;
michael@0 163 }
michael@0 164
michael@0 165 /*
michael@0 166 * Allocate all the storage needed for the lookup table.
michael@0 167 */
michael@0 168 _ucprop_offsets = (unsigned short *) malloc(hdr.size.bytes);
michael@0 169
michael@0 170 /*
michael@0 171 * Calculate the offset into the storage for the ranges. The offsets
michael@0 172 * array is on a 4-byte boundary and one larger than the value provided in
michael@0 173 * the header count field. This means the offset to the ranges must be
michael@0 174 * calculated after aligning the count to a 4-byte boundary.
michael@0 175 */
michael@0 176 if ((size = ((hdr.cnt + 1) * sizeof(unsigned short))) & 3)
michael@0 177 size += 4 - (size & 3);
michael@0 178 size >>= 1;
michael@0 179 _ucprop_ranges = (unsigned long *) (_ucprop_offsets + size);
michael@0 180
michael@0 181 /*
michael@0 182 * Load the offset array.
michael@0 183 */
michael@0 184 fread((char *) _ucprop_offsets, sizeof(unsigned short), size, in);
michael@0 185
michael@0 186 /*
michael@0 187 * Do an endian swap if necessary. Don't forget there is an extra node on
michael@0 188 * the end with the final index.
michael@0 189 */
michael@0 190 if (hdr.bom == 0xfffe) {
michael@0 191 for (i = 0; i <= _ucprop_size; i++)
michael@0 192 _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]);
michael@0 193 }
michael@0 194
michael@0 195 /*
michael@0 196 * Load the ranges. The number of elements is in the last array position
michael@0 197 * of the offsets.
michael@0 198 */
michael@0 199 fread((char *) _ucprop_ranges, sizeof(unsigned long),
michael@0 200 _ucprop_offsets[_ucprop_size], in);
michael@0 201
michael@0 202 fclose(in);
michael@0 203
michael@0 204 /*
michael@0 205 * Do an endian swap if necessary.
michael@0 206 */
michael@0 207 if (hdr.bom == 0xfffe) {
michael@0 208 for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++)
michael@0 209 _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]);
michael@0 210 }
michael@0 211 }
michael@0 212
michael@0 213 static void
michael@0 214 #ifdef __STDC__
michael@0 215 _ucprop_unload(void)
michael@0 216 #else
michael@0 217 _ucprop_unload()
michael@0 218 #endif
michael@0 219 {
michael@0 220 if (_ucprop_size == 0)
michael@0 221 return;
michael@0 222
michael@0 223 /*
michael@0 224 * Only need to free the offsets because the memory is allocated as a
michael@0 225 * single block.
michael@0 226 */
michael@0 227 free((char *) _ucprop_offsets);
michael@0 228 _ucprop_size = 0;
michael@0 229 }
michael@0 230
michael@0 231 static int
michael@0 232 #ifdef __STDC__
michael@0 233 _ucprop_lookup(unsigned long code, unsigned long n)
michael@0 234 #else
michael@0 235 _ucprop_lookup(code, n)
michael@0 236 unsigned long code, n;
michael@0 237 #endif
michael@0 238 {
michael@0 239 long l, r, m;
michael@0 240
michael@0 241 /*
michael@0 242 * There is an extra node on the end of the offsets to allow this routine
michael@0 243 * to work right. If the index is 0xffff, then there are no nodes for the
michael@0 244 * property.
michael@0 245 */
michael@0 246 if ((l = _ucprop_offsets[n]) == 0xffff)
michael@0 247 return 0;
michael@0 248
michael@0 249 /*
michael@0 250 * Locate the next offset that is not 0xffff. The sentinel at the end of
michael@0 251 * the array is the max index value.
michael@0 252 */
michael@0 253 for (m = 1;
michael@0 254 n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ;
michael@0 255
michael@0 256 r = _ucprop_offsets[n + m] - 1;
michael@0 257
michael@0 258 while (l <= r) {
michael@0 259 /*
michael@0 260 * Determine a "mid" point and adjust to make sure the mid point is at
michael@0 261 * the beginning of a range pair.
michael@0 262 */
michael@0 263 m = (l + r) >> 1;
michael@0 264 m -= (m & 1);
michael@0 265 if (code > _ucprop_ranges[m + 1])
michael@0 266 l = m + 2;
michael@0 267 else if (code < _ucprop_ranges[m])
michael@0 268 r = m - 2;
michael@0 269 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
michael@0 270 return 1;
michael@0 271 }
michael@0 272 return 0;
michael@0 273 }
michael@0 274
michael@0 275 int
michael@0 276 #ifdef __STDC__
michael@0 277 ucisprop(unsigned long code, unsigned long mask1, unsigned long mask2)
michael@0 278 #else
michael@0 279 ucisprop(code, mask1, mask2)
michael@0 280 unsigned long code, mask1, mask2;
michael@0 281 #endif
michael@0 282 {
michael@0 283 unsigned long i;
michael@0 284
michael@0 285 if (mask1 == 0 && mask2 == 0)
michael@0 286 return 0;
michael@0 287
michael@0 288 for (i = 0; mask1 && i < 32; i++) {
michael@0 289 if ((mask1 & masks32[i]) && _ucprop_lookup(code, i))
michael@0 290 return 1;
michael@0 291 }
michael@0 292
michael@0 293 for (i = 32; mask2 && i < _ucprop_size; i++) {
michael@0 294 if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i))
michael@0 295 return 1;
michael@0 296 }
michael@0 297
michael@0 298 return 0;
michael@0 299 }
michael@0 300
michael@0 301 /**************************************************************************
michael@0 302 *
michael@0 303 * Support for case mapping.
michael@0 304 *
michael@0 305 **************************************************************************/
michael@0 306
michael@0 307 static unsigned long _uccase_size;
michael@0 308 static unsigned short _uccase_len[2];
michael@0 309 static unsigned long *_uccase_map;
michael@0 310
michael@0 311 static void
michael@0 312 #ifdef __STDC__
michael@0 313 _uccase_load(char *paths, int reload)
michael@0 314 #else
michael@0 315 _uccase_load(paths, reload)
michael@0 316 char *paths;
michael@0 317 int reload;
michael@0 318 #endif
michael@0 319 {
michael@0 320 FILE *in;
michael@0 321 unsigned long i;
michael@0 322 _ucheader_t hdr;
michael@0 323
michael@0 324 if (_uccase_size > 0) {
michael@0 325 if (!reload)
michael@0 326 /*
michael@0 327 * The case mappings have already been loaded.
michael@0 328 */
michael@0 329 return;
michael@0 330
michael@0 331 free((char *) _uccase_map);
michael@0 332 _uccase_size = 0;
michael@0 333 }
michael@0 334
michael@0 335 if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0)
michael@0 336 return;
michael@0 337
michael@0 338 /*
michael@0 339 * Load the header.
michael@0 340 */
michael@0 341 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
michael@0 342
michael@0 343 if (hdr.bom == 0xfffe) {
michael@0 344 hdr.cnt = endian_short(hdr.cnt);
michael@0 345 hdr.size.len[0] = endian_short(hdr.size.len[0]);
michael@0 346 hdr.size.len[1] = endian_short(hdr.size.len[1]);
michael@0 347 }
michael@0 348
michael@0 349 /*
michael@0 350 * Set the node count and lengths of the upper and lower case mapping
michael@0 351 * tables.
michael@0 352 */
michael@0 353 _uccase_size = hdr.cnt * 3;
michael@0 354 _uccase_len[0] = hdr.size.len[0] * 3;
michael@0 355 _uccase_len[1] = hdr.size.len[1] * 3;
michael@0 356
michael@0 357 _uccase_map = (unsigned long *)
michael@0 358 malloc(_uccase_size * sizeof(unsigned long));
michael@0 359
michael@0 360 /*
michael@0 361 * Load the case mapping table.
michael@0 362 */
michael@0 363 fread((char *) _uccase_map, sizeof(unsigned long), _uccase_size, in);
michael@0 364
michael@0 365 /*
michael@0 366 * Do an endian swap if necessary.
michael@0 367 */
michael@0 368 if (hdr.bom == 0xfffe) {
michael@0 369 for (i = 0; i < _uccase_size; i++)
michael@0 370 _uccase_map[i] = endian_long(_uccase_map[i]);
michael@0 371 }
michael@0 372 }
michael@0 373
michael@0 374 static void
michael@0 375 #ifdef __STDC__
michael@0 376 _uccase_unload(void)
michael@0 377 #else
michael@0 378 _uccase_unload()
michael@0 379 #endif
michael@0 380 {
michael@0 381 if (_uccase_size == 0)
michael@0 382 return;
michael@0 383
michael@0 384 free((char *) _uccase_map);
michael@0 385 _uccase_size = 0;
michael@0 386 }
michael@0 387
michael@0 388 static unsigned long
michael@0 389 #ifdef __STDC__
michael@0 390 _uccase_lookup(unsigned long code, long l, long r, int field)
michael@0 391 #else
michael@0 392 _uccase_lookup(code, l, r, field)
michael@0 393 unsigned long code;
michael@0 394 long l, r;
michael@0 395 int field;
michael@0 396 #endif
michael@0 397 {
michael@0 398 long m;
michael@0 399
michael@0 400 /*
michael@0 401 * Do the binary search.
michael@0 402 */
michael@0 403 while (l <= r) {
michael@0 404 /*
michael@0 405 * Determine a "mid" point and adjust to make sure the mid point is at
michael@0 406 * the beginning of a case mapping triple.
michael@0 407 */
michael@0 408 m = (l + r) >> 1;
michael@0 409 m -= (m % 3);
michael@0 410 if (code > _uccase_map[m])
michael@0 411 l = m + 3;
michael@0 412 else if (code < _uccase_map[m])
michael@0 413 r = m - 3;
michael@0 414 else if (code == _uccase_map[m])
michael@0 415 return _uccase_map[m + field];
michael@0 416 }
michael@0 417
michael@0 418 return code;
michael@0 419 }
michael@0 420
michael@0 421 unsigned long
michael@0 422 #ifdef __STDC__
michael@0 423 uctoupper(unsigned long code)
michael@0 424 #else
michael@0 425 uctoupper(code)
michael@0 426 unsigned long code;
michael@0 427 #endif
michael@0 428 {
michael@0 429 int field;
michael@0 430 long l, r;
michael@0 431
michael@0 432 if (ucisupper(code))
michael@0 433 return code;
michael@0 434
michael@0 435 if (ucislower(code)) {
michael@0 436 /*
michael@0 437 * The character is lower case.
michael@0 438 */
michael@0 439 field = 1;
michael@0 440 l = _uccase_len[0];
michael@0 441 r = (l + _uccase_len[1]) - 1;
michael@0 442 } else {
michael@0 443 /*
michael@0 444 * The character is title case.
michael@0 445 */
michael@0 446 field = 2;
michael@0 447 l = _uccase_len[0] + _uccase_len[1];
michael@0 448 r = _uccase_size - 1;
michael@0 449 }
michael@0 450 return _uccase_lookup(code, l, r, field);
michael@0 451 }
michael@0 452
michael@0 453 unsigned long
michael@0 454 #ifdef __STDC__
michael@0 455 uctolower(unsigned long code)
michael@0 456 #else
michael@0 457 uctolower(code)
michael@0 458 unsigned long code;
michael@0 459 #endif
michael@0 460 {
michael@0 461 int field;
michael@0 462 long l, r;
michael@0 463
michael@0 464 if (ucislower(code))
michael@0 465 return code;
michael@0 466
michael@0 467 if (ucisupper(code)) {
michael@0 468 /*
michael@0 469 * The character is upper case.
michael@0 470 */
michael@0 471 field = 1;
michael@0 472 l = 0;
michael@0 473 r = _uccase_len[0] - 1;
michael@0 474 } else {
michael@0 475 /*
michael@0 476 * The character is title case.
michael@0 477 */
michael@0 478 field = 2;
michael@0 479 l = _uccase_len[0] + _uccase_len[1];
michael@0 480 r = _uccase_size - 1;
michael@0 481 }
michael@0 482 return _uccase_lookup(code, l, r, field);
michael@0 483 }
michael@0 484
michael@0 485 unsigned long
michael@0 486 #ifdef __STDC__
michael@0 487 uctotitle(unsigned long code)
michael@0 488 #else
michael@0 489 uctotitle(code)
michael@0 490 unsigned long code;
michael@0 491 #endif
michael@0 492 {
michael@0 493 int field;
michael@0 494 long l, r;
michael@0 495
michael@0 496 if (ucistitle(code))
michael@0 497 return code;
michael@0 498
michael@0 499 /*
michael@0 500 * The offset will always be the same for converting to title case.
michael@0 501 */
michael@0 502 field = 2;
michael@0 503
michael@0 504 if (ucisupper(code)) {
michael@0 505 /*
michael@0 506 * The character is upper case.
michael@0 507 */
michael@0 508 l = 0;
michael@0 509 r = _uccase_len[0] - 1;
michael@0 510 } else {
michael@0 511 /*
michael@0 512 * The character is lower case.
michael@0 513 */
michael@0 514 l = _uccase_len[0];
michael@0 515 r = (l + _uccase_len[1]) - 1;
michael@0 516 }
michael@0 517 return _uccase_lookup(code, l, r, field);
michael@0 518 }
michael@0 519
michael@0 520 /**************************************************************************
michael@0 521 *
michael@0 522 * Support for decompositions.
michael@0 523 *
michael@0 524 **************************************************************************/
michael@0 525
michael@0 526 static unsigned long _ucdcmp_size;
michael@0 527 static unsigned long *_ucdcmp_nodes;
michael@0 528 static unsigned long *_ucdcmp_decomp;
michael@0 529
michael@0 530 static void
michael@0 531 #ifdef __STDC__
michael@0 532 _ucdcmp_load(char *paths, int reload)
michael@0 533 #else
michael@0 534 _ucdcmp_load(paths, reload)
michael@0 535 char *paths;
michael@0 536 int reload;
michael@0 537 #endif
michael@0 538 {
michael@0 539 FILE *in;
michael@0 540 unsigned long size, i;
michael@0 541 _ucheader_t hdr;
michael@0 542
michael@0 543 if (_ucdcmp_size > 0) {
michael@0 544 if (!reload)
michael@0 545 /*
michael@0 546 * The decompositions have already been loaded.
michael@0 547 */
michael@0 548 return;
michael@0 549
michael@0 550 free((char *) _ucdcmp_nodes);
michael@0 551 _ucdcmp_size = 0;
michael@0 552 }
michael@0 553
michael@0 554 if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0)
michael@0 555 return;
michael@0 556
michael@0 557 /*
michael@0 558 * Load the header.
michael@0 559 */
michael@0 560 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
michael@0 561
michael@0 562 if (hdr.bom == 0xfffe) {
michael@0 563 hdr.cnt = endian_short(hdr.cnt);
michael@0 564 hdr.size.bytes = endian_long(hdr.size.bytes);
michael@0 565 }
michael@0 566
michael@0 567 _ucdcmp_size = hdr.cnt << 1;
michael@0 568 _ucdcmp_nodes = (unsigned long *) malloc(hdr.size.bytes);
michael@0 569 _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1);
michael@0 570
michael@0 571 /*
michael@0 572 * Read the decomposition data in.
michael@0 573 */
michael@0 574 size = hdr.size.bytes / sizeof(unsigned long);
michael@0 575 fread((char *) _ucdcmp_nodes, sizeof(unsigned long), size, in);
michael@0 576
michael@0 577 /*
michael@0 578 * Do an endian swap if necessary.
michael@0 579 */
michael@0 580 if (hdr.bom == 0xfffe) {
michael@0 581 for (i = 0; i < size; i++)
michael@0 582 _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]);
michael@0 583 }
michael@0 584 }
michael@0 585
michael@0 586 static void
michael@0 587 #ifdef __STDC__
michael@0 588 _ucdcmp_unload(void)
michael@0 589 #else
michael@0 590 _ucdcmp_unload()
michael@0 591 #endif
michael@0 592 {
michael@0 593 if (_ucdcmp_size == 0)
michael@0 594 return;
michael@0 595
michael@0 596 /*
michael@0 597 * Only need to free the offsets because the memory is allocated as a
michael@0 598 * single block.
michael@0 599 */
michael@0 600 free((char *) _ucdcmp_nodes);
michael@0 601 _ucdcmp_size = 0;
michael@0 602 }
michael@0 603
michael@0 604 int
michael@0 605 #ifdef __STDC__
michael@0 606 ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
michael@0 607 #else
michael@0 608 ucdecomp(code, num, decomp)
michael@0 609 unsigned long code, *num, **decomp;
michael@0 610 #endif
michael@0 611 {
michael@0 612 long l, r, m;
michael@0 613
michael@0 614 l = 0;
michael@0 615 r = _ucdcmp_nodes[_ucdcmp_size] - 1;
michael@0 616
michael@0 617 while (l <= r) {
michael@0 618 /*
michael@0 619 * Determine a "mid" point and adjust to make sure the mid point is at
michael@0 620 * the beginning of a code+offset pair.
michael@0 621 */
michael@0 622 m = (l + r) >> 1;
michael@0 623 m -= (m & 1);
michael@0 624 if (code > _ucdcmp_nodes[m])
michael@0 625 l = m + 2;
michael@0 626 else if (code < _ucdcmp_nodes[m])
michael@0 627 r = m - 2;
michael@0 628 else if (code == _ucdcmp_nodes[m]) {
michael@0 629 *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1];
michael@0 630 *decomp = &_ucdcmp_decomp[_ucdcmp_nodes[m + 1]];
michael@0 631 return 1;
michael@0 632 }
michael@0 633 }
michael@0 634 return 0;
michael@0 635 }
michael@0 636
/*
 * Decompose a Hangul syllable arithmetically.
 *
 *   num    - receives the number of values written to decomp (2 or 3).
 *   decomp - receives the decomposition; all three elements are written.
 *
 * Returns 0 without touching the outputs if ucishangul() rejects the code,
 * otherwise 1.
 */
int
#ifdef __STDC__
ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[])
#else
ucdecomp_hangul(code, num, decomp)
unsigned long code, *num, decomp[];
#endif
{
    unsigned long index, trail;

    if (!ucishangul(code))
      return 0;

    /*
     * Offset of the syllable from 0xac00; 588 syllables share a first
     * component and 28 share a second one.
     */
    index = code - 0xac00;
    trail = index % 28;

    decomp[0] = 0x1100 + (index / 588);
    decomp[1] = 0x1161 + ((index % 588) / 28);
    decomp[2] = 0x11a7 + trail;

    /*
     * A third component equal to the base 0x11a7 is not counted.
     */
    if (trail != 0)
      *num = 3;
    else
      *num = 2;

    return 1;
}
michael@0 656
michael@0 657 /**************************************************************************
michael@0 658 *
michael@0 659 * Support for combining classes.
michael@0 660 *
michael@0 661 **************************************************************************/
michael@0 662
michael@0 663 static unsigned long _uccmcl_size;
michael@0 664 static unsigned long *_uccmcl_nodes;
michael@0 665
michael@0 666 static void
michael@0 667 #ifdef __STDC__
michael@0 668 _uccmcl_load(char *paths, int reload)
michael@0 669 #else
michael@0 670 _uccmcl_load(paths, reload)
michael@0 671 char *paths;
michael@0 672 int reload;
michael@0 673 #endif
michael@0 674 {
michael@0 675 FILE *in;
michael@0 676 unsigned long i;
michael@0 677 _ucheader_t hdr;
michael@0 678
michael@0 679 if (_uccmcl_size > 0) {
michael@0 680 if (!reload)
michael@0 681 /*
michael@0 682 * The combining classes have already been loaded.
michael@0 683 */
michael@0 684 return;
michael@0 685
michael@0 686 free((char *) _uccmcl_nodes);
michael@0 687 _uccmcl_size = 0;
michael@0 688 }
michael@0 689
michael@0 690 if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0)
michael@0 691 return;
michael@0 692
michael@0 693 /*
michael@0 694 * Load the header.
michael@0 695 */
michael@0 696 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
michael@0 697
michael@0 698 if (hdr.bom == 0xfffe) {
michael@0 699 hdr.cnt = endian_short(hdr.cnt);
michael@0 700 hdr.size.bytes = endian_long(hdr.size.bytes);
michael@0 701 }
michael@0 702
michael@0 703 _uccmcl_size = hdr.cnt * 3;
michael@0 704 _uccmcl_nodes = (unsigned long *) malloc(hdr.size.bytes);
michael@0 705
michael@0 706 /*
michael@0 707 * Read the combining classes in.
michael@0 708 */
michael@0 709 fread((char *) _uccmcl_nodes, sizeof(unsigned long), _uccmcl_size, in);
michael@0 710
michael@0 711 /*
michael@0 712 * Do an endian swap if necessary.
michael@0 713 */
michael@0 714 if (hdr.bom == 0xfffe) {
michael@0 715 for (i = 0; i < _uccmcl_size; i++)
michael@0 716 _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]);
michael@0 717 }
michael@0 718 }
michael@0 719
michael@0 720 static void
michael@0 721 #ifdef __STDC__
michael@0 722 _uccmcl_unload(void)
michael@0 723 #else
michael@0 724 _uccmcl_unload()
michael@0 725 #endif
michael@0 726 {
michael@0 727 if (_uccmcl_size == 0)
michael@0 728 return;
michael@0 729
michael@0 730 free((char *) _uccmcl_nodes);
michael@0 731 _uccmcl_size = 0;
michael@0 732 }
michael@0 733
michael@0 734 unsigned long
michael@0 735 #ifdef __STDC__
michael@0 736 uccombining_class(unsigned long code)
michael@0 737 #else
michael@0 738 uccombining_class(code)
michael@0 739 unsigned long code;
michael@0 740 #endif
michael@0 741 {
michael@0 742 long l, r, m;
michael@0 743
michael@0 744 l = 0;
michael@0 745 r = _uccmcl_size - 1;
michael@0 746
michael@0 747 while (l <= r) {
michael@0 748 m = (l + r) >> 1;
michael@0 749 m -= (m % 3);
michael@0 750 if (code > _uccmcl_nodes[m + 1])
michael@0 751 l = m + 3;
michael@0 752 else if (code < _uccmcl_nodes[m])
michael@0 753 r = m - 3;
michael@0 754 else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1])
michael@0 755 return _uccmcl_nodes[m + 2];
michael@0 756 }
michael@0 757 return 0;
michael@0 758 }
michael@0 759
michael@0 760 /**************************************************************************
michael@0 761 *
michael@0 762 * Support for numeric values.
michael@0 763 *
michael@0 764 **************************************************************************/
michael@0 765
michael@0 766 static unsigned long *_ucnum_nodes;
michael@0 767 static unsigned long _ucnum_size;
michael@0 768 static short *_ucnum_vals;
michael@0 769
michael@0 770 static void
michael@0 771 #ifdef __STDC__
michael@0 772 _ucnumb_load(char *paths, int reload)
michael@0 773 #else
michael@0 774 _ucnumb_load(paths, reload)
michael@0 775 char *paths;
michael@0 776 int reload;
michael@0 777 #endif
michael@0 778 {
michael@0 779 FILE *in;
michael@0 780 unsigned long size, i;
michael@0 781 _ucheader_t hdr;
michael@0 782
michael@0 783 if (_ucnum_size > 0) {
michael@0 784 if (!reload)
michael@0 785 /*
michael@0 786 * The numbers have already been loaded.
michael@0 787 */
michael@0 788 return;
michael@0 789
michael@0 790 free((char *) _ucnum_nodes);
michael@0 791 _ucnum_size = 0;
michael@0 792 }
michael@0 793
michael@0 794 if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0)
michael@0 795 return;
michael@0 796
michael@0 797 /*
michael@0 798 * Load the header.
michael@0 799 */
michael@0 800 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
michael@0 801
michael@0 802 if (hdr.bom == 0xfffe) {
michael@0 803 hdr.cnt = endian_short(hdr.cnt);
michael@0 804 hdr.size.bytes = endian_long(hdr.size.bytes);
michael@0 805 }
michael@0 806
michael@0 807 _ucnum_size = hdr.cnt;
michael@0 808 _ucnum_nodes = (unsigned long *) malloc(hdr.size.bytes);
michael@0 809 _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size);
michael@0 810
michael@0 811 /*
michael@0 812 * Read the combining classes in.
michael@0 813 */
michael@0 814 fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in);
michael@0 815
michael@0 816 /*
michael@0 817 * Do an endian swap if necessary.
michael@0 818 */
michael@0 819 if (hdr.bom == 0xfffe) {
michael@0 820 for (i = 0; i < _ucnum_size; i++)
michael@0 821 _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]);
michael@0 822
michael@0 823 /*
michael@0 824 * Determine the number of values that have to be adjusted.
michael@0 825 */
michael@0 826 size = (hdr.size.bytes -
michael@0 827 (_ucnum_size * (sizeof(unsigned long) << 1))) /
michael@0 828 sizeof(short);
michael@0 829
michael@0 830 for (i = 0; i < size; i++)
michael@0 831 _ucnum_vals[i] = endian_short(_ucnum_vals[i]);
michael@0 832 }
michael@0 833 }
michael@0 834
michael@0 835 static void
michael@0 836 #ifdef __STDC__
michael@0 837 _ucnumb_unload(void)
michael@0 838 #else
michael@0 839 _ucnumb_unload()
michael@0 840 #endif
michael@0 841 {
michael@0 842 if (_ucnum_size == 0)
michael@0 843 return;
michael@0 844
michael@0 845 free((char *) _ucnum_nodes);
michael@0 846 _ucnum_size = 0;
michael@0 847 }
michael@0 848
michael@0 849 int
michael@0 850 #ifdef __STDC__
michael@0 851 ucnumber_lookup(unsigned long code, struct ucnumber *num)
michael@0 852 #else
michael@0 853 ucnumber_lookup(code, num)
michael@0 854 unsigned long code;
michael@0 855 struct ucnumber *num;
michael@0 856 #endif
michael@0 857 {
michael@0 858 long l, r, m;
michael@0 859 short *vp;
michael@0 860
michael@0 861 l = 0;
michael@0 862 r = _ucnum_size - 1;
michael@0 863 while (l <= r) {
michael@0 864 /*
michael@0 865 * Determine a "mid" point and adjust to make sure the mid point is at
michael@0 866 * the beginning of a code+offset pair.
michael@0 867 */
michael@0 868 m = (l + r) >> 1;
michael@0 869 m -= (m & 1);
michael@0 870 if (code > _ucnum_nodes[m])
michael@0 871 l = m + 2;
michael@0 872 else if (code < _ucnum_nodes[m])
michael@0 873 r = m - 2;
michael@0 874 else {
michael@0 875 vp = _ucnum_vals + _ucnum_nodes[m + 1];
michael@0 876 num->numerator = (int) *vp++;
michael@0 877 num->denominator = (int) *vp;
michael@0 878 return 1;
michael@0 879 }
michael@0 880 }
michael@0 881 return 0;
michael@0 882 }
michael@0 883
michael@0 884 int
michael@0 885 #ifdef __STDC__
michael@0 886 ucdigit_lookup(unsigned long code, int *digit)
michael@0 887 #else
michael@0 888 ucdigit_lookup(code, digit)
michael@0 889 unsigned long code;
michael@0 890 int *digit;
michael@0 891 #endif
michael@0 892 {
michael@0 893 long l, r, m;
michael@0 894 short *vp;
michael@0 895
michael@0 896 l = 0;
michael@0 897 r = _ucnum_size - 1;
michael@0 898 while (l <= r) {
michael@0 899 /*
michael@0 900 * Determine a "mid" point and adjust to make sure the mid point is at
michael@0 901 * the beginning of a code+offset pair.
michael@0 902 */
michael@0 903 m = (l + r) >> 1;
michael@0 904 m -= (m & 1);
michael@0 905 if (code > _ucnum_nodes[m])
michael@0 906 l = m + 2;
michael@0 907 else if (code < _ucnum_nodes[m])
michael@0 908 r = m - 2;
michael@0 909 else {
michael@0 910 vp = _ucnum_vals + _ucnum_nodes[m + 1];
michael@0 911 if (*vp == *(vp + 1)) {
michael@0 912 *digit = *vp;
michael@0 913 return 1;
michael@0 914 }
michael@0 915 return 0;
michael@0 916 }
michael@0 917 }
michael@0 918 return 0;
michael@0 919 }
michael@0 920
michael@0 921 struct ucnumber
michael@0 922 #ifdef __STDC__
michael@0 923 ucgetnumber(unsigned long code)
michael@0 924 #else
michael@0 925 ucgetnumber(code)
michael@0 926 unsigned long code;
michael@0 927 #endif
michael@0 928 {
michael@0 929 struct ucnumber num;
michael@0 930
michael@0 931 /*
michael@0 932 * Initialize with some arbitrary value, because the caller simply cannot
michael@0 933 * tell for sure if the code is a number without calling the ucisnumber()
michael@0 934 * macro before calling this function.
michael@0 935 */
michael@0 936 num.numerator = num.denominator = -111;
michael@0 937
michael@0 938 (void) ucnumber_lookup(code, &num);
michael@0 939
michael@0 940 return num;
michael@0 941 }
michael@0 942
/*
 * Convenience wrapper around ucdigit_lookup() that returns the digit by
 * value.
 *
 * The result is preset to -111 and stays that way when the lookup reports
 * no digit; the result of the lookup itself is not reported.
 */
int
#ifdef __STDC__
ucgetdigit(unsigned long code)
#else
ucgetdigit(code)
unsigned long code;
#endif
{
    /*
     * Arbitrary placeholder for "no digit found".
     */
    int value = -111;

    (void) ucdigit_lookup(code, &value);

    return value;
}
michael@0 964
michael@0 965 /**************************************************************************
michael@0 966 *
michael@0 967 * Setup and cleanup routines.
michael@0 968 *
michael@0 969 **************************************************************************/
michael@0 970
michael@0 971 void
michael@0 972 #ifdef __STDC__
michael@0 973 ucdata_load(char *paths, int masks)
michael@0 974 #else
michael@0 975 ucdata_load(paths, masks)
michael@0 976 char *paths;
michael@0 977 int masks;
michael@0 978 #endif
michael@0 979 {
michael@0 980 if (masks & UCDATA_CTYPE)
michael@0 981 _ucprop_load(paths, 0);
michael@0 982 if (masks & UCDATA_CASE)
michael@0 983 _uccase_load(paths, 0);
michael@0 984 if (masks & UCDATA_DECOMP)
michael@0 985 _ucdcmp_load(paths, 0);
michael@0 986 if (masks & UCDATA_CMBCL)
michael@0 987 _uccmcl_load(paths, 0);
michael@0 988 if (masks & UCDATA_NUM)
michael@0 989 _ucnumb_load(paths, 0);
michael@0 990 }
michael@0 991
michael@0 992 void
michael@0 993 #ifdef __STDC__
michael@0 994 ucdata_unload(int masks)
michael@0 995 #else
michael@0 996 ucdata_unload(masks)
michael@0 997 int masks;
michael@0 998 #endif
michael@0 999 {
michael@0 1000 if (masks & UCDATA_CTYPE)
michael@0 1001 _ucprop_unload();
michael@0 1002 if (masks & UCDATA_CASE)
michael@0 1003 _uccase_unload();
michael@0 1004 if (masks & UCDATA_DECOMP)
michael@0 1005 _ucdcmp_unload();
michael@0 1006 if (masks & UCDATA_CMBCL)
michael@0 1007 _uccmcl_unload();
michael@0 1008 if (masks & UCDATA_NUM)
michael@0 1009 _ucnumb_unload();
michael@0 1010 }
michael@0 1011
michael@0 1012 void
michael@0 1013 #ifdef __STDC__
michael@0 1014 ucdata_reload(char *paths, int masks)
michael@0 1015 #else
michael@0 1016 ucdata_reload(paths, masks)
michael@0 1017 char *paths;
michael@0 1018 int masks;
michael@0 1019 #endif
michael@0 1020 {
michael@0 1021 if (masks & UCDATA_CTYPE)
michael@0 1022 _ucprop_load(paths, 1);
michael@0 1023 if (masks & UCDATA_CASE)
michael@0 1024 _uccase_load(paths, 1);
michael@0 1025 if (masks & UCDATA_DECOMP)
michael@0 1026 _ucdcmp_load(paths, 1);
michael@0 1027 if (masks & UCDATA_CMBCL)
michael@0 1028 _uccmcl_load(paths, 1);
michael@0 1029 if (masks & UCDATA_NUM)
michael@0 1030 _ucnumb_load(paths, 1);
michael@0 1031 }
michael@0 1032
michael@0 1033 #ifdef TEST
michael@0 1034
michael@0 1035 void
michael@0 1036 #ifdef __STDC__
michael@0 1037 main(void)
michael@0 1038 #else
michael@0 1039 main()
michael@0 1040 #endif
michael@0 1041 {
michael@0 1042 int dig;
michael@0 1043 unsigned long i, lo, *dec;
michael@0 1044 struct ucnumber num;
michael@0 1045
michael@0 1046 ucdata_setup(".");
michael@0 1047
michael@0 1048 if (ucisweak(0x30))
michael@0 1049 printf("WEAK\n");
michael@0 1050 else
michael@0 1051 printf("NOT WEAK\n");
michael@0 1052
michael@0 1053 printf("LOWER 0x%04lX\n", uctolower(0xff3a));
michael@0 1054 printf("UPPER 0x%04lX\n", uctoupper(0xff5a));
michael@0 1055
michael@0 1056 if (ucisalpha(0x1d5))
michael@0 1057 printf("ALPHA\n");
michael@0 1058 else
michael@0 1059 printf("NOT ALPHA\n");
michael@0 1060
michael@0 1061 if (ucisupper(0x1d5)) {
michael@0 1062 printf("UPPER\n");
michael@0 1063 lo = uctolower(0x1d5);
michael@0 1064 printf("0x%04lx\n", lo);
michael@0 1065 lo = uctotitle(0x1d5);
michael@0 1066 printf("0x%04lx\n", lo);
michael@0 1067 } else
michael@0 1068 printf("NOT UPPER\n");
michael@0 1069
michael@0 1070 if (ucistitle(0x1d5))
michael@0 1071 printf("TITLE\n");
michael@0 1072 else
michael@0 1073 printf("NOT TITLE\n");
michael@0 1074
michael@0 1075 if (uciscomposite(0x1d5))
michael@0 1076 printf("COMPOSITE\n");
michael@0 1077 else
michael@0 1078 printf("NOT COMPOSITE\n");
michael@0 1079
michael@0 1080 if (ucdecomp(0x1d5, &lo, &dec)) {
michael@0 1081 for (i = 0; i < lo; i++)
michael@0 1082 printf("0x%04lx ", dec[i]);
michael@0 1083 putchar('\n');
michael@0 1084 }
michael@0 1085
michael@0 1086 if ((lo = uccombining_class(0x41)) != 0)
michael@0 1087 printf("0x41 CCL %ld\n", lo);
michael@0 1088
michael@0 1089 if (ucisxdigit(0xfeff))
michael@0 1090 printf("0xFEFF HEX DIGIT\n");
michael@0 1091 else
michael@0 1092 printf("0xFEFF NOT HEX DIGIT\n");
michael@0 1093
michael@0 1094 if (ucisdefined(0x10000))
michael@0 1095 printf("0x10000 DEFINED\n");
michael@0 1096 else
michael@0 1097 printf("0x10000 NOT DEFINED\n");
michael@0 1098
michael@0 1099 if (ucnumber_lookup(0x30, &num)) {
michael@0 1100 if (num.numerator != num.denominator)
michael@0 1101 printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
michael@0 1102 else
michael@0 1103 printf("UCNUMBER: 0x30 = %d\n", num.numerator);
michael@0 1104 } else
michael@0 1105 printf("UCNUMBER: 0x30 NOT A NUMBER\n");
michael@0 1106
michael@0 1107 if (ucnumber_lookup(0xbc, &num)) {
michael@0 1108 if (num.numerator != num.denominator)
michael@0 1109 printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
michael@0 1110 else
michael@0 1111 printf("UCNUMBER: 0xbc = %d\n", num.numerator);
michael@0 1112 } else
michael@0 1113 printf("UCNUMBER: 0xbc NOT A NUMBER\n");
michael@0 1114
michael@0 1115
michael@0 1116 if (ucnumber_lookup(0xff19, &num)) {
michael@0 1117 if (num.numerator != num.denominator)
michael@0 1118 printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
michael@0 1119 else
michael@0 1120 printf("UCNUMBER: 0xff19 = %d\n", num.numerator);
michael@0 1121 } else
michael@0 1122 printf("UCNUMBER: 0xff19 NOT A NUMBER\n");
michael@0 1123
michael@0 1124 if (ucnumber_lookup(0x4e00, &num)) {
michael@0 1125 if (num.numerator != num.denominator)
michael@0 1126 printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator);
michael@0 1127 else
michael@0 1128 printf("UCNUMBER: 0x4e00 = %d\n", num.numerator);
michael@0 1129 } else
michael@0 1130 printf("UCNUMBER: 0x4e00 NOT A NUMBER\n");
michael@0 1131
michael@0 1132 if (ucdigit_lookup(0x06f9, &dig))
michael@0 1133 printf("UCDIGIT: 0x6f9 = %d\n", dig);
michael@0 1134 else
michael@0 1135 printf("UCDIGIT: 0x6f9 NOT A NUMBER\n");
michael@0 1136
michael@0 1137 dig = ucgetdigit(0x0969);
michael@0 1138 printf("UCGETDIGIT: 0x969 = %d\n", dig);
michael@0 1139
michael@0 1140 num = ucgetnumber(0x30);
michael@0 1141 if (num.numerator != num.denominator)
michael@0 1142 printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
michael@0 1143 else
michael@0 1144 printf("UCGETNUMBER: 0x30 = %d\n", num.numerator);
michael@0 1145
michael@0 1146 num = ucgetnumber(0xbc);
michael@0 1147 if (num.numerator != num.denominator)
michael@0 1148 printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
michael@0 1149 else
michael@0 1150 printf("UCGETNUMBER: 0xbc = %d\n", num.numerator);
michael@0 1151
michael@0 1152 num = ucgetnumber(0xff19);
michael@0 1153 if (num.numerator != num.denominator)
michael@0 1154 printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
michael@0 1155 else
michael@0 1156 printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator);
michael@0 1157
michael@0 1158 ucdata_cleanup();
michael@0 1159 exit(0);
michael@0 1160 }
michael@0 1161
michael@0 1162 #endif /* TEST */

mercurial