intl/icu/source/i18n/ucol_imp.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/ucol_imp.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1093 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 1998-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*
    1.12 +* Private implementation header for C collation
    1.13 +*   file name:  ucol_imp.h
    1.14 +*   encoding:   US-ASCII
    1.15 +*   tab size:   8 (not used)
    1.16 +*   indentation:4
    1.17 +*
    1.18 +*   created on: 2000dec11
    1.19 +*   created by: Vladimir Weinstein
    1.20 +*
    1.21 +* Modification history
    1.22 +* Date        Name      Comments
    1.23 +* 02/16/2001  synwee    Added UCOL_GETPREVCE for the use in ucoleitr
    1.24 +* 02/27/2001  synwee    Added getMaxExpansion data structure in UCollator
    1.25 +* 03/02/2001  synwee    Added UCOL_IMPLICIT_CE
    1.26 +* 03/12/2001  synwee    Added pointer start to collIterate.
    1.27 +*/
    1.28 +
    1.29 +#ifndef UCOL_IMP_H
    1.30 +#define UCOL_IMP_H
    1.31 +
    1.32 +#include "unicode/utypes.h"
    1.33 +#ifdef __cplusplus
    1.34 +#   include "unicode/utf16.h"
    1.35 +#endif
    1.36 +
    1.37 +#define UCA_DATA_TYPE "icu"      /* together with UCA_DATA_NAME this spells "ucadata.icu", the file named in the format notes below */
    1.38 +#define UCA_DATA_NAME "ucadata"
    1.39 +#define INVC_DATA_TYPE "icu"     /* type string paired with INVC_DATA_NAME */
    1.40 +#define INVC_DATA_NAME "invuca"  /* presumably the inverse UCA table item described below -- confirm against the loader */
    1.41 +
    1.42 +/**
    1.43 + * Convenience string denoting the Collation data tree.
    1.44 + * Formed by juxtaposing U_ICUDATA_NAME, U_TREE_SEPARATOR_STRING and "coll"
    1.44 + * (this relies on both macros expanding to string literals).
    1.44 + * @internal ICU 3.0
    1.45 + */
    1.46 +#define U_ICUDATA_COLL U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll"
    1.47 +
    1.48 +#if !UCONFIG_NO_COLLATION
    1.49 +
    1.50 +#ifdef __cplusplus
    1.51 +#include "unicode/normalizer2.h"
    1.52 +#include "unicode/unistr.h"
    1.53 +#endif
    1.54 +#include "unicode/ucol.h"
    1.55 +#include "ucol_data.h"
    1.56 +#include "utrie.h"
    1.57 +#include "cmemory.h"
    1.58 +
    1.59 +/* This is the internal header file which contains important declarations for 
    1.60 + * the collation framework. 
    1.61 + * Ready to use collators are stored as binary images. Both UCA and tailorings 
    1.62 + * share the same binary format. Individual files (currently only UCA) have a 
    1.63 + * udata header in front of the image and should be opened using udata_open.
    1.64 + * Tailoring images are currently stored inside resource bundles and are initialized
    1.65 + * through ucol_open API.
    1.66 + *
    1.67 + * The following describes the formats for collation binaries
    1.68 + * (UCA & tailorings) and for the inverse UCA table.
    1.69 + * Substructures are described in the collation design document at
    1.70 + * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
    1.71 + *
    1.72 + * -------------------------------------------------------------
    1.73 + *
    1.74 + * Here is the format of binary collation image.
    1.75 + *
    1.76 + * Physical order of structures:
    1.77 + * - header (UCATableHeader)
    1.78 + * - options (UColOptionSet)
    1.79 + * - expansions (CE[])
    1.80 + * - contractions (UChar[contractionSize] + CE[contractionSize])
    1.81 + * - serialized UTrie with mappings of code points to CEs
    1.82 + * - max expansion tables (CE[endExpansionCECount] + uint8_t[endExpansionCECount])
    1.83 + * - two bit sets for backward processing in strcoll (identical prefixes)
    1.84 + *   and for backward CE iteration (each set is uint8_t[UCOL_UNSAFECP_TABLE_SIZE])
    1.85 + * - UCA constants (UCAConstants)
    1.86 + * - UCA contractions (UChar[contractionUCACombosSize][contractionUCACombosWidth])
    1.87 + *
    1.88 + * UCATableHeader fields:
    1.89 + *
    1.90 + * int32_t size; - image size in bytes
    1.91 + *
    1.92 + * Offsets to interesting data. All offsets are in bytes.
    1.93 + * to get the address add to the header address and cast properly.
    1.94 + * Some offsets are zero if the corresponding structures are empty.
    1.95 + *
    1.96 + * Tailoring binaries that only set options and contain no mappings etc.
    1.97 + * will have all offsets 0 except for the options and expansion offsets,
    1.98 + * which give the position and length of the options array.
    1.99 + *
   1.100 + * uint32_t options; - offset to default collator options (UColOptionSet *),
   1.101 + *                     a set of 32-bit values. See declaration of UColOptionSet for more details
   1.102 + *
   1.103 + * uint32_t UCAConsts; - only used (!=0) in UCA image - structure which holds values for indirect positioning and implicit ranges
   1.104 + *                       See declaration of UCAConstants structure. This is a set of unsigned 32-bit values used to store 
   1.105 + *                       important constant values that are defined in the UCA and used for building and runtime.
   1.106 + *
   1.107 + * uint32_t contractionUCACombos; - only used (!=0) in UCA image - list of UCA contractions. This is a zero terminated array of UChar[contractionUCACombosWidth],
   1.108 + *                                  containing contractions from the UCA. These are needed in the build process to copy UCA contractions
   1.109 + *                                  in case the base contraction symbol is tailored.
   1.110 + *
   1.111 + * uint32_t magic; - must contain UCOL_HEADER_MAGIC (formatVersion 2.3)
   1.112 + *
   1.113 + * uint32_t mappingPosition;  - offset to UTrie (const uint8_t *mappingPosition). This is a serialized UTrie and should be treated as such. 
   1.114 + *                              Used as a primary lookup table for collation elements.
   1.115 + *
   1.116 + * uint32_t expansion;  - offset to expansion table (uint32_t *expansion). This is an array of expansion CEs. Never 0.
   1.117 + *
   1.118 + * uint32_t contractionIndex; - offset to contraction table (UChar *contractionIndex). Used to look up contraction sequences. Contents
   1.119 + *                              are aligned with the contents of contractionCEs table. 0 if no contractions.
   1.120 + *
   1.121 + * uint32_t contractionCEs;  - offset to resulting contraction CEs (uint32_t *contractionCEs). When a contraction is resolved in the
   1.122 + *                             contractionIndex table, the resulting index is used to look up the corresponding CE in this table. 
   1.123 + *                             0 if no contractions.
   1.124 + * uint32_t contractionSize; - size of contraction table in elements (both Index and CEs). 
   1.125 + *
   1.126 + * Tables described below are used for Boyer-Moore searching algorithm - they define the size of longest expansion
   1.127 + * and last CEs in expansions.
   1.128 + * uint32_t endExpansionCE; - offset to array of last collation element in expansion (uint32_t *).
   1.129 + *                            Never 0.
   1.130 + * uint32_t expansionCESize; - offset to array of maximum expansion sizes (uint8_t *)
   1.131 + * int32_t  endExpansionCECount; - size of endExpansionCE. See UCOL_GETMAXEXPANSION
   1.132 + *                                 for the usage model
   1.133 + *
   1.134 + * These two offsets point to byte tables that are used in the backup heuristics.
   1.135 + * uint32_t unsafeCP; - hash table of unsafe code points (uint8_t *). See ucol_unsafeCP function.
   1.136 + * uint32_t contrEndCP; - hash table of final code points in contractions (uint8_t *). See ucol_contractionEndCP.              
   1.137 + *
   1.138 + * int32_t contractionUCACombosSize; - number of UChar[contractionUCACombosWidth] in contractionUCACombos
   1.139 + *                                     (formatVersion 2.3)
   1.140 + * UBool jamoSpecial; - Jamo special indicator (uint8_t). If TRUE, Jamos are special, so we cannot use simple Hangul decomposition.
   1.141 + * UBool isBigEndian; - endianness of this collation binary (formatVersion 2.3)
   1.142 + * uint8_t charSetFamily; - charset family of this collation binary (formatVersion 2.3)
   1.143 + * uint8_t contractionUCACombosWidth; - number of UChars per UCA contraction in contractionUCACombos (formatVersion 2.3)
   1.144 + *
   1.145 + * Various version fields
   1.146 + * UVersionInfo version; - version 4 uint8_t
   1.147 + * UVersionInfo UCAVersion;  - version 4 uint8_t
   1.148 + * UVersionInfo UCDVersion;  - version 4 uint8_t
   1.149 + * UVersionInfo formatVersion; - version of the format of the collation binary
   1.150 + *                               same formatVersion as in ucadata.icu's UDataInfo header
   1.151 + *                               (formatVersion 2.3)
   1.152 + *
   1.153 + * uint32_t offset to the reordering code to lead CE byte remapping table
   1.154 + * uint32_t offset to the lead CE byte to reordering code mapping table
   1.155 + *
   1.156 + * uint8_t reserved[76];  - currently unused
   1.157 + *
   1.158 + * -------------------------------------------------------------
   1.159 + *
   1.160 + * Inverse UCA is used for constructing collators from rules. It is always an individual file
   1.161 + * and always has a UDataInfo header. 
   1.162 + * here is the structure:
   1.163 + * 
   1.164 + * uint32_t byteSize; - size of inverse UCA image in bytes
   1.165 + * uint32_t tableSize; - length of inverse table (number of uint32_t[3] rows)
   1.166 + * uint32_t contsSize; - size of continuation table (number of UChars in table)
   1.167 + *
   1.168 + * uint32_t table; - offset to inverse table (uint32_t *)
   1.169 + *                   The inverse table consists of rows of 3 uint32_t values. The first two values are a CE and a possible continuation;
   1.170 + *                   the third value is either a code unit (if there is only one code unit for the element) or an index into the continuation
   1.171 + *                   table (number of code units combined with an index).
   1.172 + *                   If more than one code point has the same CE, the continuation table contains code units separated by FFFF, and the final
   1.173 + *                   code unit sequence for a CE is terminated by FFFE.
   1.174 + * uint32_t conts; - offset to continuation table (uint16_t *). Contains code units that transform to a same CE.
   1.175 + *
   1.176 + * UVersionInfo UCAVersion; -  version of the UCA, read from file 4 uint8_t
   1.177 + * uint8_t padding[8]; - padding 8 uint8_t
   1.178 + * Header is followed by the table and continuation table.
   1.179 +*/
   1.180 +
   1.181 +/* definition of UCOL_HEADER_MAGIC moved to common/ucol_data.h */
   1.182 +
   1.183 +/* UDataInfo for UCA mapping table */
   1.184 +/* dataFormat="UCol"            */
   1.185 +#define UCA_DATA_FORMAT_0 ((uint8_t)0x55)
   1.186 +#define UCA_DATA_FORMAT_1 ((uint8_t)0x43)
   1.187 +#define UCA_DATA_FORMAT_2 ((uint8_t)0x6f)
   1.188 +#define UCA_DATA_FORMAT_3 ((uint8_t)0x6c)
   1.189 +
   1.190 +#define UCA_FORMAT_VERSION_0 ((uint8_t)3)
   1.191 +#define UCA_FORMAT_VERSION_1 0
   1.192 +#define UCA_FORMAT_VERSION_2 ((uint8_t)0)
   1.193 +#define UCA_FORMAT_VERSION_3 ((uint8_t)0)
   1.194 +
   1.195 +/* UDataInfo for inverse UCA table */
   1.196 +/* dataFormat="InvC"            */
   1.197 +#define INVUCA_DATA_FORMAT_0 ((uint8_t)0x49)
   1.198 +#define INVUCA_DATA_FORMAT_1 ((uint8_t)0x6E)
   1.199 +#define INVUCA_DATA_FORMAT_2 ((uint8_t)0x76)
   1.200 +#define INVUCA_DATA_FORMAT_3 ((uint8_t)0x43)
   1.201 +
   1.202 +#define INVUCA_FORMAT_VERSION_0 ((uint8_t)2)
   1.203 +#define INVUCA_FORMAT_VERSION_1 ((uint8_t)1)
   1.204 +#define INVUCA_FORMAT_VERSION_2 ((uint8_t)0)
   1.205 +#define INVUCA_FORMAT_VERSION_3 ((uint8_t)0)
   1.206 +
   1.207 +/* This is the size of the stack allocated buffer for sortkey generation and similar operations */
   1.208 +/* if it is too small, heap allocation will occur.*/
   1.209 +/* you can change this value if you need memory - it will affect the performance, though, since we're going to malloc */
   1.210 +#define UCOL_MAX_BUFFER 128
   1.211 +
   1.212 +#define UCOL_NORMALIZATION_GROWTH 2
   1.213 +#define UCOL_NORMALIZATION_MAX_BUFFER UCOL_MAX_BUFFER*UCOL_NORMALIZATION_GROWTH
   1.214 +
   1.215 +/* This writable buffer is used if we encounter Thai and need to reorder the string on the fly */
   1.216 +/* Sometimes we already have a writable buffer (like in case of normalized strings). */
   1.217 +/*
   1.218 +you can change this value to any value >= 4 if you need memory -
   1.219 +it will affect the performance, though, since we're going to malloc.
   1.220 +Note 3 is the minimum value for Thai collation and 4 is the
   1.221 +minimum number for special Jamo
   1.222 +*/
   1.223 +#define UCOL_WRITABLE_BUFFER_SIZE 256
   1.224 +
   1.225 +/* This is the size of the buffer for expansion CE's */
   1.226 +/* In reality we should not have to deal with expansion sequences longer than 16 */
   1.227 +/* you can change this value if you need memory */
   1.228 +/* WARNING THIS BUFFER DOES HAVE MALLOC FALLBACK. If you make it too small, you'll get into performance trouble */
   1.229 +/* Reasonable small value is around 10, if you don't do Arabic or other funky collations that have long expansion sequence */
   1.230 +/* This is the longest expansion sequence that fits in collIterate.CEs; collIterate.extendCEs is used when CEs is not big enough */
   1.231 +#define UCOL_EXPAND_CE_BUFFER_SIZE 64 /* element count of the collIterate.CEs array */
   1.232 +
   1.233 +/* This is the size to increase the buffer for expansion CE's */
   1.234 +#define UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE 64
   1.235 +
   1.236 +
   1.237 +/* Unsafe UChar hash table size.                                                 */
   1.238 +/*  size is 32 bytes for 1 bit for each latin 1 char + some power of two for     */
   1.239 +/*  hashing the rest of the chars.   Size in bytes                               */
   1.240 +#define UCOL_UNSAFECP_TABLE_SIZE 1056 /* = 32 + 1024 bytes */
   1.241 +                                    /* mask value down to "some power of two"-1  */
   1.242 +                                    /*  number of bits, not num of bytes.        */
   1.243 +#define UCOL_UNSAFECP_TABLE_MASK 0x1fff /* = 8192 - 1; 8192 bits is 1024 bytes */
   1.244 +
   1.245 +
   1.246 +/* flags bits for collIterate.flags       */
   1.247 +/* The values below are eight distinct single bits (1 through 128), which fills a uint8_t such as collIterate.flags. */
   1.248 +/*  NORM - set for incremental normalize of source string */
   1.249 +#define UCOL_ITER_NORM  1
   1.250 +
   1.251 +#define UCOL_ITER_HASLEN 2 /* NOTE(review): presumably set when the string has an explicit length, i.e. endp is valid -- confirm */
   1.252 +
   1.253 +                              /* UCOL_ITER_INNORMBUF - set if the "pos" is in          */
   1.254 +                              /*               the writable side buffer, handling      */
   1.255 +                              /*               incrementally normalized characters.    */
   1.256 +#define UCOL_ITER_INNORMBUF 4
   1.257 +
   1.258 +                              /* UCOL_ITER_ALLOCATED - set if this iterator has        */
   1.259 +                              /*    malloced storage to expand a buffer.               */
   1.260 +#define UCOL_ITER_ALLOCATED 8
   1.261 +                              /* UCOL_HIRAGANA_Q - note if the codepoint was hiragana  */
   1.262 +#define UCOL_HIRAGANA_Q     16
   1.263 +                              /* UCOL_WAS_HIRAGANA - set to TRUE if there was a Hiragana */
   1.264 +                              /* otherwise set to false                                  */
   1.265 +#define UCOL_WAS_HIRAGANA   32 
   1.266 +                              /* UCOL_USE_ITERATOR - set this if collIterate uses a */
   1.267 +                              /* character iterator instead of simply accessing string */
   1.268 +                              /* by index */
   1.269 +#define UCOL_USE_ITERATOR   64
   1.270 +
   1.271 +#define UCOL_FORCE_HAN_IMPLICIT 128 /* NOTE(review): meaning not documented here; the name suggests Han gets implicit CEs -- confirm */
   1.272 +
   1.273 +#define NFC_ZERO_CC_BLOCK_LIMIT_  0x300 /* NOTE(review): not a flag bit; presumably a code point limit used by the normalization checks -- confirm */
   1.274 +
   1.275 +#ifdef __cplusplus
   1.276 +
   1.277 +U_NAMESPACE_BEGIN
   1.278 +
   1.279 +typedef struct collIterate : public UMemory { /* iteration state over one source string: text pointers plus a buffer of pending CEs */
   1.280 +  const UChar *string; /* Original string */
   1.281 +  /* UChar *start;  Pointer to the start of the source string. Either points to string
   1.282 +                    or to writableBuffer (this member is commented out and no longer exists) */
   1.283 +  const UChar *endp; /* string end ptr.  Is undefined for null terminated strings */
   1.284 +  const UChar *pos; /* This is position in the string.  Can be to original or writable buf */
   1.285 +
   1.286 +  uint32_t *toReturn; /* This is the CE from CEs buffer that should be returned */
   1.287 +  uint32_t *CEpos; /* This is the position to which we have stored processed CEs */
   1.288 +
   1.289 +  int32_t *offsetReturn; /* This is the offset to return, if non-NULL */
   1.290 +  int32_t *offsetStore;  /* This is the pointer for storing offsets */
   1.291 +  int32_t offsetRepeatCount;  /* Repeat stored offset if non-zero */
   1.292 +  int32_t offsetRepeatValue;  /* offset value to repeat */
   1.293 +
   1.294 +  UnicodeString writableBuffer; /* the writable side buffer that UCOL_ITER_INNORMBUF refers to */
   1.295 +  const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
   1.296 +  const UCollator *coll;
   1.297 +  const Normalizer2 *nfd;
   1.298 +  uint8_t   flags; /* combination of the UCOL_ITER_* style flag bits defined above */
   1.299 +  uint8_t   origFlags; /* NOTE(review): presumably the initial value of flags, kept for restoring -- confirm */
   1.300 +  uint32_t *extendCEs; /* This is used if CEs is not big enough */
   1.301 +  int32_t extendCEsSize; /* Holds the size of the dynamic CEs buffer */
   1.302 +  uint32_t CEs[UCOL_EXPAND_CE_BUFFER_SIZE]; /* This is where we store CEs */
   1.303 +
   1.304 +  int32_t *offsetBuffer;    /* A dynamic buffer to hold offsets */
   1.305 +  int32_t offsetBufferSize; /* The size of the offset buffer */
   1.306 +
   1.307 +  UCharIterator *iterator; /* character iterator; see UCOL_USE_ITERATOR */
   1.308 +  /*int32_t iteratorIndex;*/
   1.309 +
   1.310 +  // The offsetBuffer should probably be a UVector32, but helper functions
   1.311 +  // are an improvement over duplicated code.
   1.312 +  void appendOffset(int32_t offset, UErrorCode &errorCode);
   1.313 +} collIterate;
   1.314 +
   1.315 +U_NAMESPACE_END
   1.316 +
   1.317 +#else
   1.318 +
   1.319 +typedef struct collIterate collIterate;
   1.320 +
   1.321 +#endif
   1.322 +
   1.323 +#define paddedsize(something) ((something)+((((something)%4)!=0)?(4-(something)%4):0)) /* rounds up to the next multiple of 4; the argument is evaluated up to three times, so pass no side effects */
   1.324 +#define headersize (paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))) /* padded UCATableHeader size plus padded UColOptionSet size */
   1.325 +
   1.326 +/* 
   1.327 +struct used internally in getSpecial*CE.
   1.328 +data similar to collIterate: pos, fcdPosition, flags and origFlags have the
   1.328 +same names and types as the corresponding collIterate members.
   1.329 +*/
   1.330 +struct collIterateState {
   1.331 +    const UChar *pos; /* This is position in the string.  Can be to original or writable buf */
   1.332 +    const UChar *returnPos;
   1.333 +    const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
   1.334 +    const UChar *bufferaddress; /* address of the normalization buffer */
   1.335 +    int32_t  buffersize; /* NOTE(review): presumably the size of the buffer at bufferaddress -- confirm units */
   1.336 +    uint8_t   flags;
   1.337 +    uint8_t   origFlags;
   1.338 +    uint32_t   iteratorIndex;
   1.339 +    int32_t    iteratorMove;
   1.340 +};
   1.341 +
   1.342 +U_CAPI void U_EXPORT2
   1.343 +uprv_init_collIterate(const UCollator *collator,
   1.344 +                      const UChar *sourceString, int32_t sourceLen,
   1.345 +                      U_NAMESPACE_QUALIFIER collIterate *s, UErrorCode *status);
   1.346 +
   1.347 +/* Internal functions for C test code. */
   1.348 +U_CAPI U_NAMESPACE_QUALIFIER collIterate * U_EXPORT2
   1.349 +uprv_new_collIterate(UErrorCode *status);
   1.350 +
   1.351 +U_CAPI void U_EXPORT2
   1.352 +uprv_delete_collIterate(U_NAMESPACE_QUALIFIER collIterate *s);
   1.353 +
   1.354 +/* @return s->pos == s->endp */
   1.355 +U_CAPI UBool U_EXPORT2
   1.356 +uprv_collIterateAtEnd(U_NAMESPACE_QUALIFIER collIterate *s);
   1.357 +
   1.358 +#ifdef __cplusplus
   1.359 +
   1.360 +U_NAMESPACE_BEGIN
   1.361 +
   1.362 +struct UCollationPCE;
   1.363 +typedef struct UCollationPCE UCollationPCE;
   1.364 +
   1.365 +U_NAMESPACE_END
   1.366 +
   1.367 +struct UCollationElements : public icu::UMemory
   1.368 +{
   1.369 +  /**
   1.370 +  * Struct wrapper for source data.  The StringSearch helper macros in this
   1.370 +  * header (inNormBuf, isFCDPointerNull, hasExpansion, getExpansionPrefix, ...)
   1.370 +  * access its flags, fcdPosition, toReturn, CEpos and CEs members directly.
   1.371 +  */
   1.372 +        icu::collIterate iteratordata_;
   1.373 +  /**
   1.374 +  * Indicates if this data has been reset.
   1.375 +  */
   1.376 +        UBool              reset_;
   1.377 +  /**
   1.378 +  * Indicates if the data should be deleted.
   1.379 +  */
   1.380 +        UBool              isWritable;
   1.381 +
   1.382 +/**
   1.383 + * Data for getNextProcessed, getPreviousProcessed.
   1.383 + * Only a pointer is stored; UCollationPCE is forward-declared above.
   1.384 + */
   1.385 +        icu::UCollationPCE     *pce;
   1.386 +};
   1.387 +
   1.388 +#else
   1.389 +/*opaque type*/
   1.390 +struct UCollationElements;
   1.391 +#endif
   1.392 +
   1.393 +U_CAPI void U_EXPORT2
   1.394 +uprv_init_pce(const struct UCollationElements *elems);
   1.395 +
   1.396 +#define UCOL_LEVELTERMINATOR 1
   1.397 +
   1.398 +/* mask off anything but primary order (the upper 16 bits of a 32-bit CE) */
   1.399 +#define UCOL_PRIMARYORDERMASK 0xffff0000
   1.400 +/* mask off anything but secondary order (bits 8..15) */
   1.401 +#define UCOL_SECONDARYORDERMASK 0x0000ff00
   1.402 +/* mask off anything but tertiary order (the low 8 bits) */
   1.403 +#define UCOL_TERTIARYORDERMASK 0x000000ff
   1.404 +/* primary order shift */
   1.405 +#define UCOL_PRIMARYORDERSHIFT 16
   1.406 +/* secondary order shift */
   1.407 +#define UCOL_SECONDARYORDERSHIFT 8
   1.408 +
   1.409 +#define UCOL_BYTE_SIZE_MASK 0xFF
   1.410 +
   1.411 +#define UCOL_CASE_BYTE_START 0x80
   1.412 +#define UCOL_CASE_SHIFT_START 7
   1.413 +
   1.414 +#define UCOL_IGNORABLE 0
   1.415 +
   1.416 +/* get weights from a CE: primary and secondary are masked and shifted down to the low bits, tertiary is only masked */
   1.417 +#define UCOL_PRIMARYORDER(order) (((order) & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT)
   1.418 +#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
   1.419 +#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
   1.420 +
   1.421 +/**
   1.422 + * Determine if a character is a Thai vowel (which sorts after
   1.423 + * its base consonant).
   1.424 + */
   1.425 +#define UCOL_ISTHAIPREVOWEL(ch) ((((uint32_t)(ch) - 0xe40) <= (0xe44 - 0xe40)) || \
   1.426 +                                 (((uint32_t)(ch) - 0xec0) <= (0xec4 - 0xec0)))
   1.427 +
   1.428 +/**
   1.429 + * Determine if a character is a Thai base consonant
   1.430 + */
   1.431 +#define UCOL_ISTHAIBASECONSONANT(ch) ((uint32_t)(ch) - 0xe01) <= (0xe2e - 0xe01)
   1.432 +
   1.433 +#define UCOL_ISJAMO(ch) ((((uint32_t)(ch) - 0x1100) <= (0x1112 - 0x1100)) || \
   1.434 +                         (((uint32_t)(ch) - 0x1161) <= (0x1175 - 0x1161)) || \
   1.435 +                         (((uint32_t)(ch) - 0x11A8) <= (0x11C2 - 0x11A8)))
   1.436 +
   1.437 +/* Han character ranges */
   1.438 +#define UCOL_FIRST_HAN 0x4E00
   1.439 +#define UCOL_LAST_HAN  0x9FFF
   1.440 +#define UCOL_FIRST_HAN_A 0x3400
   1.441 +#define UCOL_LAST_HAN_A  0x4DBF
   1.442 +#define UCOL_FIRST_HAN_COMPAT 0xFAE0
   1.443 +#define UCOL_LAST_HAN_COMPAT  0xFA2F
   1.444 +
   1.445 +/* Han extension B is in plane 2 */
   1.446 +#define UCOL_FIRST_HAN_B       0x20000
   1.447 +#define UCOL_LAST_HAN_B        0x2A6DF
   1.448 +
   1.449 +/* Hangul range */
   1.450 +#define UCOL_FIRST_HANGUL 0xAC00
   1.451 +#define UCOL_LAST_HANGUL  0xD7AF
   1.452 +
   1.453 +/* Jamo ranges */
   1.454 +#define UCOL_FIRST_L_JAMO 0x1100
   1.455 +#define UCOL_FIRST_V_JAMO 0x1161
   1.456 +#define UCOL_FIRST_T_JAMO 0x11A8
   1.457 +#define UCOL_LAST_T_JAMO  0x11F9
   1.458 +
   1.459 +
   1.460 +#if 0
   1.461 +/* initializes collIterate structure */
   1.462 +/* made as macro to speed up things.  Compiled out by the #if 0 above: it assigns start, isThai, stackWritableBuffer and writableBufSize, none of which is a member of the collIterate struct declared in this header. */
   1.463 +#define init_collIterate(collator, sourceString, sourceLen, s) { \
   1.464 +    (s)->start = (s)->string = (s)->pos = (UChar *)(sourceString); \
   1.465 +    (s)->endp  = (sourceLen) == -1 ? NULL :(UChar *)(sourceString)+(sourceLen); \
   1.466 +    (s)->CEpos = (s)->toReturn = (s)->CEs; \
   1.467 +    (s)->isThai = TRUE; \
   1.468 +    (s)->writableBuffer = (s)->stackWritableBuffer; \
   1.469 +    (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; \
   1.470 +    (s)->coll = (collator); \
   1.471 +    (s)->fcdPosition = 0;   \
   1.472 +    (s)->flags = 0; \
   1.473 +    if(((collator)->normalizationMode == UCOL_ON)) (s)->flags |= UCOL_ITER_NORM; \
   1.474 +}
   1.475 +#endif
   1.476 +
   1.477 +
   1.478 +
   1.479 +/*
   1.480 +* Macro to get the maximum size of an expansion ending with the argument ce.
   1.481 +* Used in the Boyer Moore algorithm.
   1.482 +* Note for tailoring, the UCA maxexpansion table has been merged.
   1.483 +* Hence we only have to search the tailored collator only.
   1.484 +* @param coll const UCollator pointer
   1.485 +* @param order last collation element of the expansion sequence
   1.486 +* @param result size of the longest expansion with argument collation element
   1.487 +*        as the last element
   1.488 +*/
   1.489 +#define UCOL_GETMAXEXPANSION(coll, order, result) {                          \
   1.490 +  const uint32_t *start;                                                     \
   1.491 +  const uint32_t *limit;                                                     \
   1.492 +  const uint32_t *mid;                                                       \
   1.493 +  start = (coll)->endExpansionCE;                                            \
   1.494 +  limit = (coll)->lastEndExpansionCE;                                        \
   1.495 +  while (start < limit - 1) {                                                \
   1.496 +    mid = start + ((limit - start) >> 1);                                    \
   1.497 +    if ((order) <= *mid) {                                                   \
   1.498 +      limit = mid;                                                           \
   1.499 +    }                                                                        \
   1.500 +    else {                                                                   \
   1.501 +      start = mid;                                                           \
   1.502 +    }                                                                        \
   1.503 +  }                                                                          \
   1.504 +  if (*start == order) {                                                     \
   1.505 +    result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE));  \
   1.506 +  }                                                                          \
   1.507 +  else if (*limit == order) {                                                \
   1.508 +         result = *(coll->expansionCESize + (limit - coll->endExpansionCE)); \
   1.509 +       }                                                                     \
   1.510 +       else if ((order & 0xFFFF) == 0x00C0) {                                \
   1.511 +              result = 2;                                                    \
   1.512 +            }                                                                \
   1.513 +            else {                                                           \
   1.514 +              result = 1;                                                    \
   1.515 +            }                                                                \
   1.516 +}
   1.517 +
   1.518 +U_CFUNC
   1.519 +uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE,
   1.520 +                               U_NAMESPACE_QUALIFIER collIterate *source, UErrorCode *status);
   1.521 +
   1.522 +U_CFUNC
   1.523 +uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
   1.524 +                                   U_NAMESPACE_QUALIFIER collIterate *source, UErrorCode *status);
   1.525 +U_CAPI uint32_t U_EXPORT2 ucol_getNextCE(const UCollator *coll,
   1.526 +                                         U_NAMESPACE_QUALIFIER collIterate *collationSource, UErrorCode *status);
   1.527 +U_CFUNC uint32_t U_EXPORT2 ucol_getPrevCE(const UCollator *coll,
   1.528 +                                          U_NAMESPACE_QUALIFIER collIterate *collationSource,
   1.529 +                                          UErrorCode *status);
   1.530 +/* get some memory */
   1.531 +void *ucol_getABuffer(const UCollator *coll, uint32_t size);
   1.532 +
   1.533 +#ifdef __cplusplus
   1.534 +
   1.535 +U_NAMESPACE_BEGIN
   1.536 +
   1.537 +class CollationKey;
   1.538 +class SortKeyByteSink;
   1.539 +
   1.540 +U_NAMESPACE_END
   1.541 +
   1.542 +/* function used by C++ getCollationKey to prevent restarting the calculation */
   1.543 +U_CFUNC int32_t
   1.544 +ucol_getCollationKey(const UCollator *coll,
   1.545 +                     const UChar *source, int32_t sourceLength,
   1.546 +                     icu::CollationKey &key,
   1.547 +                     UErrorCode &errorCode);
   1.548 +
   1.549 +typedef void U_CALLCONV
   1.550 +SortKeyGenerator(const    UCollator    *coll,
   1.551 +        const    UChar        *source,
   1.552 +        int32_t        sourceLength,
   1.553 +        icu::SortKeyByteSink &result,
   1.554 +        UErrorCode *status);
   1.555 +
   1.556 +/* worker function for generating sortkeys */
   1.557 +U_CFUNC
   1.558 +void U_CALLCONV
   1.559 +ucol_calcSortKey(const    UCollator    *coll,
   1.560 +        const    UChar        *source,
   1.561 +        int32_t        sourceLength,
   1.562 +        icu::SortKeyByteSink &result,
   1.563 +        UErrorCode *status);
   1.564 +
   1.565 +U_CFUNC
   1.566 +void U_CALLCONV
   1.567 +ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
   1.568 +        const    UChar        *source,
   1.569 +        int32_t        sourceLength,
   1.570 +        icu::SortKeyByteSink &result,
   1.571 +        UErrorCode *status);
   1.572 +
   1.573 +#else
   1.574 +
   1.575 +typedef void U_CALLCONV
   1.576 +SortKeyGenerator(const    UCollator    *coll,
   1.577 +        const    UChar        *source,
   1.578 +        int32_t        sourceLength,
   1.579 +        void *result,
   1.580 +        UErrorCode *status);
   1.581 +
   1.582 +#endif
   1.583 +
   1.584 +/**
   1.585 + * Used to set requested and valid locales on a collator returned by the collator
   1.586 + * service.
   1.587 + */
   1.588 +U_CFUNC void U_EXPORT2
   1.589 +ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt, char *actualLocaleToAdopt);
   1.590 +
   1.591 +#define UCOL_SPECIAL_FLAG 0xF0000000 /* top four bits; isSpecial() tests that all four are set */
   1.592 +#define UCOL_TAG_SHIFT 24
   1.593 +#define UCOL_TAG_MASK 0x0F000000 /* bits 24..27; getCETag() shifts them down by UCOL_TAG_SHIFT */
   1.594 +#define INIT_EXP_TABLE_SIZE 1024
   1.595 +#define UCOL_NOT_FOUND 0xF0000000 /* same value as UCOL_SPECIAL_FLAG, i.e. tag bits 0 */
   1.596 +#define UCOL_EXPANSION 0xF1000000 /* UCOL_SPECIAL_FLAG with tag bits 1 */
   1.597 +#define UCOL_CONTRACTION 0xF2000000 /* UCOL_SPECIAL_FLAG with tag bits 2 */
   1.598 +#define UCOL_THAI 0xF3000000 /* UCOL_SPECIAL_FLAG with tag bits 3 */
   1.599 +#define UCOL_UNMARKED 0x03
   1.600 +#define UCOL_NEW_TERTIARYORDERMASK 0x0000003f
   1.601 +
   1.602 +/* Bit mask for primary collation strength. Same value as UCOL_PRIMARYORDERMASK. */
   1.603 +#define UCOL_PRIMARYMASK    0xFFFF0000
   1.604 +
   1.605 +/* Bit mask for secondary collation strength. Same value as UCOL_SECONDARYORDERMASK. */
   1.606 +#define UCOL_SECONDARYMASK  0x0000FF00
   1.607 +
   1.608 +/* Bit mask for tertiary collation strength. Same value as UCOL_TERTIARYORDERMASK. */
   1.609 +#define UCOL_TERTIARYMASK   0x000000FF
   1.610 +
   1.611 +/**
   1.612 + * Internal.
   1.613 + * This indicates the last element in a UCollationElements has been consumed.
   1.614 + * Compare with the UCOL_NULLORDER, UCOL_NULLORDER is returned if error occurs.
   1.614 + * UCOL_NO_MORE_CES is the bitwise OR of the three per-level values below.
   1.615 + */
   1.616 +#define UCOL_NO_MORE_CES            0x00010101
   1.617 +#define UCOL_NO_MORE_CES_PRIMARY    0x00010000
   1.618 +#define UCOL_NO_MORE_CES_SECONDARY  0x00000100
   1.619 +#define UCOL_NO_MORE_CES_TERTIARY   0x00000001
   1.620 +
   1.621 +#define isSpecial(CE) ((((CE)&UCOL_SPECIAL_FLAG)>>28)==0xF) /* true when the top four bits of CE are all set */
   1.622 +
   1.623 +#define UCOL_UPPER_CASE 0x80
   1.624 +#define UCOL_MIXED_CASE 0x40
   1.625 +#define UCOL_LOWER_CASE 0x00
   1.626 +
   1.627 +#define UCOL_CONTINUATION_MARKER 0xC0
   1.628 +#define UCOL_REMOVE_CONTINUATION 0xFFFFFF3F /* bitwise complement of UCOL_CONTINUATION_MARKER in 32 bits */
   1.629 +
   1.630 +#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
   1.631 +#define isFlagged(CE) (((CE) & 0x80) == 0x80)
   1.632 +#define isLongPrimary(CE) (((CE) & 0xC0) == 0xC0) /* performs the same bit test as isContinuation() */
   1.633 +
   1.634 +#define getCETag(CE) (((CE)&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT) /* extracts bits 24..27 */
   1.635 +#define isContraction(CE) (isSpecial((CE)) && (getCETag((CE)) == CONTRACTION_TAG)) /* CONTRACTION_TAG is defined outside this part of the header */
   1.636 +#define isPrefix(CE) (isSpecial((CE)) && (getCETag((CE)) == SPEC_PROC_TAG)) /* SPEC_PROC_TAG is defined outside this part of the header */
   1.637 +#define constructContractCE(tag, CE) (UCOL_SPECIAL_FLAG | ((tag)<<UCOL_TAG_SHIFT) | ((CE)&0xFFFFFF)) /* special flag, then tag, then the low 24 bits of CE */
   1.638 +#define constructSpecProcCE(CE) (UCOL_SPECIAL_FLAG | (SPEC_PROC_TAG<<UCOL_TAG_SHIFT) | ((CE)&0xFFFFFF))
   1.639 +#define getContractOffset(CE) ((CE)&0xFFFFFF) /* low 24 bits */
   1.640 +#define getExpansionOffset(CE) (((CE)&0x00FFFFF0)>>4) /* bits 4..23 */
   1.641 +#define getExpansionCount(CE) ((CE)&0xF) /* low 4 bits */
   1.642 +#define isCEIgnorable(CE) (((CE) & 0xFFFFFFBF) == 0) /* true when every bit other than 0x40 is clear */
   1.643 +
   1.644 +/* StringSearch internal use. Each macro takes a pointer and reads (or, for setExpansionSuffix, writes) its iteratordata_ member. */
   1.645 +#define inNormBuf(coleiter) ((coleiter)->iteratordata_.flags & UCOL_ITER_INNORMBUF)
   1.646 +#define isFCDPointerNull(coleiter) ((coleiter)->iteratordata_.fcdPosition == NULL)
   1.647 +#define hasExpansion(coleiter) ((coleiter)->iteratordata_.CEpos != (coleiter)->iteratordata_.CEs)
   1.648 +#define getExpansionPrefix(coleiter) ((coleiter)->iteratordata_.toReturn - (coleiter)->iteratordata_.CEs)
   1.649 +#define setExpansionPrefix(coleiter, offset) ((coleiter)->iteratordata_.CEs + (offset)) /* NOTE(review): only computes CEs + offset; nothing is assigned despite the "set" name */
   1.650 +#define setExpansionSuffix(coleiter, offset) ((coleiter)->iteratordata_.toReturn = (coleiter)->iteratordata_.CEpos - (offset)) /* uses its offset argument; it used to read a caller-scope "leftoverces" instead */
   1.652 +
   1.653 +/* This is an enum that lists magic special byte values from the fractional UCA.
   1.654 + * See also http://site.icu-project.org/design/collation/bytes */
   1.655 +/* TODO: all the #defines that refer to special byte values from the UCA should be changed to point here */
   1.656 +
   1.657 +enum {
   1.658 +    UCOL_BYTE_ZERO = 0x00,
   1.659 +    UCOL_BYTE_LEVEL_SEPARATOR = 0x01,
   1.660 +    UCOL_BYTE_SORTKEY_GLUE = 0x02,
   1.661 +    UCOL_BYTE_SHIFT_PREFIX = 0x03,
   1.662 +    UCOL_BYTE_UNSHIFTED_MIN = UCOL_BYTE_SHIFT_PREFIX, /* alias, 0x03 */
   1.663 +    UCOL_BYTE_FIRST_TAILORED = 0x04,
   1.664 +    UCOL_BYTE_COMMON = 0x05, /* UCOL_COMMON_BOT2 expands to this; UCOL_COMMON_BOT3 has the same value */
   1.665 +    UCOL_BYTE_FIRST_UCA = UCOL_BYTE_COMMON, /* alias, 0x05 */
   1.666 +    /* TODO: Make the following values dynamic since they change with almost every UCA version. */
   1.667 +    UCOL_CODAN_PLACEHOLDER = 0x12,
   1.668 +    UCOL_BYTE_FIRST_NON_LATIN_PRIMARY = 0x5B,
   1.669 +    UCOL_BYTE_UNSHIFTED_MAX = 0xFF
   1.670 +}; 
   1.671 +
   1.672 +#if 0 /* compiled out; UCAConstants declares fields for the same boundaries */
   1.673 +#define UCOL_RESET_TOP_VALUE                0x9F000303
   1.674 +#define UCOL_FIRST_PRIMARY_IGNORABLE        0x00008705
   1.675 +#define UCOL_LAST_PRIMARY_IGNORABLE         0x0000DD05
   1.676 +#define UCOL_LAST_PRIMARY_IGNORABLE_CONT    0x000051C0
   1.677 +#define UCOL_FIRST_SECONDARY_IGNORABLE      0x00000000
   1.678 +#define UCOL_LAST_SECONDARY_IGNORABLE       0x00000500
   1.679 +#define UCOL_FIRST_TERTIARY_IGNORABLE       0x00000000
   1.680 +#define UCOL_LAST_TERTIARY_IGNORABLE        0x00000000
   1.681 +#define UCOL_FIRST_VARIABLE                 0x05070505
   1.682 +#define UCOL_LAST_VARIABLE                  0x179B0505
   1.683 +#define UCOL_FIRST_NON_VARIABLE             0x1A200505
   1.684 +#define UCOL_LAST_NON_VARIABLE              0x7B41058F
   1.685 +
   1.686 +#define UCOL_NEXT_TOP_VALUE                 0xE8960303
   1.687 +#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE   0x00008905
   1.688 +#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE    0x03000303
   1.689 +#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705
   1.690 +#define UCOL_NEXT_LAST_SECONDARY_IGNORABLE  0x00000500
   1.691 +#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE  0x00000000
   1.692 +#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE   0x00000000
   1.693 +#define UCOL_NEXT_FIRST_VARIABLE            0x05090505
   1.694 +#define UCOL_NEXT_LAST_VARIABLE             0x1A200505
   1.695 +
   1.696 +#define PRIMARY_IMPLICIT_MIN 0xE8000000
   1.697 +#define PRIMARY_IMPLICIT_MAX 0xF0000000
   1.698 +#endif
   1.699 +
   1.700 +/* These constants can be changed - sortkey size is affected by them */
   1.701 +#define UCOL_PROPORTION2 0.5 /* share of UCOL_TOTAL2 that goes to UCOL_TOP_COUNT2 */
   1.702 +#define UCOL_PROPORTION3 0.667
   1.703 +
   1.704 +/* These values come from the UCA */
   1.705 +#define UCOL_COMMON_BOT2 UCOL_BYTE_COMMON
   1.706 +#define UCOL_COMMON_TOP2 0x86u
   1.707 +#define UCOL_TOTAL2 (UCOL_COMMON_TOP2-UCOL_COMMON_BOT2-1) /* 0x86 - 0x05 - 1 = 0x80 */
   1.708 +/* Tertiary flag masks and common-range tops; the name suffix says which case-switch setting each one belongs to. */
   1.709 +#define UCOL_FLAG_BIT_MASK_CASE_SW_OFF 0x80
   1.710 +#define UCOL_FLAG_BIT_MASK_CASE_SW_ON 0x40
   1.711 +#define UCOL_COMMON_TOP3_CASE_SW_OFF 0x85
   1.712 +#define UCOL_COMMON_TOP3_CASE_SW_LOWER 0x45
   1.713 +#define UCOL_COMMON_TOP3_CASE_SW_UPPER 0xC5
   1.714 +
   1.715 +/* These values come from the UCA */
   1.716 +#define UCOL_COMMON_BOT3 0x05
   1.717 +/* The bodies below carry no trailing ';', so the macros are usable inside expressions. */
   1.718 +#define UCOL_COMMON_BOTTOM3_CASE_SW_UPPER 0x86
   1.719 +#define UCOL_COMMON_BOTTOM3_CASE_SW_LOWER UCOL_COMMON_BOT3
   1.720 +
   1.721 +#define UCOL_TOP_COUNT2  (UCOL_PROPORTION2*UCOL_TOTAL2) /* floating-point expression: UCOL_PROPORTION2 is 0.5 */
   1.722 +#define UCOL_BOT_COUNT2  (UCOL_TOTAL2-UCOL_TOP_COUNT2) /* also floating-point, via UCOL_TOP_COUNT2 */
   1.723 +
   1.724 +/* "Common" weights per level; UCOL_COMMON2 and UCOL_COMMON3_NORMAL are the BOT2/BOT3 values. */
   1.725 +#define UCOL_COMMON2 UCOL_COMMON_BOT2
   1.726 +#define UCOL_COMMON3_UPPERFIRST 0xC5
   1.727 +#define UCOL_COMMON3_NORMAL UCOL_COMMON_BOT3
   1.728 +
   1.729 +#define UCOL_COMMON4 0xFF
   1.730 +
   1.731 +/* constants for case level/case first handling */
   1.732 +/* used to instantiate UCollator's fields in ucol_updateInternalState */
   1.733 +#define UCOL_CASE_SWITCH      0xC0
   1.734 +#define UCOL_NO_CASE_SWITCH   0x00
   1.735 +
   1.736 +#define UCOL_REMOVE_CASE      0x3F /* byte-wide complement of UCOL_CASE_BIT_MASK */
   1.737 +#define UCOL_KEEP_CASE        0xFF
   1.738 +
   1.739 +#define UCOL_CASE_BIT_MASK    0xC0
   1.740 +
   1.741 +#define UCOL_TERT_CASE_MASK   0xFF
   1.742 +
   1.743 +#define UCOL_ENDOFLATINONERANGE 0xFF
   1.744 +#define UCOL_LATINONETABLELEN   (UCOL_ENDOFLATINONERANGE+50) /* 0xFF + 50 = 305 */
   1.745 +#define UCOL_BAIL_OUT_CE      0xFF000000
   1.746 +
   1.747 +
   1.748 +typedef enum { /* tag values carried in bits 24..27 of a special CE, as read by getCETag() */
   1.749 +    NOT_FOUND_TAG = 0,       /* the tag value found in UCOL_NOT_FOUND (0xF0000000) */
   1.750 +    EXPANSION_TAG = 1,       /* This code point results in an expansion */
   1.751 +    CONTRACTION_TAG = 2,     /* Start of a contraction */
   1.752 +    THAI_TAG = 3,            /* Thai character - do the reordering */
   1.753 +    CHARSET_TAG = 4,         /* Charset processing, not yet implemented */
   1.754 +    SURROGATE_TAG = 5,       /* Lead surrogate that is tailored and doesn't start a contraction */
   1.755 +    HANGUL_SYLLABLE_TAG = 6, /* AC00-D7AF */
   1.756 +    LEAD_SURROGATE_TAG = 7,  /* D800-DBFF */
   1.757 +    TRAIL_SURROGATE_TAG = 8,     /* DC00-DFFF */
   1.758 +    CJK_IMPLICIT_TAG = 9,    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D */
   1.759 +    IMPLICIT_TAG = 10,
   1.760 +    SPEC_PROC_TAG = 11,      /* tested by isPrefix(), produced by constructSpecProcCE() */
   1.761 +    /* ICU 2.1 */
   1.762 +    LONG_PRIMARY_TAG = 12,   /* This is a three byte primary with starting secondaries and tertiaries */
   1.763 +                             /* It fits in a single 32 bit CE and is used instead of expansion to save */
   1.764 +                             /* space without affecting the performance (hopefully) */
   1.765 +                             
   1.766 +    DIGIT_TAG = 13,          /* COllate Digits As Numbers (CODAN) implementation */
   1.767 +    
   1.768 +    CE_TAGS_COUNT            /* number of tags defined above (14) */
   1.769 +} UColCETags;
   1.770 +
   1.771 +/*
   1.772 + *****************************************************************************************
   1.773 + * set to zero
   1.774 + * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF
   1.775 + ******************************************************************************************
   1.776 + */
   1.777 +
   1.778 +typedef struct { /* attribute values as fixed-width integers; the commented-out type names the enum each field holds */
   1.779 +      uint32_t variableTopValue;
   1.780 +      /*UColAttributeValue*/ int32_t frenchCollation;   /* how to handle secondary weights */
   1.781 +      /*UColAttributeValue*/ int32_t alternateHandling; /* attribute for handling variable elements */
   1.782 +      /*UColAttributeValue*/ int32_t caseFirst;         /* who goes first, lower case or uppercase */
   1.783 +      /*UColAttributeValue*/ int32_t caseLevel;         /* do we have an extra case level */
   1.784 +      /*UColAttributeValue*/ int32_t normalizationMode; /* attribute for normalization */
   1.785 +      /*UColAttributeValue*/ int32_t strength;          /* attribute for strength */
   1.786 +      /*UColAttributeValue*/ int32_t hiraganaQ;         /* attribute for special Hiragana */
   1.787 +      /*UColAttributeValue*/ int32_t numericCollation;  /* attribute for numeric collation */
   1.788 +      uint32_t reserved[15];                 /* for future use; brings the struct to 24 32-bit words */
   1.789 +} UColOptionSet;
   1.790 +
   1.791 +typedef struct { /* boundary values of the UCA; the hex values in the comments are samples (NOTE(review): presumably from one UCA version - confirm) */
   1.792 +  uint32_t UCA_FIRST_TERTIARY_IGNORABLE[2];       /*0x00000000*/
   1.793 +  uint32_t UCA_LAST_TERTIARY_IGNORABLE[2];        /*0x00000000*/
   1.794 +  uint32_t UCA_FIRST_PRIMARY_IGNORABLE[2];        /*0x00008705*/
   1.795 +  uint32_t UCA_FIRST_SECONDARY_IGNORABLE[2];      /*0x00000000*/
   1.796 +  uint32_t UCA_LAST_SECONDARY_IGNORABLE[2];       /*0x00000500*/
   1.797 +  uint32_t UCA_LAST_PRIMARY_IGNORABLE[2];         /*0x0000DD05*/
   1.798 +  uint32_t UCA_FIRST_VARIABLE[2];                 /*0x05070505*/
   1.799 +  uint32_t UCA_LAST_VARIABLE[2];                  /*0x13CF0505*/
   1.800 +  uint32_t UCA_FIRST_NON_VARIABLE[2];             /*0x16200505*/
   1.801 +  uint32_t UCA_LAST_NON_VARIABLE[2];              /*0x767C0505*/
   1.802 +  uint32_t UCA_RESET_TOP_VALUE[2];                /*0x9F000303*/
   1.803 +  uint32_t UCA_FIRST_IMPLICIT[2];
   1.804 +  uint32_t UCA_LAST_IMPLICIT[2]; 
   1.805 +  uint32_t UCA_FIRST_TRAILING[2];
   1.806 +  uint32_t UCA_LAST_TRAILING[2]; 
   1.807 +
   1.808 +#if 0
   1.809 +  uint32_t UCA_NEXT_TOP_VALUE[2];                 /*0xE8960303*/
   1.810 +  uint32_t UCA_NEXT_FIRST_PRIMARY_IGNORABLE;   /*0x00008905*/
   1.811 +  uint32_t UCA_NEXT_LAST_PRIMARY_IGNORABLE;    /*0x03000303*/
   1.812 +  uint32_t UCA_NEXT_FIRST_SECONDARY_IGNORABLE; /*0x00008705*/
   1.813 +  uint32_t UCA_NEXT_LAST_SECONDARY_IGNORABLE;  /*0x00000500*/
   1.814 +  uint32_t UCA_NEXT_FIRST_TERTIARY_IGNORABLE;  /*0x00000000*/
   1.815 +  uint32_t UCA_NEXT_LAST_TERTIARY_IGNORABLE;   /*0x00000000*/
   1.816 +  uint32_t UCA_NEXT_FIRST_VARIABLE;            /*0x05090505*/
   1.817 +  uint32_t UCA_NEXT_LAST_VARIABLE;             /*0x16200505*/
   1.818 +#endif
   1.819 +
   1.820 +  uint32_t UCA_PRIMARY_TOP_MIN;
   1.821 +  uint32_t UCA_PRIMARY_IMPLICIT_MIN; /*0xE8000000*/
   1.822 +  uint32_t UCA_PRIMARY_IMPLICIT_MAX; /*0xF0000000*/
   1.823 +  uint32_t UCA_PRIMARY_TRAILING_MIN; /*0xE8000000 - NOTE(review): same sample as IMPLICIT_MIN, possibly copied; verify */
   1.824 +  uint32_t UCA_PRIMARY_TRAILING_MAX; /*0xF0000000 - NOTE(review): same sample as IMPLICIT_MAX, possibly copied; verify */
   1.825 +  uint32_t UCA_PRIMARY_SPECIAL_MIN; /*0xE8000000 - NOTE(review): same sample as IMPLICIT_MIN, possibly copied; verify */
   1.826 +  uint32_t UCA_PRIMARY_SPECIAL_MAX; /*0xF0000000 - NOTE(review): same sample as IMPLICIT_MAX, possibly copied; verify */
   1.827 +} UCAConstants;
   1.828 +
   1.829 +/* definition of UCATableHeader moved to common/ucol_data.h */
   1.830 +
   1.831 +#define U_UNKNOWN_STATE 0
   1.832 +#define U_COLLATOR_STATE 0x01
   1.833 +#define U_STATE_LIMIT 0x02
   1.834 +
   1.835 +/* This is the first structure in a state */
   1.836 +/* it should be machine independent: every member is a uint8_t or an array of uint8_t */
   1.837 +typedef struct {
   1.838 +  /* this structure is supposed to be readable on all the platforms.*/
   1.839 +  /* first 2 fields hold the size of the structure in a platform independent way */
   1.840 +  uint8_t sizeLo;
   1.841 +  uint8_t sizeHi;
   1.842 +  /* identifying the writing platform */
   1.843 +  uint8_t isBigEndian;
   1.844 +  /* see U_CHARSET_FAMILY values in utypes.h */
   1.845 +  uint8_t charsetFamily;
   1.846 +  /* version of ICU this state structure comes from */
   1.847 +  uint8_t icuVersion[4];
   1.848 +  /* What is the data following this state (presumably one of the U_*_STATE values above - confirm) */
   1.849 +  uint8_t type;
   1.850 +  /* more stuff to come, keep it on 16 byte boundary: the members add up to exactly 16 bytes */
   1.851 +  uint8_t reserved[7];
   1.852 +} UStateStruct;
   1.853 +
   1.854 +/* This structure follows UStateStruct */
   1.855 +/* and contains data specific for the collators */
   1.856 +/* Endianness needs to be decided before accessing this structure */
   1.857 +/* However, its size IS endianness independent */
   1.858 +typedef struct {
   1.859 +  /* size of this structure */
   1.860 +  uint8_t sizeLo;
   1.861 +  uint8_t sizeHi;
   1.862 +  /* This state is followed by the frozen tailoring */
   1.863 +  uint8_t containsTailoring;
   1.864 +  /* This state is followed by the frozen UCA */
   1.865 +  uint8_t containsUCA;
   1.866 +  /* Version info - the same one */
   1.867 +  uint8_t versionInfo[4];
   1.868 +
   1.869 +  /* for charset CEs */
   1.870 +  uint8_t charsetName[32];                 
   1.871 +  /* this is the resolved locale name*/
   1.872 +  uint8_t locale[32];                      
   1.873 +
   1.874 +  /* Attributes. Open ended */
   1.875 +  /* all the following will be moved to uint32_t because of portability */
   1.876 +  /* variable top value */
   1.877 +  uint32_t variableTopValue;
   1.878 +  /* attribute for handling variable elements*/
   1.879 +  uint32_t /*UColAttributeValue*/ alternateHandling; 
   1.880 +  /* how to handle secondary weights */
   1.881 +  uint32_t /*UColAttributeValue*/ frenchCollation;
   1.882 +  /* who goes first, lower case or uppercase */
   1.883 +  uint32_t /*UColAttributeValue*/ caseFirst;         
   1.884 +  /* do we have an extra case level */
   1.885 +  uint32_t /*UColAttributeValue*/ caseLevel;         
   1.886 +  /* attribute for normalization */
   1.887 +  uint32_t /*UColAttributeValue*/ normalizationMode; 
   1.888 +  /* attribute for strength */
   1.889 +  uint32_t /*UColAttributeValue*/ strength;
   1.890 +  /* to be immediately 16 byte aligned: the members add up to 112 bytes, i.e. 7 * 16 */
   1.891 +  uint8_t reserved[12];
   1.892 +} UColStateStruct;
   1.893 +
   1.894 +#define UCOL_INV_SIZEMASK 0xFFF00000 /* top 12 bits */
   1.895 +#define UCOL_INV_OFFSETMASK 0x000FFFFF /* low 20 bits */
   1.896 +#define UCOL_INV_SHIFTVALUE 20 /* width of the offset part, i.e. the shift that reaches the size part */
   1.897 +
   1.898 +U_CDECL_BEGIN
   1.899 +
   1.900 +/* definition of InverseUCATableHeader moved to common/ucol_data.h */
   1.901 +
   1.902 +typedef void U_CALLCONV
   1.903 +ResourceCleaner(UCollator *coll);
   1.904 +
   1.905 +
   1.906 +struct UCollator {
   1.907 +    UColOptionSet  *options;
   1.908 +    SortKeyGenerator *sortKeyGen;
   1.909 +    uint32_t *latinOneCEs;
   1.910 +    char* actualLocale;
   1.911 +    char* validLocale;
   1.912 +    char* requestedLocale;
   1.913 +    const UChar *rules;
   1.914 +    const UChar *ucaRules;
   1.915 +    const UCollator *UCA;
   1.916 +    const UCATableHeader *image;
   1.917 +    UTrie mapping;
   1.918 +    const uint32_t *latinOneMapping;
   1.919 +    const uint32_t *expansion;
   1.920 +    const UChar    *contractionIndex;
   1.921 +    const uint32_t *contractionCEs;
   1.922 +
   1.923 +    const uint32_t *endExpansionCE;    /* array of last ces in an expansion ce.
   1.924 +                                          corresponds to expansionCESize */
   1.925 +    const uint32_t *lastEndExpansionCE;/* pointer to the last element in endExpansionCE */
   1.926 +    const uint8_t  *expansionCESize;   /* array of the maximum size of a
   1.927 +                                         expansion ce with the last ce
   1.928 +                                         corresponding to endExpansionCE,
   1.929 +                                         terminated with a null */
   1.930 +    const uint8_t *unsafeCP;           /* unsafe code points hashtable; read one bit per index by ucol_unsafeCP() */
   1.931 +    const uint8_t *contrEndCP;         /* Contraction ending chars hash table */
   1.932 +    UChar          minUnsafeCP;        /* Smallest unsafe Code Point. */
   1.933 +    UChar          minContrEndCP;      /* Smallest code point at end of a contraction */
   1.934 +
   1.935 +    int32_t rulesLength;
   1.936 +    int32_t latinOneTableLen;
   1.937 +
   1.938 +    uint32_t variableTopValue;
   1.939 +    UColAttributeValue frenchCollation;
   1.940 +    UColAttributeValue alternateHandling; /* attribute for handling variable elements*/
   1.941 +    UColAttributeValue caseFirst;         /* who goes first, lower case or uppercase */
   1.942 +    UColAttributeValue caseLevel;         /* do we have an extra case level */
   1.943 +    UColAttributeValue normalizationMode; /* attribute for normalization */
   1.944 +    UColAttributeValue strength;          /* attribute for strength */
   1.945 +    UColAttributeValue hiraganaQ;         /* attribute for Hiragana */
   1.946 +    UColAttributeValue numericCollation;
   1.947 +    UBool variableTopValueisDefault;  /* the nine *isDefault flags pair by name with the nine attribute fields above */
   1.948 +    UBool frenchCollationisDefault;
   1.949 +    UBool alternateHandlingisDefault; /* attribute for handling variable elements*/
   1.950 +    UBool caseFirstisDefault;         /* who goes first, lower case or uppercase */
   1.951 +    UBool caseLevelisDefault;         /* do we have an extra case level */
   1.952 +    UBool normalizationModeisDefault; /* attribute for normalization */
   1.953 +    UBool strengthisDefault;          /* attribute for strength */
   1.954 +    UBool hiraganaQisDefault;         /* attribute for Hiragana */
   1.955 +    UBool numericCollationisDefault;
   1.956 +    UBool hasRealData;                /* some collators have only options, like French, no rules */
   1.957 +                                      /* to speed up things, we use the UCA image, but we don't want it */
   1.958 +                                      /* to run around */
   1.959 +    /* NOTE(review): the free*OnClose flags presumably mark which members must be released when the collator is closed - confirm */
   1.960 +    UBool freeOnClose;
   1.961 +    UBool freeOptionsOnClose;
   1.962 +    UBool freeRulesOnClose;
   1.963 +    UBool freeImageOnClose;
   1.964 +    UBool freeDefaultReorderCodesOnClose;
   1.965 +    UBool freeReorderCodesOnClose;
   1.966 +    UBool freeLeadBytePermutationTableOnClose;
   1.967 +
   1.968 +    UBool latinOneUse;
   1.969 +    UBool latinOneRegenTable;
   1.970 +    UBool latinOneFailed;
   1.971 +
   1.972 +    int8_t tertiaryAddition; /* when switching case, we need to add or subtract different values */
   1.973 +    uint8_t caseSwitch;
   1.974 +    uint8_t tertiaryCommon;
   1.975 +    uint8_t tertiaryMask;
   1.976 +    uint8_t tertiaryTop; /* Upper range when compressing */
   1.977 +    uint8_t tertiaryBottom; /* Lower range when compressing (presumably; the comment used to repeat "Upper" from tertiaryTop) */
   1.978 +    uint8_t tertiaryTopCount;
   1.979 +    uint8_t tertiaryBottomCount;
   1.980 +
   1.981 +    UVersionInfo dataVersion;               /* Data info of UCA table */
   1.982 +    int32_t* defaultReorderCodes;
   1.983 +    int32_t defaultReorderCodesLength;
   1.984 +    int32_t* reorderCodes;
   1.985 +    int32_t reorderCodesLength;
   1.986 +    uint8_t* leadBytePermutationTable;
   1.987 +    void  *delegate;  /* if non-null: C++ object to delegate all API calls to. */
   1.988 +};
   1.989 +
   1.990 +U_CDECL_END
   1.991 +
   1.992 +/* various internal functions */
   1.993 +
   1.994 +/* do not close UCA returned by ucol_initUCA! */
   1.995 +U_CFUNC
   1.996 +UCollator* ucol_initUCA(UErrorCode *status);
   1.997 +
   1.998 +U_CFUNC
   1.999 +UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status);
  1.1000 +
  1.1001 +U_CFUNC
  1.1002 +void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status);
  1.1003 +
  1.1004 +U_CFUNC
  1.1005 +UCollator* ucol_open_internal(const char* loc, UErrorCode* status);
  1.1006 +
  1.1007 +#if 0
  1.1008 +U_CFUNC
  1.1009 +void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status);
  1.1010 +#endif
  1.1011 +
  1.1012 +U_CFUNC
  1.1013 +void ucol_updateInternalState(UCollator *coll, UErrorCode *status);
  1.1014 +
  1.1015 +U_CFUNC uint32_t U_EXPORT2 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status);
  1.1016 +U_CAPI UBool U_EXPORT2 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status);
  1.1017 +
  1.1018 +U_CAPI const InverseUCATableHeader* U_EXPORT2 ucol_initInverseUCA(UErrorCode *status);
  1.1019 +
  1.1020 +U_CAPI void U_EXPORT2 
  1.1021 +uprv_uca_initImplicitConstants(UErrorCode *status);
  1.1022 +
  1.1023 +U_CAPI uint32_t U_EXPORT2
  1.1024 +uprv_uca_getImplicitFromRaw(UChar32 cp);
  1.1025 +
  1.1026 +/*U_CFUNC uint32_t U_EXPORT2
  1.1027 +uprv_uca_getImplicitPrimary(UChar32 cp);*/
  1.1028 +
  1.1029 +U_CAPI UChar32 U_EXPORT2
  1.1030 +uprv_uca_getRawFromImplicit(uint32_t implicit);
  1.1031 +
  1.1032 +U_CAPI UChar32 U_EXPORT2
  1.1033 +uprv_uca_getRawFromCodePoint(UChar32 i);
  1.1034 +
  1.1035 +U_CAPI UChar32 U_EXPORT2
  1.1036 +uprv_uca_getCodePointFromRaw(UChar32 i);
  1.1037 +
  1.1038 +typedef const UChar* GetCollationRulesFunction(void* context, const char* locale, const char* type, int32_t* pLength, UErrorCode* status);
  1.1039 +
  1.1040 +U_CAPI UCollator* U_EXPORT2
  1.1041 +ucol_openRulesForImport( const UChar        *rules,
  1.1042 +                         int32_t            rulesLength,
  1.1043 +                         UColAttributeValue normalizationMode,
  1.1044 +                         UCollationStrength strength,
  1.1045 +                         UParseError        *parseError,
  1.1046 +                         GetCollationRulesFunction  importFunc,
  1.1047 +                         void* context,
  1.1048 +                         UErrorCode         *status);
  1.1049 +
  1.1050 +       
  1.1051 +U_CFUNC void U_EXPORT2 
  1.1052 +ucol_buildPermutationTable(UCollator *coll, UErrorCode *status);
  1.1053 +
  1.1054 +U_CFUNC int U_EXPORT2 
  1.1055 +ucol_getLeadBytesForReorderCode(const UCollator *uca, int reorderCode, uint16_t* returnLeadBytes, int returnCapacity);
  1.1056 +/* NOTE(review): the output buffer is uint16_t* above but int16_t* below - confirm the asymmetry is intended. */
  1.1057 +U_CFUNC int U_EXPORT2 
  1.1058 +ucol_getReorderCodesForLeadByte(const UCollator *uca, int leadByte, int16_t* returnReorderCodes, int returnCapacity);
  1.1059 +
  1.1060 +#ifdef __cplusplus
  1.1061 +/*
  1.1062 + *  Test whether a character is potentially "unsafe" for use as a collation
  1.1063 + *  starting point.  Unsafe chars are those with combining class != 0 plus
  1.1064 + *  those that are the 2nd thru nth character in a contraction sequence.
  1.1065 + *
  1.1066 + *  Function is in header file because it's used in both collation and string search,
  1.1067 + *  and needs to be inline for performance.
  1.1068 + */
  1.1069 +static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
  1.1070 +    /* Anything below the collator's smallest unsafe code unit is safe. */
  1.1071 +    if (c < coll->minUnsafeCP) {
  1.1072 +        return FALSE;
  1.1073 +    }
  1.1074 +
  1.1075 +    int32_t bitIndex = c;
  1.1076 +    if (bitIndex >= UCOL_UNSAFECP_TABLE_SIZE*8) {
  1.1077 +        /* Past the directly indexed range: surrogates (lead or trail) are */
  1.1078 +        /* always unsafe; any other value is folded back into the table.   */
  1.1079 +        if (U16_IS_SURROGATE(c)) {
  1.1080 +            return TRUE;
  1.1081 +        }
  1.1082 +        bitIndex = 256 + (bitIndex & UCOL_UNSAFECP_TABLE_MASK);
  1.1083 +    }
  1.1084 +    /* One bit per index: select the byte, then the bit inside it. */
  1.1085 +    const uint8_t tableByte = coll->unsafeCP[bitIndex >> 3];
  1.1086 +    const int32_t bitInByte = bitIndex & 7;
  1.1087 +    return ((tableByte >> bitInByte) & 1);
  1.1088 +}
  1.1089 +#endif /* __cplusplus */
  1.1090 +
  1.1091 +/* The offsetBuffer in collIterate might need to be freed to avoid memory leaks. */
  1.1092 +void ucol_freeOffsetBuffer(U_NAMESPACE_QUALIFIER collIterate *s); 
  1.1093 +
  1.1094 +#endif /* #if !UCONFIG_NO_COLLATION */
  1.1095 +
  1.1096 +#endif

mercurial