michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 1999-2013 International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: rbbidata.h michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * RBBI data formats Includes michael@0: * michael@0: * Structs that describes the format of the Binary RBBI data, michael@0: * as it is stored in ICU's data file. michael@0: * michael@0: * RBBIDataWrapper - Instances of this class sit between the michael@0: * raw data structs and the RulesBasedBreakIterator objects michael@0: * that are created by applications. The wrapper class michael@0: * provides reference counting for the underlying data, michael@0: * and direct pointers to data that would not otherwise michael@0: * be accessible without ugly pointer arithmetic. The michael@0: * wrapper does not attempt to provide any higher level michael@0: * abstractions for the data itself. michael@0: * michael@0: * There will be only one instance of RBBIDataWrapper for any michael@0: * set of RBBI run time data being shared by instances michael@0: * (clones) of RulesBasedBreakIterator. michael@0: */ michael@0: michael@0: #ifndef __RBBIDATA_H__ michael@0: #define __RBBIDATA_H__ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/udata.h" michael@0: #include "udataswp.h" michael@0: michael@0: /** michael@0: * Swap RBBI data. See udataswp.h. michael@0: * @internal michael@0: */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ubrk_swap(const UDataSwapper *ds, michael@0: const void *inData, int32_t length, void *outData, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: #ifdef __cplusplus michael@0: michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/unistr.h" michael@0: #include "umutex.h" michael@0: #include "utrie.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /* michael@0: * The following structs map exactly onto the raw data from ICU common data file. michael@0: */ michael@0: struct RBBIDataHeader { michael@0: uint32_t fMagic; /* == 0xbla0 */ michael@0: uint8_t fFormatVersion[4]; /* Data Format. Same as the value in struct UDataInfo */ michael@0: /* if there is one associated with this data. */ michael@0: /* (version originates in rbbi, is copied to UDataInfo) */ michael@0: /* For ICU 3.2 and earlier, this field was */ michael@0: /* uint32_t fVersion */ michael@0: /* with a value of 1. */ michael@0: uint32_t fLength; /* Total length in bytes of this RBBI Data, */ michael@0: /* including all sections, not just the header. */ michael@0: uint32_t fCatCount; /* Number of character categories. */ michael@0: michael@0: /* */ michael@0: /* Offsets and sizes of each of the subsections within the RBBI data. */ michael@0: /* All offsets are bytes from the start of the RBBIDataHeader. */ michael@0: /* All sizes are in bytes. */ michael@0: /* */ michael@0: uint32_t fFTable; /* forward state transition table. */ michael@0: uint32_t fFTableLen; michael@0: uint32_t fRTable; /* Offset to the reverse state transition table. */ michael@0: uint32_t fRTableLen; michael@0: uint32_t fSFTable; /* safe point forward transition table */ michael@0: uint32_t fSFTableLen; michael@0: uint32_t fSRTable; /* safe point reverse transition table */ michael@0: uint32_t fSRTableLen; michael@0: uint32_t fTrie; /* Offset to Trie data for character categories */ michael@0: uint32_t fTrieLen; michael@0: uint32_t fRuleSource; /* Offset to the source for for the break */ michael@0: uint32_t fRuleSourceLen; /* rules. Stored UChar *. */ michael@0: uint32_t fStatusTable; /* Offset to the table of rule status values */ michael@0: uint32_t fStatusTableLen; michael@0: michael@0: uint32_t fReserved[6]; /* Reserved for expansion */ michael@0: michael@0: }; michael@0: michael@0: michael@0: michael@0: struct RBBIStateTableRow { michael@0: int16_t fAccepting; /* Non-zero if this row is for an accepting state. */ michael@0: /* Value 0: not an accepting state. */ michael@0: /* -1: Unconditional Accepting state. */ michael@0: /* positive: Look-ahead match has completed. */ michael@0: /* Actual boundary position happened earlier */ michael@0: /* Value here == fLookAhead in earlier */ michael@0: /* state, at actual boundary pos. */ michael@0: int16_t fLookAhead; /* Non-zero if this row is for a state that */ michael@0: /* corresponds to a '/' in the rule source. */ michael@0: /* Value is the same as the fAccepting */ michael@0: /* value for the rule (which will appear */ michael@0: /* in a different state. */ michael@0: int16_t fTagIdx; /* Non-zero if this row covers a {tagged} position */ michael@0: /* from a rule. Value is the index in the */ michael@0: /* StatusTable of the set of matching */ michael@0: /* tags (rule status values) */ michael@0: int16_t fReserved; michael@0: uint16_t fNextState[2]; /* Next State, indexed by char category. */ michael@0: /* This array does not have two elements */ michael@0: /* Array Size is actually fData->fHeader->fCatCount */ michael@0: /* CAUTION: see RBBITableBuilder::getTableSize() */ michael@0: /* before changing anything here. */ michael@0: }; michael@0: michael@0: michael@0: struct RBBIStateTable { michael@0: uint32_t fNumStates; /* Number of states. */ michael@0: uint32_t fRowLen; /* Length of a state table row, in bytes. */ michael@0: uint32_t fFlags; /* Option Flags for this state table */ michael@0: uint32_t fReserved; /* reserved */ michael@0: char fTableData[4]; /* First RBBIStateTableRow begins here. */ michael@0: /* (making it char[] simplifies ugly address */ michael@0: /* arithmetic for indexing variable length rows.) */ michael@0: }; michael@0: michael@0: typedef enum { michael@0: RBBI_LOOKAHEAD_HARD_BREAK = 1, michael@0: RBBI_BOF_REQUIRED = 2 michael@0: } RBBIStateTableFlags; michael@0: michael@0: michael@0: /* */ michael@0: /* The reference counting wrapper class */ michael@0: /* */ michael@0: class RBBIDataWrapper : public UMemory { michael@0: public: michael@0: enum EDontAdopt { michael@0: kDontAdopt michael@0: }; michael@0: RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); michael@0: RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); michael@0: RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); michael@0: ~RBBIDataWrapper(); michael@0: michael@0: void init(const RBBIDataHeader *data, UErrorCode &status); michael@0: RBBIDataWrapper *addReference(); michael@0: void removeReference(); michael@0: UBool operator ==(const RBBIDataWrapper &other) const; michael@0: int32_t hashCode(); michael@0: const UnicodeString &getRuleSourceString() const; michael@0: #ifdef RBBI_DEBUG michael@0: void printData(); michael@0: void printTable(const char *heading, const RBBIStateTable *table); michael@0: #else michael@0: #define printData() michael@0: #define printTable(heading, table) michael@0: #endif michael@0: michael@0: /* */ michael@0: /* Pointers to items within the data */ michael@0: /* */ michael@0: const RBBIDataHeader *fHeader; michael@0: const RBBIStateTable *fForwardTable; michael@0: const RBBIStateTable *fReverseTable; michael@0: const RBBIStateTable *fSafeFwdTable; michael@0: const RBBIStateTable *fSafeRevTable; michael@0: const UChar *fRuleSource; michael@0: const int32_t *fRuleStatusTable; michael@0: michael@0: /* number of int32_t values in the rule status table. Used to sanity check indexing */ michael@0: int32_t fStatusMaxIdx; michael@0: michael@0: UTrie fTrie; michael@0: michael@0: private: michael@0: u_atomic_int32_t fRefCount; michael@0: UDataMemory *fUDataMem; michael@0: UnicodeString fRuleString; michael@0: UBool fDontFreeData; michael@0: michael@0: RBBIDataWrapper(const RBBIDataWrapper &other); /* forbid copying of this class */ michael@0: RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /* forbid copying of this class */ michael@0: }; michael@0: michael@0: michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* C++ */ michael@0: michael@0: #endif