michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: * dictionarydata.h michael@0: * michael@0: * created on: 2012may31 michael@0: * created by: Markus W. Scherer & Maxime Serrano michael@0: */ michael@0: michael@0: #include "dictionarydata.h" michael@0: #include "unicode/ucharstrie.h" michael@0: #include "unicode/bytestrie.h" michael@0: #include "unicode/udata.h" michael@0: #include "cmemory.h" michael@0: michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; michael@0: const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; michael@0: const int32_t DictionaryData::TRIE_TYPE_MASK = 7; michael@0: const int32_t DictionaryData::TRIE_HAS_VALUES = 8; michael@0: michael@0: const int32_t DictionaryData::TRANSFORM_NONE = 0; michael@0: const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; michael@0: const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; michael@0: const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; michael@0: michael@0: DictionaryMatcher::~DictionaryMatcher() { michael@0: } michael@0: michael@0: UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { michael@0: udata_close(file); michael@0: } michael@0: michael@0: int32_t UCharsDictionaryMatcher::getType() const { michael@0: return DictionaryData::TRIE_TYPE_UCHARS; michael@0: } michael@0: michael@0: int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { michael@0: UCharsTrie uct(characters); michael@0: UChar32 c = utext_next32(text); michael@0: if (c < 0) { michael@0: return 0; michael@0: } michael@0: UStringTrieResult result = uct.first(c); michael@0: int32_t numChars = 1; michael@0: count = 0; michael@0: for (;;) { michael@0: if (USTRINGTRIE_HAS_VALUE(result)) { michael@0: if (count < limit) { michael@0: if (values != NULL) { michael@0: values[count] = uct.getValue(); michael@0: } michael@0: lengths[count++] = numChars; michael@0: } michael@0: if (result == USTRINGTRIE_FINAL_VALUE) { michael@0: break; michael@0: } michael@0: } michael@0: else if (result == USTRINGTRIE_NO_MATCH) { michael@0: break; michael@0: } michael@0: michael@0: // TODO: why do we have a text limit if the UText knows its length? michael@0: if (numChars >= maxLength) { michael@0: break; michael@0: } michael@0: michael@0: c = utext_next32(text); michael@0: if (c < 0) { michael@0: break; michael@0: } michael@0: ++numChars; michael@0: result = uct.next(c); michael@0: } michael@0: return numChars; michael@0: } michael@0: michael@0: BytesDictionaryMatcher::~BytesDictionaryMatcher() { michael@0: udata_close(file); michael@0: } michael@0: michael@0: UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { michael@0: if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { michael@0: if (c == 0x200D) { michael@0: return 0xFF; michael@0: } else if (c == 0x200C) { michael@0: return 0xFE; michael@0: } michael@0: int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); michael@0: if (delta < 0 || 0xFD < delta) { michael@0: return U_SENTINEL; michael@0: } michael@0: return (UChar32)delta; michael@0: } michael@0: return c; michael@0: } michael@0: michael@0: int32_t BytesDictionaryMatcher::getType() const { michael@0: return DictionaryData::TRIE_TYPE_BYTES; michael@0: } michael@0: michael@0: int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { michael@0: BytesTrie bt(characters); michael@0: UChar32 c = utext_next32(text); michael@0: if (c < 0) { michael@0: return 0; michael@0: } michael@0: UStringTrieResult result = bt.first(transform(c)); michael@0: int32_t numChars = 1; michael@0: count = 0; michael@0: for (;;) { michael@0: if (USTRINGTRIE_HAS_VALUE(result)) { michael@0: if (count < limit) { michael@0: if (values != NULL) { michael@0: values[count] = bt.getValue(); michael@0: } michael@0: lengths[count++] = numChars; michael@0: } michael@0: if (result == USTRINGTRIE_FINAL_VALUE) { michael@0: break; michael@0: } michael@0: } michael@0: else if (result == USTRINGTRIE_NO_MATCH) { michael@0: break; michael@0: } michael@0: michael@0: // TODO: why do we have a text limit if the UText knows its length? michael@0: if (numChars >= maxLength) { michael@0: break; michael@0: } michael@0: michael@0: c = utext_next32(text); michael@0: if (c < 0) { michael@0: break; michael@0: } michael@0: ++numChars; michael@0: result = bt.next(transform(c)); michael@0: } michael@0: return numChars; michael@0: } michael@0: michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, michael@0: void *outData, UErrorCode *pErrorCode) { michael@0: const UDataInfo *pInfo; michael@0: int32_t headerSize; michael@0: const uint8_t *inBytes; michael@0: uint8_t *outBytes; michael@0: const int32_t *inIndexes; michael@0: int32_t indexes[DictionaryData::IX_COUNT]; michael@0: int32_t i, offset, size; michael@0: michael@0: headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); michael@0: if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; michael@0: pInfo = (const UDataInfo *)((const char *)inData + 4); michael@0: if (!(pInfo->dataFormat[0] == 0x44 && michael@0: pInfo->dataFormat[1] == 0x69 && michael@0: pInfo->dataFormat[2] == 0x63 && michael@0: pInfo->dataFormat[3] == 0x74 && michael@0: pInfo->formatVersion[0] == 1)) { michael@0: udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", michael@0: pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); michael@0: *pErrorCode = U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: inBytes = (const uint8_t *)inData + headerSize; michael@0: outBytes = (uint8_t *)outData + headerSize; michael@0: michael@0: inIndexes = (const int32_t *)inBytes; michael@0: if (length >= 0) { michael@0: length -= headerSize; michael@0: if (length < (int32_t)(sizeof(indexes))) { michael@0: udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); michael@0: *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: for (i = 0; i < DictionaryData::IX_COUNT; i++) { michael@0: indexes[i] = udata_readInt32(ds, inIndexes[i]); michael@0: } michael@0: michael@0: size = indexes[DictionaryData::IX_TOTAL_SIZE]; michael@0: michael@0: if (length >= 0) { michael@0: if (length < size) { michael@0: udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); michael@0: *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: if (inBytes != outBytes) { michael@0: uprv_memcpy(outBytes, inBytes, size); michael@0: } michael@0: michael@0: offset = 0; michael@0: ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); michael@0: offset = (int32_t)sizeof(indexes); michael@0: int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; michael@0: int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; michael@0: michael@0: if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { michael@0: ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); michael@0: } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { michael@0: // nothing to do michael@0: } else { michael@0: udata_printError(ds, "udict_swap(): unknown trie type!\n"); michael@0: *pErrorCode = U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: // these next two sections are empty in the current format, michael@0: // but may be used later. michael@0: offset = nextOffset; michael@0: nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; michael@0: offset = nextOffset; michael@0: nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; michael@0: offset = nextOffset; michael@0: } michael@0: return headerSize + size; michael@0: } michael@0: #endif